Monday, December 25, 2023

爬虫从入门到入土

 

1. 爬虫的本质

爬虫的本质是通过程序自动化获取数据,加快数据的采集效率。

2. 爬虫的手段

现在的网站往往针对爬虫采取了反爬虫策略,因此一种方案是使用浏览器驱动的爬虫:这种方案的缺点是需要通过驱动加载浏览器,效率较低;优点是理论上任何网站的数据都可以爬取到。另一种方案则是分析前端 JS,综合运用各种手段分析出数据接口,再直接请求接口爬取数据。

对于那些反爬比较宽松的网站,我们可以直接获取目标数据的 URL 进行请求;对于依赖 cookie 的网站,只要在请求头中带上 cookie 即可。

3. 实战

3.1 环境安装说明

  1. 操作系统配置

操作系统:Windows7 32 位

处理器:Intel Core i7-4790 CPU @3.6GHZ

内存:8G

爬虫系统环境:Python3.6.0 (32 位) + Chrome 浏览器(84.0.4147.125 版本)

2.环境部署

2.1)下载Python

根据操作系统去 Python 官网(https://www.python.org/) 下载对应的 Python3.6.0 版本

将 Python 安装目录添加到系统的 Path 环境变量

2.2)下载 Chrome 浏览器(https://www.google.cn/chrome/)并安装

2.3)下载 Chrome 浏览器驱动

下载 Chrome 浏览器对应版本的驱动

先查看 Chrome 浏览器版本,然后到 https://chromedriver.storage.googleapis.com/index.html 下载与浏览器版本对应的驱动。驱动文件名为 chromedriver.exe,将其放在 Python 安装目录或者虚拟环境目录下的 Scripts 子目录下。

3.2 代码

import os
import sys
import json
from time import sleep, time
import datetime
import logging
import logging.config
import yaml
import traceback
import xlwt
from selenium import webdriver
from bs4 import BeautifulSoup

from selenium.common.exceptions import NoSuchElementException

# Load the logging configuration from log_conf.yml (path is relative to the
# current working directory) and create the module-level logger.
log_conf = 'log_conf.yml'
with open(log_conf, 'rt') as f:
    # safe_load accepts the open stream directly.
    config = yaml.safe_load(f)

logging.config.dictConfig(config)
logger = logging.getLogger(__name__)


def printLn(s):
    """Print ``s`` to stdout and mirror it to the module logger when loggable.

    Everything is printed. Only these values are also sent to ``logger.info``:
    strings, dicts, and non-empty lists whose first element is a string.
    Any other object (for example a parsed HTML node) is printed only.

    Args:
        s: The value to print.
    """
    print(s)
    if isinstance(s, (str, dict)):
        logger.info(s)
    elif isinstance(s, list):
        # Check for emptiness first: indexing s[0] on an empty list would
        # raise IndexError and abort the caller just for a log line.
        if s and isinstance(s[0], str):
            logger.info(s)


def formatStr(strings):
    """Return ``strings`` with every non-breaking space (U+00A0) removed."""
    return "".join(strings.split("\xa0"))


# Columns read from every result row after the case number, in output order.
# Each entry is (case key, span colid, log template, line index, required,
# strip_nbsp):
#   * line index -- which "\n"-separated line of the cell text to keep;
#     None keeps the whole text.
#   * required   -- when True a missing cell/line raises IndexError;
#     when False the value falls back to "".
#   * strip_nbsp -- whether the text is passed through formatStr first.
_CASE_LIST_COLUMNS = (
    ("ajlb", "columnCodeNAjlb", "案件类别 为:{}", 1, True, False),
    ("spac", "columnCodeNSPCX", "审判程序 为:{}", 1, True, True),
    # NOTE: party names on the list page may be abbreviated or out of order
    # (e.g. a trailing "等"), per the original author's remark.
    ("dsr", "columnStringCDSR", "当事人 为:{}", None, True, True),
    ("sarq", "columnDateDSARQ", "收案日期 为:{}", None, True, True),
    ("lar", "columnOrganLAR", "立案人为:{}", 1, False, True),
    ("larq", "columnDateDLARQ", "立案日期为:{}", None, True, True),
    ("laay", "columnStringLAAY", "立案案由为:{}", 1, False, True),
    ("cbspt", "columnOrganCBSPT", "承办审判庭为:{}", 1, False, True),
    ("cbr", "columnOrganNCBR", "承办人为:{}", 1, False, True),
    ("Tqbjrq", "columnDateDTqbjrq", "提请报结日期:{}", 0, False, True),
    ("jarq", "columnDateDJARQ", "结案日期为:{}", None, True, True),
    ("jaay", "columnStringCAY", "结案案由为:{}", 1, False, True),
    ("jafs", "columnStringJAFS", "结案方式为:{}", 1, False, True),
    ("SFSZMQ", "columnCodeSFSZMQ", "是否涉自贸区:{}", 1, False, True),
    ("pczt", "columnCodeNPczt", "评查状态:{}", 1, False, True),
    ("jingbanfayuan", "columnOrgan6aace", "经办法院:{}", 1, False, True),
    ("Ysah", "columnStringCYsah", "原审案号:{}", None, False, True),
    ("YSFY", "columnOrganYSFY", "原审法院:{}", None, False, True),
    ("YSJAFS", "columnCodeYSJAFS", "原审结案方式:{}", None, False, True),
    ("GDRQ", "columnDateGDRQ", "归档日期:{}", None, False, True),
    ("GDR", "columnStringGDR", "归档人:{}", 1, False, True),
    ("SXQK", "columnStringSXQK", "审限情况:{}", 1, False, True),
    ("SFCSX", "columnCodeSFCSX", "是否超审限:{}", 1, False, True),
    ("CSXTS", "columnNumericCSXTS", "超审限天数:{}", None, False, True),
    ("Jfzje", "columnNumericNJfzje", "已交费(元):{}", None, False, True),
    ("Ktdd", "columnStringKtdd", "开庭地点:{}", None, False, True),
)


def _read_list_cell(row, colid, line, strip_nbsp):
    """Return the text of the first ``span[colid=...]`` inside ``row``.

    Raises IndexError when the span is missing or when the requested line
    does not exist in the cell text.
    """
    text = row.select("span[colid='{}']".format(colid))[0].text
    if strip_nbsp:
        text = formatStr(text)
    return text if line is None else text.split("\n")[line]


def parseCaseResult(text):
    """Parse the case-search result list and return one dict per case row.

    Args:
        text: Inner HTML of the result list (the element with
            id ``listAreaResult-atbody``).

    Returns:
        A list of dicts, one per ``.sl-data-row`` element, keyed by the names
        in ``_CASE_LIST_COLUMNS`` plus ``ah`` (case number) and
        ``ShifouGuidang`` ("是" when an archiver is present, otherwise "否").

    Raises:
        IndexError: If a required cell is missing from a row.
    """
    soup = BeautifulSoup(text, "html.parser")
    cases_infos = []
    printLn("===========解析案件基本信息页开始===============")
    for row in soup.select(".sl-data-row"):
        ah = _read_list_cell(row, "columnStringCAH", None, False)
        printLn("================案号为{}信息解析开始======================".format(ah))
        printLn("案号为:{}".format(ah))
        case = {"ah": ah}

        for key, colid, template, line, required, strip_nbsp in _CASE_LIST_COLUMNS:
            try:
                value = _read_list_cell(row, colid, line, strip_nbsp)
            except IndexError:
                # Only a missing cell/line is tolerated, and only for optional
                # columns; anything else is a real error and must surface.
                if required:
                    raise
                value = ""
            printLn(template.format(value))
            case[key] = value

        # A case counts as archived as soon as an archiver name is present.
        archived = "否" if case["GDR"] == "" else "是"
        printLn("是否归档:{}".format(archived))
        case['ShifouGuidang'] = archived
        printLn("================案号为{}信息解析结束======================".format(ah))
        cases_infos.append(case)
    printLn("===========解析案件基本信息页结束===============")
    return cases_infos


def getDangshirenMingcheng(name_list, value_list):
    """Placeholder for extracting a litigant name; not implemented.

    Both arguments are ignored and the function always returns None.
    """
    return None


def getDangshirenZhengjianInfo(name_list, value_list, dsr_type):
    """Return a litigant's ``(document type, document number)`` pair.

    Args:
        name_list: Label texts taken from the ``class='name'`` cells.
        value_list: Value texts taken from the ``class='value'`` cells,
            index-aligned with ``name_list``.
        dsr_type: Litigant type. ``'自然人'`` (natural person) uses the
            labels 证件类型 / 身份证件号码; any other non-None value uses
            证照类型 / 证照号码.

    Returns:
        A 2-tuple of strings with ASCII spaces removed, e.g.
        ``('统一社会信用代码', '911300005619733208')``. ``("", "")`` is
        returned when ``dsr_type`` is None or when either label (or its
        value) is missing.
    """
    if dsr_type is None:
        return "", ""

    if dsr_type == '自然人':
        labels = ("证件类型", "身份证件号码")
    else:
        # Unincorporated organisation or legal person.
        labels = ("证照类型", "证照号码")

    try:
        leixing, haoma = (
            value_list[name_list.index(label)].replace(" ", "")
            for label in labels
        )
    except (ValueError, IndexError):
        return "", ""
    return leixing, haoma


def strReplace(s):
    """Return ``s`` with every ASCII space character removed."""
    return "".join(s.split(" "))


# Label cells that are removed from the name list before lookups so that
# names and values stay index-aligned (presumably these are section headers
# without a paired value cell -- confirm against the page markup).
_LITIGANT_SECTION_HEADERS = ("当事人信息", "法定代表人", "诉讼代理人信息")

# Optional per-litigant fields: (case key, cell label, log template).
_LITIGANT_OPTIONAL_FIELDS = (
    ("caseLitigantDizhi", "地址", "地址:{}"),
    ("caseLitigantLianxiFangshi", "联系电话(当事人)", "联系电话(当事人):{}"),
    ("caseLitigantFaDingDaiBiao", "法定代表人或主要负责人", "法定代表人或主要负责人:{}"),
    ("caseLitigantFaDingDaiBiaoZhLeixing", "代表人证件类型", "法定代表人证件类型:{}"),
    ("caseLitigantFaDingDaiBiaoZhjianHaoma", "代表人证件号码", "法定代表人证件号:{}"),
)


def _litigant_value(name_list, value_list, label):
    """Return the space-stripped value paired with ``label``, or None if absent."""
    try:
        return strReplace(value_list[name_list.index(label)])
    except (ValueError, IndexError):
        return None


def parseCaseLitigantInfoFormHtml(div, case):
    """Parse the civil-litigant section of a case detail page into ``case``.

    Of the direct ``tr`` children of ``div.table.tbody`` only rows 3, 6,
    9, ... (0-based) are read; each yields one litigant. Per-litigant values
    are joined with ";" and stored under ``dsr`` (as "role:name") and the
    ``caseLitigant*`` keys, one entry per litigant ("" when a field is
    missing).

    Args:
        div: Parsed HTML node that contains the litigant table.
        case: Case dict, updated in place.

    Returns:
        The same ``case`` dict.

    Raises:
        ValueError: If a litigant row has no 本案诉讼地位 label.
        IndexError: If that label has no matching value cell.
    """
    printLn("================解析案号为{}民事当事人页面信息开始================".format(case.get("ah", None)))
    printLn(div)
    table_rows = (child for child in div.table.tbody.children if child.name == "tr")

    names = []       # "role:name" per litigant
    types = []       # litigant type
    id_kinds = []    # identity document type
    id_numbers = []  # identity document number
    optional = {key: [] for key, _, _ in _LITIGANT_OPTIONAL_FIELDS}

    for row_no, tr in enumerate(table_rows):
        if row_no == 0 or row_no % 3 != 0:
            continue
        printLn("index====={}".format(row_no))
        name_list = [td.text for td in tr.select("td[class='name']")]
        value_list = [td.text for td in tr.select("td[class='value']")]

        # Remove each header independently. With all three removals in one
        # try block, a missing first header skipped the other two and left
        # the labels shifted against the values.
        for header in _LITIGANT_SECTION_HEADERS:
            try:
                name_list.remove(header)
            except ValueError:
                pass

        printLn("========name_list value_list===========")
        printLn(name_list)
        printLn(value_list)
        printLn("========name_list value_list===========")

        # The litigation role is mandatory: a row without it aborts parsing.
        susongdiwei = strReplace(value_list[name_list.index("本案诉讼地位")])

        caseLitigantName = _litigant_value(name_list, value_list, "当事人名称")
        if caseLitigantName is None:
            caseLitigantName = ""
        else:
            printLn("================添加当事人({})信息开始==================".format(caseLitigantName))
            printLn("当事人名称:{}".format(caseLitigantName))
        names.append(":".join([susongdiwei, caseLitigantName]))

        # None (type unknown) makes the document lookup return ("", "").
        dsr_type = _litigant_value(name_list, value_list, "当事人类型")
        types.append("" if dsr_type is None else dsr_type)
        if dsr_type is not None:
            printLn("当事人类型:{}".format(dsr_type))

        id_kind, id_number = getDangshirenZhengjianInfo(name_list, value_list, dsr_type)
        id_kinds.append(id_kind)
        printLn("证件类型:{}".format(id_kind))
        id_numbers.append(id_number)
        printLn("证件号码:{}".format(id_number))

        for key, label, template in _LITIGANT_OPTIONAL_FIELDS:
            value = _litigant_value(name_list, value_list, label)
            # Always append, so every column keeps one entry per litigant.
            optional[key].append("" if value is None else value)
            if value is not None:
                printLn(template.format(value))

        printLn("================添加当事人({})信息结束==================".format(caseLitigantName))

    dsr_name_ = ";".join(names)
    caseLitigantType = ";".join(types)
    caseLitigantZhjianLeixing = ";".join(id_kinds)
    caseLitigantZhjianHaoma = ";".join(id_numbers)

    printLn("================案件最终当事人信息开始===================")
    printLn("当事人名称:{}".format(dsr_name_))
    printLn("当事人类型:{}".format(caseLitigantType))
    printLn("当事人证件类型:{}".format(caseLitigantZhjianLeixing))
    printLn("证件号码:{}".format(caseLitigantZhjianHaoma))
    joined_optional = {}
    for key, _, template in _LITIGANT_OPTIONAL_FIELDS:
        joined_optional[key] = ";".join(optional[key])
        printLn(template.format(joined_optional[key]))
    printLn("================案件最终当事人信息结束===================")

    # The detail-page litigant list replaces the list-page ``dsr`` value.
    case['dsr'] = dsr_name_
    case['caseLitigantType'] = caseLitigantType
    case['caseLitigantZhjianLeixing'] = caseLitigantZhjianLeixing
    case['caseLitigantZhjianHaoma'] = caseLitigantZhjianHaoma
    for key, _, _ in _LITIGANT_OPTIONAL_FIELDS:
        case[key] = joined_optional[key]

    printLn("================解析案号为{}民事当事人页面信息结束================".format(case.get("ah", None)))
    return case


def parseCaseKaiTingShijianFormHtml(div, case):
    """Read the hearing start/end time from the hearing table into ``case``.

    Only the second direct ``tr`` child of ``div.table.tbody`` is inspected.
    Its ``td[class='value']`` cells at index 1 and 2 become
    ``kaiting_starttime`` and ``kaiting_endtime``; a missing cell leaves the
    corresponding value as "".

    Args:
        div: Parsed HTML node that contains the hearing table.
        case: Case dict, updated in place.

    Returns:
        The same ``case`` dict.
    """
    printLn("================解析案号为{}开庭页面信息开始================".format(case.get("ah", None)))
    began_at = ""
    ended_at = ""

    table_rows = (child for child in div.table.tbody.children if child.name == "tr")
    next(table_rows, None)  # the first row is skipped
    detail_row = next(table_rows, None)
    if detail_row is not None:
        cell_texts = [td.text for td in detail_row.select("td[class='value']")]
        printLn("开庭页面信息:" + str(cell_texts))
        try:
            began_at = cell_texts[1]
            ended_at = cell_texts[2]
            printLn("开庭开始时间:{}".format(began_at))
            printLn("开庭结束时间:{}".format(ended_at))
        except IndexError:
            # Keep whatever was read before the missing cell.
            pass

    printLn("================解析案号为{}开庭页面信息结束================".format(case.get("ah", None)))
    case['kaiting_starttime'] = began_at
    case['kaiting_endtime'] = ended_at
    return case


def parseCaseShenpanZuzhiFormHtml(div, case):
    """Parse the trial-organisation member table into ``case``.

    Of the direct ``tr`` children of ``div.table.tbody`` only rows 2, 4,
    6, ... (0-based) are read. Each row contributes a member name (label
    姓名) that is filed under the member's role (label 角色):

    * 审判员 / 审判长 -> ``shenpanyuan`` (last one wins)
    * 书记员 -> ``shujiyuan`` (last one wins)
    * 代理审判员 -> ``dailishenpanyuan`` (last one wins)
    * 人民陪审员 -> ``renminpeishenyuan`` (all, joined with ";")
    * 代理陪审员 -> ``dailipeishenyuan`` (all, joined with ";")

    Args:
        div: Parsed HTML node that contains the member table.
        case: Case dict, updated in place.

    Returns:
        The same ``case`` dict.
    """
    printLn("================解析案号为{}民事审判组织及其他成员页开始================".format(case.get("ah", None)))
    table_rows = (child for child in div.table.tbody.children if child.name == "tr")

    single_roles = {"shenpanyuan": "", "dailishenpanyuan": "", "shujiyuan": ""}
    single_role_keys = {
        "审判员": "shenpanyuan",
        "审判长": "shenpanyuan",
        "书记员": "shujiyuan",
        "代理审判员": "dailishenpanyuan",
    }
    multi_roles = {"人民陪审员": [], "代理陪审员": []}

    for row_no, tr in enumerate(table_rows):
        if row_no == 0 or row_no % 2 != 0:
            continue
        printLn("index====={}".format(row_no))
        name_list = [td.text for td in tr.select("td[class='name']")]
        value_list = [td.text for td in tr.select("td[class='value']")]
        printLn("========name_list value_list===========")
        printLn(name_list)
        printLn(value_list)
        printLn("========name_list value_list===========")

        member = ""
        try:
            member = strReplace(value_list[name_list.index("姓名")])
            printLn("姓名:{}".format(member))
        except (ValueError, IndexError):
            pass

        try:
            juese = strReplace(value_list[name_list.index("角色")])
        except (ValueError, IndexError):
            # No role on this row: nothing to file the name under.
            continue
        printLn("角色:{}".format(juese))
        if juese in single_role_keys:
            single_roles[single_role_keys[juese]] = member
        elif juese in multi_roles:
            multi_roles[juese].append(member)

    renminpeishenyuan = ";".join(multi_roles["人民陪审员"])
    dailipeishenyuan = ";".join(multi_roles["代理陪审员"])
    printLn("审判员:{}".format(single_roles["shenpanyuan"]))
    printLn("人民陪审员:{}".format(renminpeishenyuan))
    printLn("代理审判员:{}".format(single_roles["dailishenpanyuan"]))
    printLn("代理陪审员:{}".format(dailipeishenyuan))
    printLn("书记员:{}".format(single_roles["shujiyuan"]))
    printLn("================解析案号为{}民事审判组织及其他成员页结束================".format(case.get("ah", None)))
    case['shenpanyuan'] = single_roles["shenpanyuan"]
    case['renminpeishenyuan'] = renminpeishenyuan
    case['dailishenpanyuan'] = single_roles["dailishenpanyuan"]
    case['dailipeishenyuan'] = dailipeishenyuan
    case['shujiyuan'] = single_roles["shujiyuan"]

    return case


def parseCaseDetail(div, case):
    """Parse an expanded case detail page and merge its data into ``case``.

    Reads the applicable procedure (``sycx``) and case source (``ajly``),
    then hands the litigant, hearing and trial-organisation sections to
    their dedicated parsers. A failure inside one section is logged with its
    traceback and does not stop the remaining sections.

    Args:
        div: HTML string of the detail page's right-hand panel.
        case: Case dict from the result list; updated in place. Must
            contain ``ah``.

    Returns:
        The same ``case`` dict. It is returned untouched when the case number
        on the page differs from ``case['ah']``.

    Raises:
        IndexError: If the page has no ``#C_AH`` element.
    """
    printLn("================解析案号为{}案件详情页开始================".format(case.get("ah", "")))
    soup = BeautifulSoup(div, "html.parser")
    ah = strReplace(soup.select("#C_AH")[0].text)
    if case['ah'] != ah:
        # Not the page for this case; leave the dict untouched.
        return case
    printLn("案号为:{}".format(ah))

    simple_fields = (
        ("sycx", "#N_SYCX", "适用程序为:{}"),    # applicable procedure
        ("ajly", "#N_AJLYFB", "案件来源为:{}"),  # case source
    )
    for key, selector, template in simple_fields:
        try:
            case[key] = strReplace(soup.select(selector)[0].text)
        except IndexError:
            # Element absent from the page.
            case[key] = ""
        printLn(template.format(case[key]))

    section_parsers = {
        "民事当事人": parseCaseLitigantInfoFormHtml,
        "民事开庭": parseCaseKaiTingShijianFormHtml,
        "民事审判组织及其他成员": parseCaseShenpanZuzhiFormHtml,
    }
    for biaoti_el in soup.select(".biaoti"):
        biaoti = biaoti_el.div.div.div.text
        section_parser = section_parsers.get(biaoti)
        if section_parser is None:
            continue
        printLn(biaoti)
        # The section content is the node immediately after the title node.
        content_div = next(iter(biaoti_el.next_siblings), None)
        if content_div is None:
            continue
        if biaoti == "民事当事人":
            printLn("dsr_content_div===================================")
            printLn(content_div)
        try:
            section_parser(content_div, case)
        except Exception:
            # logger.exception records the active traceback. The previous
            # traceback.print_exc(e) passed the exception as the ``limit``
            # argument, so the handler itself failed and the error vanished.
            logger.exception("解析案号为%s的[%s]信息失败", ah, biaoti)
            traceback.print_exc(file=sys.stdout)
    printLn("================解析案号为{}案件详情页结束================".format(case.get("ah", "")))

    return case


def parseCaseDetailAndSetCase(f, case):
    """Fill ``case`` with the case number, procedure and source of a detail page.

    Args:
        f: Inner HTML of the detail page element with id ``rightDiv``.
        case: Case dict, updated in place (``ah``, ``sycx``, ``ajly``).

    Returns:
        The same ``case`` dict.

    Raises:
        IndexError: If the page has no ``#C_AH`` element.
    """
    soup = BeautifulSoup(f, "html.parser")
    ah = soup.select("#C_AH")[0].text  # case number
    case['ah'] = ah
    printLn("案号为:{}".format(ah))

    # NOTE(review): the case source is read from "#N_AJLY" here but from
    # "#N_AJLYFB" in parseCaseDetail -- confirm which id the page really uses.
    simple_fields = (
        ("sycx", "#N_SYCX", "适用程序为:{}"),  # applicable procedure
        ("ajly", "#N_AJLY", "案件来源为:{}"),  # case source
    )
    for key, selector, template in simple_fields:
        try:
            case[key] = soup.select(selector)[0].text
        except IndexError:
            # Only a missing element is expected; a bare except would also
            # hide programming errors and Ctrl-C.
            case[key] = ""
        printLn(template.format(case[key]))

    return case


def initBrowerWindowAndLoginPage(driver_path, login_url="http://132.4.1.15/sym/"):
    """Start Chrome, open the login page and return the driver.

    Args:
        driver_path: Path to the chromedriver executable.
        login_url: Address of the login page. Defaults to the URL that was
            previously hard-coded.

    Returns:
        The Chrome WebDriver, with the login page loaded, refreshed and the
        window resized to 1400x800.

    Raises:
        Whatever Selenium raises while loading the page; the browser is shut
        down first in that case.
    """
    printLn("初始化窗口............... 0")
    # Creating the driver opens a new browser window on this machine.
    browser = webdriver.Chrome(executable_path=driver_path)
    try:
        browser.get(login_url)
        printLnWindowTitle(browser)
        browser.refresh()
        browser.set_window_size(1400, 800)
        sleep(3)
    except BaseException:
        # The caller never receives the driver on failure, so close it here
        # rather than leaving the browser and driver processes running.
        browser.quit()
        raise
    printLn("初始化窗口结束............... ")
    return browser


def switchWindow(wd, title):
    """Focus the first window whose title contains ``title``.

    Windows are visited in ``wd.window_handles`` order. When no title
    matches, the driver is left on the last window visited.

    Args:
        wd: WebDriver instance.
        title: Substring to look for in the window title.
    """
    def _focus_and_match(handle):
        wd.switch_to.window(handle)
        return title in wd.title

    # any() stops at the first match, exactly like a loop with ``break``.
    any(_focus_and_match(handle) for handle in wd.window_handles)


def printLnWindowTitle(wd):
    """Print and log the title of the driver's current window."""
    current_title = wd.title
    printLn("当前页面是:{}".format(current_title))


def login(wd, username, password):
    """Fill in the login form and submit it.

    Args:
        wd: WebDriver already showing the login page.
        username: Text typed into the element with id ``username1``.
        password: Text typed into the element with id ``password1``.
    """
    printLn("登陆开始...................1")
    began = time()
    for element_id, text in (("username1", username), ("password1", password)):
        field = wd.find_element_by_id(element_id)
        field.clear()
        field.send_keys(text)
    wd.find_element_by_xpath("//a[@class='submit_btn']").click()
    printLn("登陆结束...................2")
    printLn("登陆花了{}s".format(time() - began))


def switchToAjcxPage(wd, ajcx_url="http://132.4.1.50:3080/ajcx/"):
    """Open the case-query application and click its "case query" entry.

    Args:
        wd: Logged-in WebDriver.
        ajcx_url: Address of the case-query application. Defaults to the URL
            that was previously hard-coded.
    """
    printLn("开始获取数字法院业务应用系统页面 3")
    start_time = time()
    # Give the login request three seconds to complete.
    sleep(3)
    wd.get(ajcx_url)
    printLnWindowTitle(wd)

    try:
        # Preferred layout: the entry lives inside the "leftFrame" iframe.
        wd.switch_to.frame("leftFrame")
        wd.find_element_by_id("caseQueryA").click()
        wd.switch_to.default_content()
    except Exception as e:
        # FIXME (from the original author): the page sometimes has a
        # different layout. Record why the first attempt failed instead of
        # dropping the exception, then try the alternative locator.
        printLn("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXx")
        printLn("leftFrame 内未能点击案件查询入口,改用备用定位:{!r}".format(e))
        wd.find_element_by_xpath("//*[@id='leftDivList']/div[2]/div[1]").click()
    printLnWindowTitle(wd)
    printLn("点击案件查询按钮 4")
    printLn("跳转到数字法院业务应用系统页面结束...................2")
    end_time = time()
    printLn("登陆页跳转到数字法院业务应用系统页面花了{}s".format(end_time - start_time))


def doMinShiCaseSearch(wd, sarq, sarqend):
    """Fill in the case-query form for civil cases and run the search.

    The driver is left inside the ``rightFrame`` iframe when this returns.

    Args:
        wd: WebDriver showing the case-query page.
        sarq: Start of the case-acceptance date range.
        sarqend: End of the case-acceptance date range.
    """
    began = time()
    printLn("设置搜索条件开始...................2")
    sleep(3)
    # The query form lives inside the "rightFrame" iframe.
    wd.switch_to.frame("rightFrame")

    # Open the case-category picker and tick the second tree node, which
    # per the original comments is the civil-case category.
    wd.find_element_by_xpath("//*[@id='faCodeNAjlb']/../../td[3]/img").click()
    sleep(1)
    wd.find_element_by_xpath("//*[@class='x-tree-root-node']/li[2]/div/input").click()
    # Confirm the picker with the second "x-btn-mc" button.
    wd.find_elements_by_class_name("x-btn-mc")[1].click()

    # Clear the trial-procedure filter.
    # FIXME (from the original author): is trial procedure the right field?
    wd.find_element_by_xpath("//*[@id='faCodeNSpcx']/../../td[2]/img").click()
    sleep(1)

    # Acceptance date range (mandatory): start first, then end.
    for element_id, value in (("faDateDsarq", sarq), ("faDateDsarqend", sarqend)):
        date_box = wd.find_element_by_id(element_id)
        date_box.clear()
        date_box.send_keys(value)

    wd.find_element_by_css_selector(
        "button[onfocus='Artery.initItem(Artery.cfg_buttonSearch)']"
    ).click()

    # The query takes a while; wait 10 s for the results.
    sleep(10)
    printLn("设置搜索条件结束...................2")
    printLn("设置搜索条件花了{}s".format(time() - began))


def _expand_detail_section(browser, button_id):
    """Click the "expand all" button ``button_id`` when it is present (best effort)."""
    try:
        button = browser.find_element_by_id(button_id)
        if button.text == "展开全部":
            button.click()
    except NoSuchElementException:
        # No such button on this detail page; nothing to expand.
        pass
    except Exception as exc:
        # Expansion stays best-effort, but the failure is now reported.
        # A bare ``except`` would also have swallowed Ctrl-C.
        printLn("展开 {} 失败: {!r}".format(button_id, exc))


def getCurrentPageCasesInfo(browser, currpageno):
    """Scrape every case on the current result page, including its detail page.

    The result list is parsed first; then each case's detail link is clicked,
    the detail window is parsed into the same case dict, and the window is
    closed again.

    Args:
        browser: WebDriver positioned on the result list.
        currpageno: 1-based page number, used for log messages only.

    Returns:
        List of case dicts for this page.
    """
    printLn("正在获取查询结果第{}页案件信息=====================".format(currpageno))
    search_result_table = browser.find_element_by_id("listAreaResult-atbody")
    cases_info = parseCaseResult(search_result_table.get_attribute("innerHTML"))
    for case in cases_info:
        printLn("列表页案件{}信息".format(case.get("ah", "")))
        printLn(case)

    case_detail_links = browser.find_elements_by_class_name("x-btn-simple-link")
    page_case_len = len(case_detail_links)
    printLn("当前页案件结果数为: ======={}".format(page_case_len))
    old_win2 = browser.current_window_handle

    # NOTE(review): assumes the n-th detail link belongs to the n-th parsed
    # row; more links than rows would raise IndexError below -- confirm.
    for case_index in range(page_case_len):
        case_start_time = time()
        printLn("正在爬取第{}页第{}个案件".format(currpageno, case_index + 1))
        case = cases_info[case_index]
        if case_index != 0:
            # After the previous detail window was closed, the frame must be
            # re-entered and the links located again.
            browser.switch_to.frame("rightFrame")
            case_detail_links = browser.find_elements_by_class_name("x-btn-simple-link")
        printLn("case_index==================" + str(case_index))
        case_detail_links[case_index].click()
        switchWindow(browser, case['ah'])
        printLn("当前标题为:=================" + browser.title)
        browser.switch_to.frame("mainContent")
        sleep(1)
        # Expand every detail section at once.
        browser.find_element_by_id("zkqb").click()
        sleep(2)
        # Litigants, then trial organisation and other members.
        _expand_detail_section(browser, "all_detail_T_MS_DSR")
        _expand_detail_section(browser, "all_detail_T_MS_SPZZCY")

        # rightDiv is the right-hand panel of the detail page.
        rightDivInnerHTML = browser.find_element_by_id("rightDiv").get_attribute("innerHTML")
        printLn("解析案件{}的案件来源,适用程序,当事人信息,开庭时间,陪审员信息开始".format(case.get("ah", "")))
        parseCaseDetail(rightDivInnerHTML, case)
        printLn("解析案件{}的案件来源,适用程序,当事人信息,开庭时间,陪审员信息结束".format(case.get("ah", "")))
        printLn("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
        printLn(case)
        printLn("YYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYYY")
        browser.switch_to.default_content()
        printLn("爬取第{}页第{}个案件完成".format(currpageno, case_index + 1))
        # Close the detail window and return to the result list window.
        browser.close()
        browser.switch_to.window(old_win2)
        case_end_time = time()
        printLn("爬取案件{}花费了{}s====================".format(case['ah'], case_end_time - case_start_time))
    printLn("获取查询结果第{}页案件信息结束=====================".format(currpageno))
    return cases_info


# Case dict keys written to worksheet columns 1..45, in column order
# (column 0 holds the running sequence number).
_CASE_EXCEL_COLUMNS = (
    "ah", "ajlb", "spac", "dsr", "sarq", "lar", "larq", "laay", "cbspt",
    "cbr", "Tqbjrq", "jarq", "ShifouGuidang", "jaay", "jafs", "SFSZMQ",
    "pczt", "jingbanfayuan", "Ysah", "YSFY", "YSJAFS", "GDRQ", "GDR",
    "SXQK", "SFCSX", "CSXTS", "Jfzje", "Ktdd", "sycx", "ajly",
    "caseLitigantType", "caseLitigantZhjianLeixing",
    "caseLitigantZhjianHaoma", "caseLitigantDizhi",
    "caseLitigantLianxiFangshi", "caseLitigantFaDingDaiBiao",
    "caseLitigantFaDingDaiBiaoZhLeixing",
    "caseLitigantFaDingDaiBiaoZhjianHaoma",
    "kaiting_starttime", "kaiting_endtime",
    "shenpanyuan", "renminpeishenyuan",
    # Column 43 used to repeat "dailipeishenyuan", so the acting judge was
    # never exported.
    "dailishenpanyuan", "dailipeishenyuan", "shujiyuan",
)


def writeCasesToExcel(currpageno, rowsperpage, current_case_list, sheet):
    """Write one result page of cases to the worksheet.

    Rows 0 and 1 are left for the header, so page ``p`` starts at row
    ``2 + (p - 1) * rowsperpage``. Missing keys are written as "".

    Args:
        currpageno: 1-based page number.
        rowsperpage: Rows per result page; an int or a numeric string.
        current_case_list: Case dicts of this page.
        sheet: Worksheet exposing ``write(row, col, value)``.
    """
    # int() also accepts the string form of a raw element attribute value.
    first_row = 2 + (currpageno - 1) * int(rowsperpage)
    for offset, case in enumerate(current_case_list):
        row = first_row + offset
        sequence_no = row - 1
        printLn("正在写入第{}个案件,案号为{}".format(sequence_no, case.get("ah", "")))
        sheet.write(row, 0, sequence_no)
        for col, key in enumerate(_CASE_EXCEL_COLUMNS, start=1):
            sheet.write(row, col, case.get(key, ""))


def ajcxCaseResultPageParse(browser, sheet):
    """Walk through every result page, scraping cases and writing them to ``sheet``.

    Args:
        browser: WebDriver positioned on the first result page.
        sheet: Worksheet that receives the rows.
    """
    printLn("案件查询页面结果分页查询开始==================================")
    search_result_table = browser.find_element_by_id("listAreaResult-atbody")
    printLn(type(search_result_table))
    # Total number of cases, read from the paging bar.
    pagingbar = browser.find_element_by_xpath("//*[@id='pagingbar']/span[1]/span[1]")
    totalcount = int(pagingbar.text)
    printLn("查询结果案件总数为{}................".format(totalcount))

    # get_attribute returns a string (or None), so convert before doing any
    # arithmetic; fall back to 20 rows when absent or not numeric.
    try:
        rowsperpage = int(search_result_table.get_attribute("rowsperpage"))
    except (TypeError, ValueError):
        rowsperpage = 20
    printLn("当前分页每页数量为{}..................".format(rowsperpage))
    pagecount = (totalcount + rowsperpage - 1) // rowsperpage  # ceiling division
    currpageno = 1
    printLn("当前查询页编码为{}..................".format(currpageno))

    current_case_list = getCurrentPageCasesInfo(browser, currpageno)
    writeCasesToExcel(currpageno, rowsperpage, current_case_list, sheet)
    currpageno += 1

    while currpageno <= pagecount:
        printLnWindowTitle(browser)
        pagingbar2_btns = browser.find_elements_by_class_name("x-pagingbar2-btn")
        entered_frame = False
        if not pagingbar2_btns:
            # The paging bar was not found in the current context; look for
            # it inside the rightFrame iframe.
            browser.switch_to.frame("rightFrame")
            entered_frame = True
            pagingbar2_btns = browser.find_elements_by_class_name("x-pagingbar2-btn")
        printLn("第{}页的按钮(class='x-pagingbar2-btn')长度为{}".format(currpageno - 1, len(pagingbar2_btns)))

        # NOTE(review): as before, the second paging button wins whenever
        # more than one exists; the "下一页" text match only decides for a
        # single button. Confirm that index 1 really is "next page".
        if len(pagingbar2_btns) > 1:
            nextpage_btn = pagingbar2_btns[1]
        else:
            nextpage_btn = next((btn for btn in pagingbar2_btns if btn.text == "下一页"), None)

        if nextpage_btn is None:
            # Without a next-page button there is no way to advance; stop
            # rather than spin or re-scrape the same page.
            printLn("未找到下一页按钮,停止翻页(第{}页)".format(currpageno))
            if entered_frame:
                browser.switch_to.default_content()
            break

        nextpage_btn.click()
        sleep(5)  # FIXME (from the original author): 5s or 10s?
        page_case_list = getCurrentPageCasesInfo(browser, currpageno)
        writeCasesToExcel(currpageno, rowsperpage, page_case_list, sheet)
        if entered_frame:
            browser.switch_to.default_content()
        currpageno += 1
    # The closing message previously repeated the "开始" (start) text.
    printLn("案件查询页面结果分页查询结束==================================")


# sarq: 案件的收案日期开始, sarqend:案件的收案日期结束,sheet: excel 的 sheet
# 根据案件的收案日期区间过滤案件
def search_case_by_sarq(browser, username, password, sarq, sarqend, sheet):
    """Log in, run the civil-case search for a received-date range and export it.

    Args:
        browser: WebDriver instance that drives every page interaction.
        username: Account name passed to ``login``.
        password: Account password passed to ``login``.
        sarq: Start of the case received-date range.
        sarqend: End of the case received-date range.
        sheet: Excel sheet handed to ``ajcxCaseResultPageParse`` for output.
    """
    began_at = time()

    # Simulated login, then jump to the page that leads to the case query.
    login(browser, username, password)
    switchToAjcxPage(browser)

    # Record the handle of the window we start from before switching away.
    # NOTE(review): the handle is not used again in this function.
    original_handle = browser.current_window_handle
    switchWindow(browser, "案件查询")
    printLnWindowTitle(browser)

    # Open the combined-query view.
    browser.find_element_by_id("zhcx").click()

    # Fill in the search conditions, then walk the (paginated) result pages.
    doMinShiCaseSearch(browser, sarq, sarqend)
    ajcxCaseResultPageParse(browser, sheet)

    elapsed = time() - began_at
    printLn("总共花费了 {} s".format(elapsed))
    printLn("爬取民事案件:收案区间({}-{})完成".format(sarq, sarqend))


# 设置 excel 表头
def writeExcelHeader(sheet):
    """Write the title cell and the column-header row of the result sheet.

    Row 0, column 0 receives the sheet title. Row 1 receives one label per
    column, starting with an empty label in column 0.

    Args:
        sheet: Worksheet-like object exposing ``write(row, col, value)``.
    """
    # One entry per column of row 1; the tuple index is the column number.
    column_titles = (
        "",
        "案号",
        "案件类别",
        "审判程序",
        "当事人",
        "收案日期",
        "立案人",
        "立案日期",
        "立案案由",
        "承办审判庭",
        "承办人",
        "提请报结日期",
        "结案日期",
        "是否归档",
        "结案案由",
        "结案方式",
        "是否涉自贸区",
        "评查状态",
        "经办法院",
        "原审案号",
        "原审法院",
        "原审结案方式",
        "归档日期",
        "归档人",
        "审限情况",
        "是否超审限",
        "超审限天数",
        "已交费(元)",
        "开庭地点",
        "适用程序",
        "案件来源",
        "当事人类型",
        "当事人证件类型",
        "当事人证件号码",
        "当事人地址",
        "当事人联系方式",
        "法定代表人",
        "法定代表人证件类型",
        "法定代表人证件号",
        "开庭开始时间",
        "开庭结束时间",
        "审判员",
        "人民陪审员",
        "代理审判员",
        "代理陪审员",
        "书记员",
    )

    sheet.write(0, 0, "华宇综合查询结果")
    for col, title in enumerate(column_titles):
        sheet.write(1, col, title)


# driver_path: 驱动目录,username: 用户名, password:密码,
# sarq:收案日期区间头, sarqend:收案日期区间尾, out_dir: 输出 excel 的路径
def crawl(driver_path, username, password, sarq, sarqend, out_dir):
    """Crawl the cases in a received-date range and save them as an .xls file.

    Args:
        driver_path: Browser driver location passed to
            ``initBrowerWindowAndLoginPage``.
        username: Account name used for the login.
        password: Account password used for the login.
        sarq: Start of the case received-date range.
        sarqend: End of the case received-date range.
        out_dir: Directory for the output workbook; created if missing.
    """
    browser = initBrowerWindowAndLoginPage(driver_path)
    try:
        work_book = xlwt.Workbook(encoding="utf-8")
        sheet = work_book.add_sheet("Sheet0")
        writeExcelHeader(sheet)
        search_case_by_sarq(browser, username, password, sarq, sarqend, sheet)
    finally:
        # Always release the browser and its driver process, even when the
        # crawl raises; the exception itself still propagates to the caller.
        browser.quit()

    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(out_dir, exist_ok=True)
    # Millisecond timestamp keeps successive runs from overwriting each other.
    file_name = "华宇民事案件综合查询结果{}.xls".format(int(1000 * time()))
    out_path = os.path.join(out_dir, file_name)
    work_book.save(out_path)
    printLn("写入Excel 文件{}完成".format(out_path))


# 每天自动爬取数据,默认审判程序为'一审',收案区间为昨天到今天
def autoCrawlPerDay():
    """Crawl the cases received between yesterday and today.

    Reads ``driver_path``, ``username``, ``password`` and ``out_dir`` from
    ``config.json`` in the current working directory and passes them to
    ``crawl`` together with the computed date range.
    """
    # The with-block closes config.json instead of leaking the file handle.
    with open("config.json", encoding="utf-8") as config_file:
        config = json.load(config_file)
    driver_path = config['driver_path']
    username = config['username']
    password = config['password']

    today = datetime.date.today()
    yesterday = today - datetime.timedelta(days=1)
    # isoformat() gives zero-padded YYYY-MM-DD (e.g. 2020-01-05), the same
    # shape as the date literals used elsewhere in this script; joining the
    # raw year/month/day numbers produced 2020-1-5 instead.
    sarq = yesterday.isoformat()
    sarqend = today.isoformat()

    out_dir = config['out_dir']
    crawl(driver_path, username, password, sarq, sarqend, out_dir)


def main(driver_path, username, password, sarq, sarqend, out_dir):
    """Forward every argument unchanged to ``crawl``."""
    crawl(
        driver_path=driver_path,
        username=username,
        password=password,
        sarq=sarq,
        sarqend=sarqend,
        out_dir=out_dir,
    )


if __name__ == '__main__':
    printLn("开始启动爬虫脚本,请静心等候......")
    # Load driver path, credentials and output directory; the with-block
    # closes config.json instead of leaking the file handle.
    with open("config.json", encoding="utf-8") as config_file:
        config = json.load(config_file)
    driver_path = config['driver_path']
    username = config['username']
    password = config['password']
    # The received-date range is hard-coded (format YYYY-MM-DD); edit both
    # values to crawl a different range.
    sarq = "2020-01-16"
    sarqend = "2020-01-16"
    out_dir = config['out_dir']
    main(driver_path, username, password, sarq, sarqend, out_dir)

log_conf.yml

# Logging configuration in the dictionary schema consumed by
# logging.config.dictConfig (the script loads it with yaml.safe_load).
version: 1
# Do not disable loggers that existed before this configuration is applied.
disable_existing_loggers: False
formatters:
  # Detailed layout: level, timestamp, source location, then the message.
  verbose:
    format: '%(levelname)s %(asctime)s %(pathname)s %(filename)s %(module)s %(funcName)s %(lineno)d: %(message)s'
  # Minimal layout: level and message only.
  simple:
    format: '%(levelname)s %(message)s'
handlers:
  # Writes to log.log with time-based rotation; 'D' selects day intervals.
  file_handler:
    class: logging.handlers.TimedRotatingFileHandler
    level: INFO
    formatter: verbose
    filename: log.log
    when: D
  # Echoes records to standard output.
  console:
    class: logging.StreamHandler
    level: INFO
    formatter: verbose
    stream: ext://sys.stdout
loggers:
  # Applies only to a logger named "simpleExample".
  simpleExample:
    level: WARNING
    handlers: [console]
    propagate: no
# The root logger accepts DEBUG and above and feeds both handlers; each
# handler then applies its own INFO threshold.
root:
  level: DEBUG
  handlers: [console, file_handler]

4. 代码的关键点分析

可以看到代码的结构是比较清晰的,但并不是一开始就如此。我大概花了一到两天的时间写出这个系统的爬虫主体,但解析 html 又花了我 3-4 天,因为页面标签的解析比较麻烦,还有后续的一些调试工作。解析主要用到 BeautifulSoup 和 xpath;xpath 谷歌浏览器本身已经支持(可以在开发者工具中直接复制元素的 xpath),所以省去了很多操作。

5. 值得看的教程

目前,单个机器的爬虫性能有限,实际生产中经常使用分布式爬虫,而支持得比较好的框架是 Scrapy。当然在 Python 中,还有 http 请求库 requests 和解析 html/xml 的库 BeautifulSoup。

Scrapy 官网

B 站视频-Python + Selenium Web自动化 全套教程 自动化测试 软件测试

Requests: 让 HTTP 服务人类

Beautiful Soup 4.4.0 文档

对于 requests 库函数, 最重要的是掌握其带 cookies 的方法和保持会话的方法。

带 cookies:

requests Cookie

如果某个响应中包含一些 cookie,你可以快速访问它们:

>>> url = 'http://example.com/some/cookie/setting/url'
>>> r = requests.get(url)

>>> r.cookies['example_cookie_name']
'example_cookie_value'

要想发送你的cookies到服务器,可以使用 cookies 参数:

>>> url = 'http://httpbin.org/cookies'
>>> cookies = dict(cookies_are='working')

>>> r = requests.get(url, cookies=cookies)
>>> r.text
'{"cookies": {"cookies_are": "working"}}'

Cookie 的返回对象为 RequestsCookieJar,它的行为和字典类似,但接口更为完整,适合跨域名跨路径使用。你还可以把 Cookie Jar 传到 Requests 中:

>>> jar = requests.cookies.RequestsCookieJar()
>>> jar.set('tasty_cookie', 'yum', domain='httpbin.org', path='/cookies')
>>> jar.set('gross_cookie', 'blech', domain='httpbin.org', path='/elsewhere')
>>> url = 'http://httpbin.org/cookies'
>>> r = requests.get(url, cookies=jar)
>>> r.text
'{"cookies": {"tasty_cookie": "yum"}}'

参见 cookies

requests 保持会话

会话对象

会话对象让你能够跨请求保持某些参数。它也会在同一个 Session 实例发出的所有请求之间保持 cookie,期间使用 urllib3 的 connection pooling 功能。所以如果你向同一主机发送多个请求,底层的 TCP 连接将会被重用,从而带来显著的性能提升。(参见 HTTP persistent connection)

会话对象具有主要的 Requests API 的所有方法。

我们来跨请求保持一些 cookie:

s = requests.Session()

s.get('http://httpbin.org/cookies/set/sessioncookie/123456789')
r = s.get("http://httpbin.org/cookies")

print(r.text)
# '{"cookies": {"sessioncookie": "123456789"}}'

会话也可用来为请求方法提供缺省数据。这是通过为会话对象的属性提供数据来实现的:

s = requests.Session()
s.auth = ('user', 'pass')
s.headers.update({'x-test': 'true'})

# both 'x-test' and 'x-test2' are sent
s.get('http://httpbin.org/headers', headers={'x-test2': 'true'})

任何你传递给请求方法的字典都会与已设置会话层数据合并。方法层的参数覆盖会话的参数。

不过需要注意,就算使用了会话,方法级别的参数也不会被跨请求保持。下面的例子只会和第一个请求发送 cookie ,而非第二个:

s = requests.Session()

r = s.get('http://httpbin.org/cookies', cookies={'from-my': 'browser'})
print(r.text)
# '{"cookies": {"from-my": "browser"}}'

r = s.get('http://httpbin.org/cookies')
print(r.text)
# '{"cookies": {}}'

如果你要手动为会话添加 cookie,就使用 Cookie utility 函数 来操纵 Session.cookies

会话还可以用作前后文管理器:

with requests.Session() as s:
s.get('http://httpbin.org/cookies/set/sessioncookie/123456789')

这样就能确保 with 区块退出后会话能被关闭,即使发生了异常也一样。

参见 会话对象

一般使用 with 关键字保持会话,

with requests.Session() as s:
# 若干请求
s.get('http://httpbin.org/cookies/set/sessioncookie/123456789')

No comments:

Post a Comment

Spring 知识

编程性事务 注意.使用Spring自带的事,在抛RuntimeException时会自动回演,代码抛其他异常不会回滚。 国结批处理中若需要支持回滚,可使用以下方法 import org . springframework . jdbc . datasource . DataSo...