python爬虫爬取网页取得html并处理数据写入文件
来源: 网络用户发布,如有版权联系网管删除 2020-10-06
import re
import urllib.request

# Demo: extract the text sitting between 'entry >' and '</a>f' in a sample string.
string = "<a xxxxxxxxxxxxxxxxxxxxxxxxentry >某某内容</a>for aaaaaaaaaaaaaaaaaa"
result = re.findall(".*entry >(.*)</a>f.*", string)


def spider():
    """Main crawler scheduler.

    Fetches one page at a time, processes it, then asks the operator on
    stdin whether to continue with the next page.
    """
    isflow = True  # whether to go on to the next page
    page = 1
    while isflow:
        url = "http://www.dianzixuexi.com"
        # url = "http://www.dianzixuexi.com/article/list_5_" + str(page) + ".html"
        print(url)
        html = load(url)
        deal(html, page)
        panduan = input("是否继续(y/n)!")
        if panduan == "y":
            isflow = True
            page += 1
        else:
            isflow = False


def load(url):
    """Download *url* with a browser-like User-Agent and return the raw bytes."""
    header = {
        "User-Agent": " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=header)
    response = urllib.request.urlopen(request)
    return response.read()


def deal(html, page):
    """Regex-match titles and article bodies out of the fetched page and persist them.

    :param html: raw page content (bytes) fetched earlier
    :param page: page number currently being crawled
    """
    # Python 3 regexes need str, not bytes ("cannot use a string pattern on a
    # bytes-like object"), so the page must be decoded first.  chardet is a
    # third-party encoding detector; treat it as optional and fall back to
    # UTF-8 when it is missing or cannot decide.
    try:
        import chardet
        encoding = chardet.detect(html)["encoding"] or "utf-8"
    except ImportError:
        encoding = "utf-8"
    html = html.decode(encoding)

    # Compile the patterns once, outside the loop, as raw strings so that
    # backslash escapes such as \d stay literal.
    # parrten = re.compile(r'<li class="piclist\d+">(.*?)</li>', re.S)
    parrten = re.compile(r'">(.*?)</a>', re.S)
    parrten1 = re.compile(r'<a href="/article/\d+.html" rel="external nofollow" >(.*)</a>')
    parrten2 = re.compile(r'<div class="f18 mb20">(.*?)</div>', re.S)

    for title in parrten.findall(html):
        for t in parrten1.findall(title):
            # Strip bold markup from titles before writing.
            tr = t.replace("<b>", "").replace("</b>", "")
            writeData(tr, page)
        for t in parrten2.findall(title):
            # Strip paragraph/line-break markup and HTML quote entities from bodies.
            tr = (
                t.replace("<p>", "")
                .replace("</p>", "")
                .replace("<br>", "")
                .replace("<br />", "")
                .replace("&ldquo", "\"")
                .replace("&rdquo", "\"")
            )
            writeData(tr, page)


def writeData(context, page):
    """Append one matched line to the per-page output file.

    :param context: matched text to persist
    :param page: page number, used to build the output file name
    """
    fileName = "di" + str(page) + "yehtml.txt"
    # Open with an explicit UTF-8 encoding so Chinese text round-trips on
    # every platform; write(), not writelines(), because context is a single
    # string (writelines would iterate it character by character).
    with open(fileName, "a", encoding="utf-8") as file:
        file.write(context + "\n")


if __name__ == "__main__":
    # Demo of the sample regex defined at the top of the module.
    for x in result:
        print(x)
    # resp = urllib.request.urlopen('http://www.baidu.com')
    html2 = load("http://www.dianzixuexi.com")
    print(html2)
    # deal(html2, 1)
    spider()
查看评论 回复
"python爬虫爬取网页取得html并处理数据写入文件"的相关文章
- 上一篇:【编辑推荐】程序员修神之路--高并发优雅的做限流
- 下一篇:没有了