python爬虫爬取网页取得html并处理数据写入文件
来源: 网络用户发布,如有版权联系网管删除 2020-10-06
import re
import urllib.request

# Demo: extract the text sitting between 'entry >' and '</a>f' in a sample string.
string = "<a xxxxxxxxxxxxxxxxxxxxxxxxentry >某某内容</a>for aaaaaaaaaaaaaaaaaa"
result = re.findall(".*entry >(.*)</a>f.*", string)


def spider():
    """Main crawler scheduler.

    Fetches one page at a time, processes it, then asks the operator on
    stdin whether to continue with the next page.
    """
    isflow = True  # whether to go on to the next page
    page = 1
    while isflow:
        url = "http://www.dianzixuexi.com"
        # url = "http://www.dianzixuexi.com/article/list_5_" + str(page) + ".html"
        print(url)
        html = load(url)
        deal(html, page)
        panduan = input("是否继续(y/n)!")
        if panduan == "y":
            isflow = True
            page += 1
        else:
            isflow = False


def load(url):
    """Download *url* with a browser-like User-Agent and return the raw bytes."""
    header = {
        "User-Agent": " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
    }
    request = urllib.request.Request(url, headers=header)
    response = urllib.request.urlopen(request)
    return response.read()


def deal(html, page):
    """Regex-match titles and article bodies out of the fetched page and persist them.

    :param html: raw page content (bytes) fetched earlier
    :param page: page number currently being crawled
    """
    # Python 3 regexes need str, not bytes ("cannot use a string pattern on a
    # bytes-like object"), so the page must be decoded first.  chardet is a
    # third-party encoding detector; treat it as optional and fall back to
    # UTF-8 when it is missing or cannot decide.
    try:
        import chardet
        encoding = chardet.detect(html)["encoding"] or "utf-8"
    except ImportError:
        encoding = "utf-8"
    html = html.decode(encoding)

    # Compile the patterns once, outside the loop, as raw strings so that
    # backslash escapes such as \d stay literal.
    # parrten = re.compile(r'<li class="piclist\d+">(.*?)</li>', re.S)
    parrten = re.compile(r'">(.*?)</a>', re.S)
    parrten1 = re.compile(r'<a href="/article/\d+.html" rel="external nofollow" >(.*)</a>')
    parrten2 = re.compile(r'<div class="f18 mb20">(.*?)</div>', re.S)

    for title in parrten.findall(html):
        for t in parrten1.findall(title):
            # Strip bold markup from titles before writing.
            tr = t.replace("<b>", "").replace("</b>", "")
            writeData(tr, page)
        for t in parrten2.findall(title):
            # Strip paragraph/line-break markup and HTML quote entities from bodies.
            tr = (
                t.replace("<p>", "")
                .replace("</p>", "")
                .replace("<br>", "")
                .replace("<br />", "")
                .replace("&ldquo", "\"")
                .replace("&rdquo", "\"")
            )
            writeData(tr, page)


def writeData(context, page):
    """Append one matched line to the per-page output file.

    :param context: matched text to persist
    :param page: page number, used to build the output file name
    """
    fileName = "di" + str(page) + "yehtml.txt"
    # Open with an explicit UTF-8 encoding so Chinese text round-trips on
    # every platform; write(), not writelines(), because context is a single
    # string (writelines would iterate it character by character).
    with open(fileName, "a", encoding="utf-8") as file:
        file.write(context + "\n")


if __name__ == "__main__":
    # Demo of the sample regex defined at the top of the module.
    for x in result:
        print(x)
    # resp = urllib.request.urlopen('http://www.baidu.com')
    html2 = load("http://www.dianzixuexi.com")
    print(html2)
    # deal(html2, 1)
    spider()
查看评论 回复
"python爬虫爬取网页取得html并处理数据写入文件"的相关文章
- 上一篇:【编辑推荐】程序员修神之路--高并发优雅的做限流
- 下一篇:没有了