@远方很远
from multiprocess import Process  # third-party package; the standard-library module is multiprocessing
import time

def func():
    for i in range(10):
        print('child', i)
        time.sleep(1)

if __name__ == '__main__':
    p = Process(target=func)
    p.start()
    for i in range(10):
        print('main', i)
        time.sleep(1)
Why is it so hard to get this running?
Send me a copy, I need to take a look.
I think there's something wrong with my module.
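If the import itself is what fails, note that multiprocess is a third-party package, while the standard library ships multiprocessing with the same Process API. A minimal sketch using the stdlib module, just to rule the module out:

from multiprocessing import Process
import time

def func():
    for i in range(5):
        print('child', i)
        time.sleep(1)

if __name__ == '__main__':   # the guard is required on Windows, where child processes are spawned
    p = Process(target=func)
    p.start()
    p.join()                 # wait for the child process to finish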
import requests
import re
import json
import os
from requests.exceptions import RequestException

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Referer': 'https://maoyan.com',
    'Cookie': '__mta=150257545.1631492828404.1631494019461.1631494128206.11; uuid_n_v=v1; uuid=527644E0142911ECB55CBB305D2AA0B10206AFBAB4CB475088C931781076E1FA; _csrf=5e4a62b4751f4510de197b6bc0c95135582879125c178b5ff5f2ebd94019dba5; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1631492828; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=17bdc8cea2dc8-00c198070d82d5-3e604809-15f900-17bdc8cea2ec8; _lxsdk=527644E0142911ECB55CBB305D2AA0B10206AFBAB4CB475088C931781076E1FA; __mta=150257545.1631492828404.1631492828404.1631492831171.2; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1631494128; _lxsdk_s=17bdc8cea2f-7eb-b35-003%7C%7C26'
}

# Fetch the HTML source of a single page
def get_one_page(url, headers):
    try:
        response = requests.get(url, headers=headers)  # request the page
        if response.status_code == 200:  # 200 means the request succeeded
            return response.text
        return None
    except RequestException:
        return None

# Parse one page: extract the fields with a regular expression
def parse_one_page(html):
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'index': item[0],
            'image': item[1],
            'title': item[2],
            'actor': item[3].strip()[3:],   # drop the leading label text
            'time': item[4].strip()[5:],    # drop the leading label text
            'score': item[5] + item[6]
        }

# Append one record to the result file
def write_to_file(content):
    # encoding='utf-8' and ensure_ascii=False keep the Chinese text readable in the file
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')

# Download a movie poster
def save_image_file(url, path):
    jd = requests.get(url)
    if jd.status_code == 200:
        with open(path, 'wb') as f:
            f.write(jd.content)

# Main routine: fetch, parse and save one page at the given offset
def main(offset):
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url, headers)
    if not os.path.exists('movieImgs'):
        os.mkdir('movieImgs')
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)
        save_image_file(item['image'], 'movieImgs/' + item['title'] + '.jpg')

if __name__ == '__main__':
    # crawl every page of the board
    for i in range(10):
        main(i * 10)
Then where am I supposed to add that? I don't know where to continue the crawl from.
I'd suggest dropping all the regex and using lxml instead.
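For example, parse_one_page from the script above could be rewritten with lxml roughly like this. This is only a sketch: the XPath expressions are guessed from the class names the regex already targets and have not been checked against the live page.

from lxml import etree

def parse_one_page(html):
    doc = etree.HTML(html)
    for dd in doc.xpath('//dd'):   # one <dd> per movie, the same anchor the regex used
        index = dd.xpath('.//i[contains(@class, "board-index")]/text()')
        if not index:              # skip any <dd> that isn't a movie entry
            continue
        image = dd.xpath('.//@data-src')
        title = dd.xpath('.//*[contains(@class, "name")]/a/text()')
        actor = dd.xpath('.//p[contains(@class, "star")]/text()')
        rtime = dd.xpath('.//p[contains(@class, "releasetime")]/text()')
        score = dd.xpath('.//i[@class="integer"]/text()') + dd.xpath('.//i[@class="fraction"]/text()')
        yield {
            'index': index[0],
            'image': image[0] if image else '',
            'title': title[0].strip() if title else '',
            'actor': actor[0].strip()[3:] if actor else '',
            'time': rtime[0].strip()[5:] if rtime else '',
            'score': ''.join(score),
        }

A missing element then shows up as an empty field instead of the whole regex silently matching nothing.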
Here's what I usually use:
import requests
from lxml import etree
from bs4 import BeautifulSoup

# urls and the headers() helper are defined elsewhere in my script
r1 = requests.get(url=urls, headers=headers(), verify=False)
r1 = r1.content.decode('gbk', errors='ignore')
obj1 = etree.HTML(r1)
soup = BeautifulSoup(r1, 'lxml')
标题 = obj1.xpath('//*[@id="content"]/div[1]/h1/text()')   # 标题 = the page title
The title comes out directly.
# starturl is the site's base URL, defined earlier
nexturls = obj1.xpath('//*[@id="content"]/div[1]/div/a[3]/@href')
nexturls = starturl + nexturls[0]
r2 = requests.get(url=nexturls, headers=headers(), verify=False)
r2 = r2.content.decode('gbk', errors='ignore')
That's how the second level of the crawl starts. Also, fetching one page at a time is far too slow, so I'd suggest building a header pool.
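Strung together, that next-link pattern is just a loop. A rough sketch continuing the snippet above (starturl, headers() and the gbk decoding are assumed from there, and it assumes every page exposes the same a[3] "next" link):

import requests
from lxml import etree

nexturl = starturl + nexturls[0]   # first "next" link, taken from the xpath result above
while nexturl:
    r = requests.get(url=nexturl, headers=headers(), verify=False)
    page = etree.HTML(r.content.decode('gbk', errors='ignore'))
    # ... pull whatever you need out of `page` here ...
    href = page.xpath('//*[@id="content"]/div[1]/div/a[3]/@href')
    nexturl = starturl + href[0] if href else None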
Plus multithreading; it took me over an hour to crawl a single novel.
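For the Maoyan script above, a minimal multithreading sketch with concurrent.futures, assuming main(offset) from that script and an arbitrary cap of 8 workers:

from concurrent.futures import ThreadPoolExecutor

if __name__ == '__main__':
    # run main(offset) for all 10 board pages concurrently instead of one by one
    with ThreadPoolExecutor(max_workers=8) as pool:
        pool.map(main, [i * 10 for i in range(10)])

One caveat: write_to_file would then append from several threads at once, so either protect it with a threading.Lock or collect the results and write them after the pool finishes.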
Oh my god hahaha, I really need to sit down and study this.
This applied-math major is already in tears.
You also need to build a UA pool and a proxy IP pool.
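A UA pool and a proxy pool mostly come down to random.choice over a couple of lists before each request, which is also why the snippets above call headers() as a function. A sketch with placeholder entries (the proxy addresses are made up, and fetch() is just a stand-in wrapper showing where the pools plug in):

import random
import requests

# placeholder pools: fill these with your own UA strings and live proxies
UA_POOL = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36',
]
PROXY_POOL = [
    'http://127.0.0.1:8001',   # placeholder
    'http://127.0.0.1:8002',   # placeholder
]

def headers():
    # pick a fresh User-Agent for every request
    return {'User-Agent': random.choice(UA_POOL)}

def fetch(url):
    # route the request through a randomly chosen proxy
    proxy = random.choice(PROXY_POOL)
    return requests.get(url, headers=headers(),
                        proxies={'http': proxy, 'https': proxy},
                        timeout=10)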