Source Code Notes
Scrapes the images from a specified page of the 彼岸桌面 (netbian.com) "meinv" series. This is the first crawler I've written as a beginner; feel free to point out any shortcomings, and thanks in advance.
import os
import re
import time

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36 Edg/92.0.902.78",
    "Connection": "close",
}

k = input("Which page do you want to scrape? ")
index_url = f'http://www.netbian.com/meinv/index_{k}.htm'
base_url = 'http://www.netbian.com'

resp = requests.get(index_url, headers=headers)
resp.encoding = 'gbk'  # the site is served in GBK encoding
html = resp.text

# Extract the sub-page paths from the index page.
sub_paths = re.findall('<a href="(.*?)" title=".*?" target="_blank">', html)
del sub_paths[0]   # drop the first match (not a wallpaper page)
sub_paths.pop()    # drop the last match (not a wallpaper page)

save_dir = f'4k_wallpapers_{k}'
os.makedirs(save_dir, exist_ok=True)

n = 1
for path in sub_paths:
    sub_url = base_url + path
    resp2 = requests.get(sub_url, headers=headers)
    resp2.encoding = 'gbk'
    child_page = BeautifulSoup(resp2.text, "html.parser")
    # The full-size image sits inside the div with class "pic".
    for img in child_page.find("div", attrs={"class": "pic"}).find_all("img"):
        src = img.get("src")  # direct download link of the image
        pic = requests.get(src, headers=headers)
        with open(os.path.join(save_dir, f"pic_{n}.jpg"), mode="wb") as f:
            f.write(pic.content)  # write the bytes before releasing the connection
        pic.close()
        print(f"Downloaded {n} wallpapers")
        n += 1
        time.sleep(1)  # be polite to the server
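One thing worth noting: a single failed request currently aborts the whole run. Below is a minimal sketch of a retry wrapper that could harden the downloads; fetch_with_retry is a hypothetical helper I'm adding for illustration, not part of the original script, and the retry count and delay are arbitrary assumptions.

import time
import requests

def fetch_with_retry(url, headers, retries=3, delay=2):
    """Try a GET request up to `retries` times before giving up.

    Hypothetical helper, not in the original script.
    """
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(url, headers=headers, timeout=10)
            resp.raise_for_status()  # raise on 4xx/5xx responses
            return resp
        except requests.RequestException as exc:
            print(f"Request to {url} failed ({exc}), attempt {attempt}/{retries}")
            time.sleep(delay)
    raise RuntimeError(f"Giving up on {url} after {retries} attempts")

With this in place, each requests.get(...) call in the script could be swapped for fetch_with_retry(..., headers=headers), so a transient network error retries a few times instead of crashing the crawler.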