Python-爬虫小记
程序员文章站
2022-04-14 17:04:04
1 # -*-coding:utf8-*- 2 import requests 3 from bs4 import BeautifulSoup 4 import time 5 import os 6 import urllib 7 import re 8 import json 9 10 11 re... ......
# -*-coding:utf8-*-
"""Collect detail-page links from the paginated model61.com listing."""
import json
import os
import re
import time
import urllib

import requests
from bs4 import BeautifulSoup

# Requests below are sent with verify=False; silence the urllib3 warning
# that would otherwise be emitted for each unverified HTTPS request.
requests.packages.urllib3.disable_warnings()

# NOTE(review): the proxy addresses are elided placeholders in the published
# listing -- replace them with real proxy URLs before running.
proxies = {"http": "http:.............................",
           "https": "https:............................."}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
}


def get_bs(url, timeout=30):
    """Fetch *url* through the configured proxies and parse it with lxml.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled proxy or host cannot hang the crawl indefinitely.

    Returns:
        A ``BeautifulSoup`` tree built from the raw response body.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    res = requests.get(url, proxies=proxies, headers=headers,
                       verify=False, timeout=timeout)
    return BeautifulSoup(res.content, 'lxml')


def get_first_url(page=213):
    """Walk the listing pages and gather the links found under ``dt a``.

    Args:
        page: Number of listing pages to visit, starting from page 1.

    Returns:
        A list of absolute URLs, one for every ``dt a`` anchor whose
        ``href`` contains ``"php"``, in the order encountered.
    """
    first_url_list = []
    for page_no in range(1, page + 1):
        root_url = "https://www.model61.com/mold.php?page={}".format(page_no)
        bs = get_bs(root_url)
        for anchor in bs.select("dt a"):
            src = anchor.get('href')
            # Anchors without an href yield None; skip them instead of
            # raising TypeError on the substring test.
            if src and "php" in src:
                first_url = "https://www.model61.com/{}".format(src)
                first_url_list.append(first_url)
    return first_url_list


if __name__ == '__main__':
    get_first_url()
上一篇: redux
下一篇: 关于c++11中的thread库