欢迎您访问程序员文章站!本站旨在为大家提供、分享程序员计算机编程知识。
您现在的位置是: 首页  >  IT编程

Python-爬虫小记

程序员文章站 2022-04-14 17:04:04
1 # -*-coding:utf8-*- 2 import requests 3 from bs4 import BeautifulSoup 4 import time 5 import os 6 import urllib 7 import re 8 import json 9 10 11 re... ......
 1 # -*-coding:utf8-*-
import json
import os
import re
import time
import urllib

import requests
from bs4 import BeautifulSoup
 9 
10 
# Turn off urllib3's warnings up front (presumably because get_bs sends its
# requests with certificate verification disabled).
requests.packages.urllib3.disable_warnings()

# Proxy endpoint per URL scheme, passed to every requests.get call.
# NOTE(review): the addresses are elided in this listing and need real values.
proxies = {
    "http": "http:.............................",
    "https": "https:.............................",
}

# Request headers sent with every page fetch.
headers = {
    'user-agent': (
        'mozilla/5.0 (windows nt 6.1) applewebkit/537.36 '
        '(khtml, like gecko) chrome/59.0.3071.115 safari/537.36'
    ),
}
18 
def get_bs(url, timeout=30):
    """Download *url* and return its body parsed as a BeautifulSoup document.

    The request is sent through the module-level ``proxies`` with the
    module-level ``headers``, and with TLS certificate verification turned
    off.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the server before ``requests`` raises a
            timeout error, so one stalled page cannot hang the whole crawl.

    Returns:
        A ``BeautifulSoup`` object built from the raw response bytes using
        the ``lxml`` parser.
    """
    res = requests.get(
        url,
        proxies=proxies,
        headers=headers,
        verify=False,  # certificate checks are deliberately skipped
        timeout=timeout,
    )
    # Hand the parser the undecoded bytes, exactly as the response delivered them.
    return BeautifulSoup(res.content, 'lxml')
23 
def get_first_url(page_count=213):
    """Collect absolute detail-page URLs from the paginated listing.

    Listing pages ``mold.php?page=1`` through ``mold.php?page=<page_count>``
    are fetched with ``get_bs``. Every anchor matched by the CSS selector
    ``"dt a"`` whose ``href`` contains ``"php"`` is prefixed with the site
    root and added to the result.

    Args:
        page_count: Number of listing pages to walk, starting at page 1.
            Defaults to 213, the value that used to be hard-coded here.

    Returns:
        A list of URL strings in the order they were encountered;
        duplicates are not removed.
    """
    first_url_list = []
    for page_no in range(1, page_count + 1):
        root_url = "https://www.model61.com/mold.php?page={}".format(page_no)
        bs = get_bs(root_url)
        for anchor in bs.select("dt a"):
            src = anchor.get('href')
            # An anchor without an href yields None; skip it rather than
            # raising TypeError on the substring test below.
            if src and "php" in src:
                first_url_list.append(
                    "https://www.model61.com/{}".format(src)
                )
    return first_url_list
36 
37 
38 
# Script entry point: run the crawl only when this file is executed directly.
# The list returned by get_first_url() is not used here.
if __name__ == "__main__":
    get_first_url()

 

上一篇: redux

下一篇: 关于c++11中的thread库