Python multithreaded Baidu URL collector
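The script below takes a search keyword, queues the first 76 Baidu result pages (the pn parameter advances through results ten at a time, 0 through 750), and spawns four worker threads. Each worker fetches a result page, parses it with BeautifulSoup, extracts the anchors that carry a data-click attribute (Baidu's redirect links to the actual results), follows each redirect, and prints the resolved target URL.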
import requests
import sys
import re
from bs4 import BeautifulSoup as bs
from queue import Queue, Empty
import threading


class UrlCollector(threading.Thread):
    """Worker thread: pulls Baidu result-page URLs off the queue and spiders them."""

    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.__queue = queue

    def run(self):
        while True:
            try:
                # Non-blocking get avoids the race between empty() and get()
                # when several workers drain the queue at the same time.
                url = self.__queue.get(block=False)
            except Empty:
                break
            self.spider(url)

    def spider(self, url):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'}
        r = requests.get(url=url, headers=headers)
        html = r.content
        soup = bs(html, 'lxml')
        # Result links on a Baidu SERP carry a data-click attribute and no class;
        # their href points at a baidu.com/link?url=... redirector.
        links = soup.find_all(name='a', attrs={'data-click': re.compile('.'), 'class': None})
        for link in links:
            url_real = link['href']
            try:
                # Follow the redirector to resolve the real target URL.
                r_link_real = requests.get(url=url_real, headers=headers, timeout=8)
                if r_link_real.status_code == 200:
                    print(r_link_real.url)
            except Exception as e:
                print(e)


def main(keyword):
    queue = Queue()
    threads = []
    thread_num = 4
    # Baidu paginates with pn=0,10,20,...,750 -- ten results per page.
    for i in range(0, 760, 10):
        url = 'https://www.baidu.com/s?wd=%s&pn=%s' % (keyword, str(i))
        queue.put(url)
    for _ in range(thread_num):
        t = UrlCollector(queue)
        threads.append(t)
    for t in threads:
        t.start()
    for t in threads:
        t.join()


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print("Usage: %s keyword" % sys.argv[0])
        sys.exit(-1)
    else:
        main(sys.argv[1])
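Run it with a single keyword argument; the filename and the keyword here are illustrative:

    python3 baidu_collector.py sqlmap

Resolved URLs are printed to stdout as each redirect completes, so the output order varies with thread scheduling.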