欢迎您访问程序员文章站，本站旨在为大家提供分享程序员计算机编程知识！
您现在的位置是: 首页

python多线程百度url采集器

程序员文章站 2022-04-18 20:54:17
...

import requests
import sys
import re
from bs4 import BeautifulSoup as bs
from Queue import Queue
import threading
import os
			
class urlcollecter(threading.Thread):
	"""Worker thread that drains a shared queue of Baidu search-result
	page URLs, scrapes each page for result links, and prints the final
	(redirect-resolved) URL of every reachable hit to stdout."""

	def __init__(self, queue):
		threading.Thread.__init__(self)
		self.__queue = queue  # shared work queue of search-result page URLs

	def run(self):
		# get(block=False) instead of an empty()/get() pair: with several
		# workers on one queue, empty() can report False and another thread
		# can still drain the queue before our get() -- get would then block
		# forever.  A failed non-blocking get means the work is done.
		while True:
			try:
				url = self.__queue.get(block=False)
			except Exception:  # Queue.Empty (module-level name not imported here)
				break
			try:
				self.spider(url)
			except Exception as e:
				# One bad result page must not kill the worker thread.
				print(e)

	def spider(self, url):
		"""Fetch one Baidu result page and print the real target URL of
		each result link.

		Baidu wraps results in redirect anchors carrying a 'data-click'
		attribute (and no class); each is followed so the de-referenced
		final URL is printed, not the redirector.
		"""
		headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'}
		# timeout added: without it a stalled server hangs this worker forever
		r = requests.get(url=url, headers=headers, timeout=8)
		soup = bs(r.content, 'lxml')
		links = soup.find_all(name='a', attrs={'data-click': re.compile('.'), 'class': None})
		for link in links:
			url_real = link['href']
			try:
				r_link_real = requests.get(url=url_real, headers=headers, timeout=8)
				if r_link_real.status_code == 200:
					print(r_link_real.url)
			except Exception as e:  # skip unreachable links, keep crawling
				print(e)

				
def main(keyword, thread_num=4, pages=76):
	"""Enqueue Baidu search-result pages for *keyword* and crawl them.

	keyword    -- search term interpolated into the Baidu query URL.
	thread_num -- number of worker threads to start (default 4,
	              matching the original hard-coded value).
	pages      -- number of result pages to enqueue; Baidu paginates
	              with pn=0,10,20,... (default 76 pages == pn 0..750,
	              matching the original range(0, 760, 10)).
	"""
	queue = Queue()

	# NOTE(review): keyword is interpolated unescaped -- URL-encode it
	# (urllib quote) if keywords may contain spaces or non-ASCII text.
	for pn in range(0, pages * 10, 10):
		queue.put('https://www.baidu.com/s?wd=%s&pn=%s' % (keyword, str(pn)))

	# Original reused the loop variable `t` as both counter and thread
	# instance; build the worker list without the shadowing.
	threads = [urlcollecter(queue) for _ in range(thread_num)]

	for worker in threads:
		worker.start()

	# Wait for every worker to finish draining the queue.
	for worker in threads:
		worker.join()
		

if __name__ == '__main__':
	# Require exactly one CLI argument: the search keyword.
	if len(sys.argv) != 2:
		# print(...) form is valid on both Python 2 and Python 3;
		# the original bare print statement is Python-2-only.
		print("Enter %s keyword" % sys.argv[0])
		sys.exit(-1)
	else:
		main(sys.argv[1])
相关标签: Python实战