欢迎您访问程序员文章站，本站旨在为大家提供分享程序员计算机编程知识！
您现在的位置是: 首页

python多线程百度url采集器

程序员文章站 2022-04-18 20:54:17
...

import requests
import sys
import re
from bs4 import BeautifulSoup as bs
from Queue import Queue
import threading
import os
			
class urlcollecter(threading.Thread):
	"""Worker thread that drains a shared queue of Baidu search-result
	page URLs, scrapes each page for result links, and prints the final
	(redirect-resolved) URL of every reachable hit to stdout."""

	def __init__(self, queue):
		threading.Thread.__init__(self)
		self.__queue = queue  # shared work queue of search-result page URLs

	def run(self):
		# get(block=False) instead of an empty()/get() pair: with several
		# workers on one queue, empty() can report False and another thread
		# can still drain the queue before our get() -- get would then block
		# forever.  A failed non-blocking get means the work is done.
		while True:
			try:
				url = self.__queue.get(block=False)
			except Exception:  # Queue.Empty (module-level name not imported here)
				break
			try:
				self.spider(url)
			except Exception as e:
				# One bad result page must not kill the worker thread.
				print(e)

	def spider(self, url):
		"""Fetch one Baidu result page and print the real target URL of
		each result link.

		Baidu wraps results in redirect anchors carrying a 'data-click'
		attribute (and no class); each is followed so the de-referenced
		final URL is printed, not the redirector.
		"""
		headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'}
		# timeout added: without it a stalled server hangs this worker forever
		r = requests.get(url=url, headers=headers, timeout=8)
		soup = bs(r.content, 'lxml')
		links = soup.find_all(name='a', attrs={'data-click': re.compile('.'), 'class': None})
		for link in links:
			url_real = link['href']
			try:
				r_link_real = requests.get(url=url_real, headers=headers, timeout=8)
				if r_link_real.status_code == 200:
					print(r_link_real.url)
			except Exception as e:  # skip unreachable links, keep crawling
				print(e)

				
def main(keyword, thread_num=4, pages=76):
	"""Enqueue Baidu search-result pages for *keyword* and crawl them.

	keyword    -- search term interpolated into the Baidu query URL.
	thread_num -- number of worker threads to start (default 4,
	              matching the original hard-coded value).
	pages      -- number of result pages to enqueue; Baidu paginates
	              with pn=0,10,20,... (default 76 pages == pn 0..750,
	              matching the original range(0, 760, 10)).
	"""
	queue = Queue()

	# NOTE(review): keyword is interpolated unescaped -- URL-encode it
	# (urllib quote) if keywords may contain spaces or non-ASCII text.
	for pn in range(0, pages * 10, 10):
		queue.put('https://www.baidu.com/s?wd=%s&pn=%s' % (keyword, str(pn)))

	# Original reused the loop variable `t` as both counter and thread
	# instance; build the worker list without the shadowing.
	threads = [urlcollecter(queue) for _ in range(thread_num)]

	for worker in threads:
		worker.start()

	# Wait for every worker to finish draining the queue.
	for worker in threads:
		worker.join()
		

if __name__ == '__main__':
	# Require exactly one CLI argument: the search keyword.
	if len(sys.argv) != 2:
		# print(...) form is valid on both Python 2 and Python 3;
		# the original bare print statement is Python-2-only.
		print("Enter %s keyword" % sys.argv[0])
		sys.exit(-1)
	else:
		main(sys.argv[1])
相关标签: Python实战