python的使用:写csv文件、为爬虫添加代理ip、字典的相关用法
程序员文章站
2022-09-02 22:46:55
写csv文件
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup...
写csv文件
"""Scrape the Wikipedia text-editor comparison table and write it to editors.csv."""
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen("https://en.wikipedia.org/wiki/Comparison_of_text_editors")
bs_obj = BeautifulSoup(html, "lxml")

# The main comparison table is the first "wikitable" on the page.
table = bs_obj.findAll("table", {"class": "wikitable"})[0]
rows = table.findAll("tr")

# `with` guarantees the file is closed even if parsing raises, replacing the
# original open()/try/finally; newline='' is required by the csv module so
# Windows does not double the row terminators.
with open("editors.csv", "wt", newline='', encoding='utf-8') as csv_file:
    writer = csv.writer(csv_file)
    for row in rows:
        # Header rows use <th>, data rows use <td> — collect both.
        csv_row = [cell.get_text() for cell in row.findAll(['td', 'th'])]
        writer.writerow(csv_row)
抓取页面图片①
"""Download a single image and save it as picture.jpg."""
import urllib.request

# `with` closes the HTTP response when done — the original leaked the
# connection by never calling response.close().
with urllib.request.urlopen('https://imgsrc.baidu.com/forum/w%3D580/sign=fdcdb5b2314e251fe2f7e4f09784c9c2/16391f30e924b89915f86eb06f061d950b7bf677.jpg') as response:
    cat_img = response.read()

with open('picture.jpg', 'wb') as f:
    f.write(cat_img)
抓取页面图片②
"""Scrape .jpg image URLs from a Tieba page and download them as 0.jpg, 1.jpg, ..."""
import urllib.request
import re


def getHtml(url):
    """Fetch *url* and return the raw response body (bytes)."""
    page = urllib.request.urlopen(url)
    html = page.read()
    return html


def getImg(html):
    """Find every image URL matching the page's `src="...jpg" pic_ext`
    pattern in *html* and download each one to the working directory.

    Returns None (the downloads are the side effect).
    """
    if isinstance(html, bytes):
        # BUG FIX: urlopen().read() returns bytes, and a str regex cannot
        # search bytes (TypeError in Python 3) — decode first.
        html = html.decode('utf-8', errors='replace')
    reg = r'src="(.+?\.jpg)" pic_ext'
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    # enumerate replaces the manual x = 0 / x += 1 counter.
    for x, imgurl in enumerate(imglist):
        # BUG FIX: Python 3 has no urllib.urlretrieve; it lives in
        # urllib.request.urlretrieve.
        urllib.request.urlretrieve(imgurl, '%s.jpg' % x)


html = getHtml("https://tieba.baidu.com/p/2460150866")
print(getImg(html))
为爬虫添加代理ip
"""Fetch a what-is-my-IP page through a randomly chosen proxy, then list the
pagination links embedded in a forum thread page."""
import urllib.request
import random
from bs4 import BeautifulSoup
from urllib.request import urlopen

url = 'https://whatismyip.com.tw'
iplist = ['121.201.97.136:80', '117.135.164.170:80', '58.247.31.230:80']

proxy = random.choice(iplist)
# BUG FIX: the target URL is https, but the original handler only mapped
# 'http', so the proxy was silently bypassed. Register both schemes.
proxy_support = urllib.request.ProxyHandler({'http': proxy, 'https': proxy})
opener = urllib.request.build_opener(proxy_support)
urllib.request.install_opener(opener)

with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')
print(html)

# --- Extract the pagination links embedded in the thread page -------------
rawtext = urlopen("https://bbs.gfan.com/android-8397839-1-1.html").read()
soup = BeautifulSoup(rawtext, "html.parser")
targetDiv = soup.find('p', {'class': 'pg'})
# Guard against a layout change: find() returns None when no match exists,
# and the original would crash with AttributeError.
if targetDiv is not None:
    catalogLinks = targetDiv.find_all('a')
    # Skip the first anchor (the "current page" marker in this markup).
    indexlist = [link.get('href') for link in catalogLinks[1:]]
    for index in indexlist:
        print(index)
字典的相关用法
# Nested-dict manipulation demo: a fake "post" with a list of replies.
test = {
    "post": {
        "content": ""
    },
    "replys": [
        {
            "content": ""
        }
    ]
}
# Mutate nested values, add a new key to an existing reply, append a reply.
test["post"]["content"] = "xx"
test["replys"][0]["content"] = "yy"
test["replys"][0]["value"] = "zz"
test["replys"].append({"content": "", "title": "", "publish_date": ""})


def store(measurements):
    """Serialize *measurements* to measurements.json as JSON.

    measurements: any json-serializable object (here, a nested dict).
    """
    import json
    with open('measurements.json', 'w') as f:
        # BUG FIX: the original dumped the module-level `test` dict,
        # silently ignoring the `measurements` argument.
        f.write(json.dumps(measurements))


if __name__ == "__main__":
    store(test)