Python爬虫——实战三:爬取苏宁易购的商品价格
程序员文章站
2024-01-30 18:09:16
苏宁易购的商品价格请求URL为
https://pas.suning.com/nspcsale_0_000000000152709847_000000000152709847...
苏宁易购的商品价格请求URL为
https://pas.suning.com/nspcsale_0_000000000152709847_000000000152709847_0000000000_10_010_0100101_20268_1000000_9017_10106_Z001___R0104002_46.5_0_.html?callback=pcData&_=1503023077300
价格在返回的数据中结构为:pcData[0] -> data -> price -> saleInfo -> 0 -> promotionPrice。
但是上面的URL很难看出是什么结构,并且也不知道需要的信息应该从哪里获得。所以直接访问URL这种方法不具备通用性。下面改用PySide的QWebView加载并渲染商品页面,再通过CSS选择器(span.mainprice)从渲染后的页面中取出价格。
""" from PySide.QtGui import * from PySide.QtWebKit import * from PySide.QtCore import * """ from PySide import * import time class BrowserRender(QWebView): def __init__(self, show=True): self.app = QApplication(sys.argv) QWebView.__init__(self) if show: self.show() def download(self, url, timeout=60): loop = QEventLoop() timer = QTimer() timer.setSingleShot(True) timer.timeout.connect(loop.quit) self.loadFinished.connect(loop.quit) self.load(QUrl(url)) timer.start(timeout * 1000) loop.exec_() if timer.isActive(): timer.stop() return self.html() else: print "Request time out: " + url def html(self): return self.page().mainFrame().toHtml() def find(self, pattern): return self.page().mainFrame().findAllElements(pattern) def attr(self, pattern, name, value): for e in self.find(pattern): e.setAttribute(name, value) def text(self, pattern, value): for e in self.find(pattern): e.setPlainText(value) def click(self, pattern): for e in self.find(patter): e.evaluateJavaScript("this.click()") """ def wait_load(self, pattern, timeout=60): deadline = time.time() + timeout while time.time() < deadline: self.app.processEvents() matches = self.find(pattern) if matches: return matches print "wait load time out" """ 9 if __name__=="__main__": br = BrowserRender() br.download("https://product.suning.com/0000000000/152709847.html?srcpoint=index3_homepage1_32618213038_prod02") price = br.find("span.mainprice" ) print price[0].toPlainText().strip()