Scraping Coursera course information with Python
Over the past few days I scraped course information from Coursera's dynamically rendered pages, covering Data Analysis, Machine Learning, and Probability and Mathematical Statistics. Because the amount of data is large, only the Data Analysis module was scraped in detail. The data is collected for later data analysis and machine-learning sentiment processing. The code and screenshots follow:
1. The top-level module page
The page looks like this:
The scraping code:
```python
# encoding: utf-8
from selenium import webdriver
import csv


class getmodel():
    '''
    Purpose: get the type, title and URL of every top-level module on a result page
    Members: the browser object and the URL of the page
    Methods: __init__, geteverymodel (collects the info of every module), savetocsv (appends one row to the CSV)
    '''
    def __init__(self, url):
        self.__browser = webdriver.Chrome()
        self.__page_url = url

    def savetocsv(self, this_list):
        with open("model.csv", "a", newline='', encoding='utf-8') as f:
            writer = csv.writer(f, delimiter=',')
            writer.writerow(this_list)

    def geteverymodel(self):
        # implicit wait
        self.__browser.implicitly_wait(30)
        self.__browser.get(self.__page_url)
        model_type = self.__browser.find_elements_by_css_selector(".product-type-row.horizontal-box")
        model_title = self.__browser.find_elements_by_css_selector(".color-primary-text.card-title.headline-1-text")
        model_url = self.__browser.find_elements_by_css_selector(".rc-desktopsearchcard.anchor-wrapper.browse-result-card")
        for i in range(len(model_type)):
            every_model = []
            every_model.append(model_type[i].text)
            every_model.append(model_title[i].text)
            every_model.append(model_url[i].get_attribute('href'))
            self.savetocsv(every_model)

    def closeurl(self):
        self.__browser.close()
```
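The post never shows the glue code that drives this class, so here is a minimal driver sketch; the search URL pattern and the number of result pages are assumptions for illustration, not taken from the original post:

```python
# Hypothetical driver for getmodel: walks a few Coursera search-result pages.
# The URL pattern and the page count below are assumptions, not from the post.
if __name__ == '__main__':
    base_url = "https://www.coursera.org/search?query=data%20analysis&page={0}"
    for page in range(1, 4):                      # e.g. the first three result pages
        spider = getmodel(base_url.format(page))  # opens a fresh browser for this page
        spider.geteverymodel()                    # appends every module row to model.csv
        spider.closeurl()
```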
The results are as follows:
2. Basic information about the courses within each module
The page looks like this:
To see all of the courses, you first have to simulate a click on the button that expands the full list:
The code is as follows:
```python
# encoding: utf-8
from selenium import webdriver
import csv
import time


class getcourse():
    '''
    Purpose: get the information of every course inside one top-level module
    Members: the browser object, the module URL, the module type and the module title
    Methods: __init__, geteverycourse (collects the info of every course), savetocsv (appends one row to the CSV)
    '''
    def __init__(self, url, model_type, model_title):
        self.__browser = webdriver.Chrome()
        self.__page_url = url
        self.__model_type = model_type
        self.__model_title = model_title

    def savetocsv(self, this_list):
        with open("course2.csv", "a", newline='', encoding='utf-8') as f:
            writer = csv.writer(f, delimiter=',')
            writer.writerow(this_list)

    def geteverycourse(self):
        self.__browser.get(self.__page_url)
        # implicit wait
        self.__browser.implicitly_wait(15)
        try:
            # click the button that expands the full course list
            button = self.__browser.find_element_by_css_selector(".button_1w8tm98-o_o-default_9vdknu-o_o-md_1jvotax.m-t-1.d-block.m-x-auto")
            button.click()
        except:
            print("nothing more to expand")
        finally:
            self.__browser.implicitly_wait(20)
        course_title = self.__browser.find_elements_by_css_selector(".h2_1pmnvep-o_o-weightbold_uvlhiv-o_o-bold_1byw3y2.m-b-2")
        # course_url = self.__browser.find_elements_by_css_selector(".col_i9j08c-o_o-xscol12_1m1ceo5-o_o-mdcol10_1eb21lj-o_o-lgcol10_ra5osh.p-b-3.border-bottom a")
        course_score = self.__browser.find_elements_by_css_selector(".h4_1k76nzj-o_o-weightbold_uvlhiv-o_o-bold_1byw3y2.m-l-1s")
        commet_num = self.__browser.find_elements_by_css_selector(".p_gjs17i-o_o-weightnormal_s9jwp5-o_o-fontbody_56f0wi.m-r-1s")
        comment_number = self.__browser.find_elements_by_css_selector(".reviewscount")
        for i in range(len(course_title)):
            every_course = []
            every_course.append(course_title[i].text)
            every_course.append(course_score[i].text[0:4])
            every_course.append(commet_num[i].text)
            every_course.append(comment_number[i].text)
            every_course.append(self.__model_type)
            every_course.append(self.__model_title)
            self.savetocsv(every_course)

    def closeurl(self):
        self.__browser.close()
```
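Again the driving code is not shown in the post; a minimal sketch, assuming the rows of model.csv are exactly the (type, title, url) triples written by geteverymodel above:

```python
# Hypothetical driver for getcourse: replays the rows written by getmodel.
import csv

if __name__ == '__main__':
    with open("model.csv", encoding='utf-8') as f:
        for model_type, model_title, model_url in csv.reader(f):
            spider = getcourse(model_url, model_type, model_title)
            spider.geteverycourse()   # appends this module's courses to course2.csv
            spider.closeurl()
```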
The results are as follows:
3. Detailed information for each course
The page looks like this:
The code is as follows:
```python
# encoding: utf-8
from selenium import webdriver
import csv


class getdetail():
    '''
    Purpose: get the detailed information of one course
    Members: the browser object and the information about the course scraped earlier
    Methods: __init__, geteverydetail (collects the course detail page), savetocsv (appends one row to the CSV)
    '''
    def __init__(self, title, url, score, mark_people_num, comment_people_num, model_type, model_title):
        self.__browser = webdriver.Chrome()
        self.__page_title = title
        self.__page_url = url
        self.__score = score
        self.__mark_people_num = mark_people_num
        self.__comment_people_num = comment_people_num
        self.__model_type = model_type
        self.__model_title = model_title

    def savetocsv(self, this_list):
        with open("course_detail.csv", "a", newline='', encoding='utf-8') as f:
            writer = csv.writer(f, delimiter=',')
            writer.writerow(this_list)

    def geteverydetail(self):
        self.__browser.get(self.__page_url)
        # implicit wait
        self.__browser.implicitly_wait(20)
        course_registered_num = self.__browser.find_element_by_css_selector(".enrolledlargefont_16g5ucx span strong span").text
        course_provide = self.__browser.find_element_by_css_selector(".partnerbanner_np2ice-o_o-box_120drhm-o_o-displayflex_poyjc div img").get_attribute("title")
        course_information_view_num = self.__browser.find_element_by_css_selector(".viewswithtextonly_1fs65xr span span").text
        course_information = self.__browser.find_element_by_css_selector(".content-inner").text
        what_we_learned = self.__browser.find_elements_by_css_selector(".col_i9j08c-o_o-xscol12_1m1ceo5-o_o-mdcol6_1rbv01c.m-b-1")
        what_we_learned_result = ''
        for i in range(len(what_we_learned)):
            what_we_learned_result += what_we_learned[i].text
        acquired_skills = self.__browser.find_elements_by_css_selector(".pill_56iw91.m-r-1s.m-b-1s")
        acquired_skills_result = ''
        for i in range(len(acquired_skills)):
            acquired_skills_result += acquired_skills[i].text
            acquired_skills_result += ' '
        course_evaluation = self.__browser.find_elements_by_css_selector(".quote.caption-text")
        course_evaluation_result = ''
        for i in range(len(course_evaluation)):
            course_evaluation_result += course_evaluation[i].text
            course_evaluation_result += ' '
        all_one = self.__browser.find_elements_by_css_selector(".h4_1k76nzj-o_o-weightbold_uvlhiv-o_o-bold_1byw3y2.m-b-0")
        all_two = self.__browser.find_elements_by_css_selector(".font-sm.text-secondary")
        finish_time = all_one[3].text
        advice_time = all_two[2].text
        course_language = all_one[4].text
        subtitle_language = all_two[3].text
        course_lecturer = self.__browser.find_elements_by_css_selector(".link-no-style")
        course_lecturer_result = ''
        for i in range(len(course_lecturer)):
            # fixed: the original appended the lecturer text to course_evaluation by mistake
            course_lecturer_result += course_lecturer[i].text
            course_lecturer_result += ' '
        every_course_detail = []
        every_course_detail.append(self.__page_title)
        every_course_detail.append(self.__page_url)
        every_course_detail.append(self.__score)
        every_course_detail.append(self.__mark_people_num)
        every_course_detail.append(self.__comment_people_num)
        every_course_detail.append(self.__model_type)
        every_course_detail.append(self.__model_title)
        every_course_detail.append(course_registered_num)
        every_course_detail.append(course_provide)
        every_course_detail.append(course_information_view_num)
        every_course_detail.append(course_information)
        every_course_detail.append(what_we_learned_result)
        every_course_detail.append(acquired_skills_result)
        every_course_detail.append(course_evaluation_result)
        every_course_detail.append(finish_time)
        every_course_detail.append(advice_time)
        every_course_detail.append(course_language)
        every_course_detail.append(subtitle_language)
        self.savetocsv(every_course_detail)

    def closeurl(self):
        self.__browser.close()
```
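The glue code for this step is also omitted. A sketch of how it might be driven, assuming each CSV row already carries the course URL; note that geteverycourse above does not actually write a URL column because the course_url selector is commented out, so that line would have to be re-enabled first:

```python
# Hypothetical driver for getdetail; assumes seven columns per row, including the course URL.
import csv

if __name__ == '__main__':
    with open("course2.csv", encoding='utf-8') as f:
        for title, url, score, mark_num, comment_num, m_type, m_title in csv.reader(f):
            spider = getdetail(title, url, score, mark_num, comment_num, m_type, m_title)
            spider.geteverydetail()   # appends one row per course to course_detail.csv
            spider.closeurl()
```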
The results are as follows:
4. Scraping all of the reviews
I scraped all of the courses, ranked them, and took the top six:
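The post does not say how the ranking was produced or which criterion was used; one possible way to do it with pandas, sorting by the number of ratings, is sketched below. The column names are assumptions, since course2.csv is written without a header row:

```python
# Hypothetical ranking step: sort the courses scraped above by their rating count.
import pandas as pd

columns = ['title', 'score', 'mark_people_num', 'comment_people_num', 'model_type', 'model_title']
df = pd.read_csv("course2.csv", names=columns)
# counts such as "12,345 ratings" need the non-digit characters stripped before sorting
df['mark_people_num'] = df['mark_people_num'].astype(str).str.replace(r'\D', '', regex=True).astype(int)
top6 = df.sort_values('mark_people_num', ascending=False).head(6)
print(top6[['title', 'score', 'mark_people_num']])
```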
Then I scraped all of the reviews of these six courses. The code is as follows:
```python
# encoding: utf-8
from selenium import webdriver
import csv
import pandas as pd
import re
import os.path


def savetocsv(url_title, this_list):
    with open("{0}_review.csv".format(url_title), "a", newline='', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter=',')
        writer.writerow(this_list)


def get_review(url_title, i):
    '''
    :param url_title: the course slug used in the review page URL
    :param i: the page number of the review listing
    '''
    try:
        url = 'https://www.coursera.org/learn/' + url_title + '/reviews' + '?page=' + str(i)
        browser = webdriver.Chrome()
        browser.get(url)
        # implicit wait
        browser.implicitly_wait(20)
        every_view = browser.find_elements_by_css_selector(".row_nvwp6p.review.review-page-review.m-b-2")
        n = 1
        for view in every_view:  # renamed from i so the page number is not shadowed
            s = view.find_elements_by_css_selector(".starrating_1qk9an0-o_o-noneditable_1ko0lno label")
            # star rating
            evaluation_star = 0
            for j in s:
                if j.text == 'filled star':
                    evaluation_star = evaluation_star + 1
                elif j.text == 'half faded star':
                    evaluation_star = evaluation_star + 0.5
            # author and date
            author = view.find_element_by_css_selector(".reviewername.p-x-1s.m-b-0.text-secondary.font-xs").text[4:]
            time = view.find_element_by_css_selector(".dateofreview.p-x-1s.m-b-0.text-secondary.font-xs").text
            # review text: join multi-paragraph reviews with '@@'
            reviewtextall = view.find_elements_by_css_selector(".reviewtext div div p")
            ss = []
            for j in reviewtextall:
                ss.append(j.text)
            reviewtext = '@@'.join(ss)
            # number of "helpful" votes; the Chinese label below is the page text shown when there are none
            zan = view.find_element_by_css_selector(".col_i9j08c-o_o-mdcol4_cuxg7k.e2e-helpful-button-col").text
            if zan == '这些很有用:':
                like_num = 0
            else:
                number = re.findall(r"\d+\.?\d*", zan)
                like_num = int(number[0])
            list_data = []
            list_data.append(evaluation_star)
            list_data.append(author)
            list_data.append(time)
            list_data.append(reviewtext)
            list_data.append(like_num)
            savetocsv(url_title, list_data)
            n = n + 1
        browser.close()
    except:
        print("error, retrying this page")
        get_review(url_title, i)


def get_result(url_title, page_num):
    # iterate over the review pages
    for i in range(1, page_num + 1):
        try:
            get_review(url_title, i)  # scrape page i
            print("*********page {num} of course {course} finished".format(course=url_title, num=i))
        except Exception as ex:
            print("*********page {num} of course {course} failed - {ex}".format(course=url_title, num=i, ex=ex))


def chucuo(k, v):
    file_name = "{0}_review.csv".format(k)
    # CSV header
    csv_head = ['evaluation_star', 'author', 'time', 'reviewtext', 'like_num']
    # write the header once
    with open(file_name, "a", newline='', encoding='utf-8') as f:
        file_empty = os.stat(file_name).st_size == 0
        writer = csv.writer(f, delimiter=',')
        if file_empty:  # avoid writing the header twice when the script is re-run
            writer.writerow(csv_head)
    get_result(k, v)


def main():
    url_title = {'data-scientists-tools': 165, 'what-is-datascience': 136, 'r-programming': 124,
                 'python-data-analysis': 119, 'open-source-tools-for-data-science': 60,
                 'python-for-applied-data-science-ai': 54}
    for k, v in url_title.items():
        chucuo(k, v)


if __name__ == '__main__':
    main()
```
The results are as follows:
For comparison, the original page looks like this:
Note: reviews with multiple paragraphs are stored with the paragraphs separated by @@, as shown below:
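When these review files are read back for the sentiment analysis mentioned at the beginning, the paragraphs can be recovered by splitting on that separator. A minimal sketch; the file name simply follows the "<course>_review.csv" naming scheme used by chucuo() above:

```python
# Reads one review file back and restores the paragraph structure of each review.
import csv

with open("r-programming_review.csv", encoding='utf-8') as f:
    reader = csv.reader(f)
    next(reader)                                # skip the header row written by chucuo()
    for star, author, date, reviewtext, like_num in reader:
        paragraphs = reviewtext.split('@@')     # '@@' joins multi-paragraph reviews
        print(author, star, len(paragraphs), "paragraph(s)")
```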