C++ 爬虫抓取大众点评数据
程序员文章站
2022-06-23 16:34:57
#include < curl / curl.h >
#include < iostream >
#include < stdio.h >...
#include < curl / curl.h > #include < iostream > #include < stdio.h > #include < string.h > #include < pcre.h > #define oveccount 30 /* should be a multiple of 3 */ #define ebuflen 128 #define buflen 10240 using namespacestd; size_t onwritedata(void * buffer, size_t size, size_t nmemb, void * str) { if (!str || !buffer) { return - 1; } string * result = (string * ) str; result - >append((char * ) buffer, size * nmemb); return nmemb; } //获取页面 int getweb(string url, string & result) { long code = 0; string htmlpage; curl * curl = curl_easy_init(); curl_easy_setopt(curl, curlopt_url, url.c_str()); //设置url curl_easy_setopt(curl, curlopt_post, 0); //设置请求方法 curl_easy_setopt(curl, curlopt_useragent, "mozilla/5."); //伪装客户端 curl_easy_setopt(curl, curlopt_writedata, &htmlpage); //设置接受返回结果字符串 curl_easy_setopt(curl, curlopt_writefunction, onwritedata); //设置处理方法 curl_easy_perform(curl); //请求 curl_easy_getinfo(curl, curlinfo_response_code, &code); if (code == 200) { cout << "request success" << endl; result = htmlpage; //cout<<htmlpage<<endl; } curl_easy_cleanup(curl); return code; } int main(int argc, char * *argv) { pcre * re; constchar * error; int erroffset; int ovector[oveccount]; int rc, i; string url = "https://www.dianping.com/search/category/212/10/g103"; string html; getweb(url, html); //char src[] = " "; //char pattern[] = "(<a>.+?</a>)"; constchar * src = html.c_str(); char pattern[] = "(<li class=\"\"[\\s\\s]*?</li>)"; printf("string : %s\n", src); printf("pattern: \"%s\"\n", pattern); re = pcre_compile(pattern, 0, &error, &erroffset, null); if (re == null) { printf("pcre compilation failed at offset %d: %s\n", erroffset, error); return1; } char * p = (char * ) src; while ((rc = pcre_exec(re, null, p, strlen(p), 0, 0, ovector, oveccount)) != pcre_error_nomatch) { printf("\nok, %d matched ...\n\n", rc); for (i = 0; i < rc - 1; i++) { char * substring_start = p + ovector[2 * i]; int substring_length = ovector[2 * i + 1] - ovector[2 * i]; char matched[10240]; 
memset(matched, 0, 10240); strncpy(matched, substring_start, substring_length); printf("match:%s\n", matched); } p += ovector[1]; if (!p) { break; } } pcre_free(re); return0; }
上一篇: 青海湖二郎剑景区好玩吗 151基地怎么样
下一篇: 上班族最容易瘦腹的食物