android Jsoup获取网站内容 android获取新闻标题实例

程序员文章站 2022-06-14 09:33:55

近期做简单的新闻客户端界面使用到了jsoup获取，使用起来特别方便，这也是被我一个学长称为学android网络必学的一个东西，在此也是分享一下自己近期所学。首先还是给出...

近期在做一个简单的新闻客户端界面时，用到了 Jsoup 来获取网页内容，使用起来特别方便。它也被我的一位学长称为学习 Android 网络编程必学的工具，在此分享一下自己近期所学。

首先还是给出效果：

上半部分用 TextView 显示从网站上获取到的全部内容（标题、时间和链接），下半部分用 ListView 只显示获取到的新闻标题，这样对照着看比较便于理解。

mainactivity：

import android.app.activity; 
import android.content.intent; 
import android.net.uri; 
import android.os.asynctask; 
import android.os.bundle; 
import android.text.method.scrollingmovementmethod; 
import android.util.log; 
import android.view.menu; 
import android.view.view; 
import android.widget.adapterview; 
import android.widget.adapterview.onitemclicklistener; 
import android.widget.arrayadapter; 
import android.widget.listview; 
import android.widget.textview; 
 
import org.apache.http.httpresponse; 
import org.apache.http.client.httpclient; 
import org.apache.http.client.methods.httppost; 
import org.apache.http.impl.client.defaulthttpclient; 
import org.apache.http.util.entityutils; 
import org.jsoup.jsoup; 
import org.jsoup.nodes.document; 
import org.jsoup.nodes.element; 
import org.jsoup.select.elements; 
 
import java.util.arraylist; 
import java.util.list; 
 
/**
 * Demo activity that downloads a news-list page, extracts the entries with jsoup and
 * displays them.
 *
 * <p>Flow: {@code oncreate} starts a {@code connecttask}; the task calls
 * {@link #connecteol()} in the background to fetch and trim the page markup, then
 * {@link #getnews(string)} turns that markup into {@code news} objects which are shown
 * both in {@code tv_htmlcode} (title, time and URL of every entry) and in
 * {@code lv_result} (titles only).
 *
 * <p>NOTE(review): every identifier in this listing is lower-case ({@code string},
 * {@code activity}, {@code oncreate}, ...). This looks like case-folding introduced when
 * the article was published; Java is case-sensitive, so confirm the spelling against the
 * original project before compiling.
 */
@suppresswarnings("unused")
public class mainactivity extends activity {
  private textview tv_htmlcode;
  // This text view only dumps what is stored in the news list, to make the
  // result easier to inspect and understand.

  private string url_eol = "http://www.cnwust.com/newslist/1_1",
      tag = "atag";
  // url_eol is the address of the page whose content is fetched; tag is the log tag.

  private list<news> newslist;
  // Custom news class holding the title, the time and the URL opened on click
  // for each scraped entry.

  private listview lv_result;
  private arrayadapter<string> lv_adapter;

  /** Looks up the two views, makes the text view scrollable and starts the download task. */
  @override
  protected void oncreate(bundle savedinstancestate) {
    super.oncreate(savedinstancestate);
    setcontentview(r.layout.activity_main);
    lv_result = (listview) findviewbyid(r.id.lv_result);
    tv_htmlcode = (textview) findviewbyid(r.id.tv_htmlcode);
    tv_htmlcode.setmovementmethod(scrollingmovementmethod.getinstance());

    connecttask c1 = new connecttask();
    c1.execute();

  }

  /** Inflates {@code r.menu.menu_main} into the options menu. */
  @override
  public boolean oncreateoptionsmenu(menu menu) {
    getmenuinflater().inflate(r.menu.menu_main, menu);
    return true;
  }

  /**
   * Background task: fetches the trimmed page markup via {@link #connecteol()} and, once
   * done, parses it and fills the text view and the list view.
   */
  public class connecttask extends asynctask<void, void, string> {

    /** Returns whatever {@link #connecteol()} returns. */
    @override
    protected string doinbackground(void... params) {
      string result = connecteol();
      return result;
    }

    /**
     * Parses {@code result} with {@link #getnews(string)}, stores the entries in the
     * enclosing activity's {@code newslist} field and binds them to the two views.
     */
    @override
    protected void onpostexecute(string result) {
      // tv_htmlcode.settext(result);
      newslist = getnews(result);
      list<string> newstitles = new arraylist<string>();
      for (news news : newslist) {
        tv_htmlcode.append(news.getnewstitle() + "\n");
        tv_htmlcode.append(news.getnewstime() + "\n");
        tv_htmlcode.append(news.getnewsurl() + "\n");
        newstitles.add(news.getnewstitle());
      }
    /* Attach an adapter with the titles to the list view */

      lv_adapter = new arrayadapter<string>(mainactivity.this,
          android.r.layout.simple_list_item_1, newstitles);
      lv_result.setadapter(lv_adapter);

    /* Make a click on a list row open the corresponding web page */
      lv_result.setonitemclicklistener(new onitemclicklistener() {

        @override
        public void onitemclick(adapterview<?> arg0, view arg1,
                    int arg2, long arg3) {
          // arg2 is used as the index into newslist, which was filled in the same
          // order as the adapter's titles.
          final uri uri = uri.parse(newslist.get(arg2).getnewsurl());
          final intent it = new intent(intent.action_view, uri);
          startactivity(it);
        }

      });
      // For simplicity a click just hands the URL to an action_view intent (the
      // author's stated intent: open it in the device's default browser).

      super.onpostexecute(result);


    }

  }

  /**
   * Connects to {@code url_eol} and returns a trimmed slice of the page source.
   *
   * <p>The response body is read as utf-8, cut down to the text between
   * {@code <div id="result">} and {@code <div id="pager">}, and then to the content
   * between the first {@code <ul>} and {@code </ul>} inside that slice.
   *
   * <p>NOTE(review): the page is requested with an {@code httppost}, not a GET — confirm
   * this is what the server expects.
   *
   * <p>NOTE(review): the {@code indexof} results are not checked before being passed to
   * {@code substring}; presumably a missing marker yields -1 and the resulting exception
   * lands in the catch block below — confirm.
   *
   * @return the markup between the list tags, or an empty string when any exception is
   *     thrown (the exception is only logged)
   */
  public string connecteol() {
    string result = "";
    try {
      httpclient httpclient = new defaulthttpclient();
      httppost httppost = new httppost(url_eol);
      httpresponse response = httpclient.execute(httppost);
      string res = entityutils.tostring(response.getentity(), "utf-8");

      int st = res.indexof("<div id=\"result\">");
      int ed = res.indexof("<div id=\"pager\">");
      // Key step: only the markup between these two markers is kept.

      string content = res.substring(st, ed);
      // + 4 skips over the "<ul>" tag itself.
      st = content.indexof("<ul>") + 4;
      ed = content.indexof("</ul>");
      content = content.substring(st, ed);
      result = content;
    } catch (exception e) {
      // Any failure is logged at debug level and the empty default is returned.
      log.d(tag, e.tostring());
    }
    return result;
  }

  /**
   * Parses the trimmed markup and returns one {@code news} object per {@code li} element.
   *
   * <p>For each {@code li}: the time is the text of its {@code span} elements, the title
   * is the text of its {@code a} elements, and the URL is the {@code href} attribute
   * obtained from its {@code a} elements.
   *
   * <p>NOTE(review): {@code replace("/news", ...)} is not anchored to the start of the
   * href; presumably it rewrites every occurrence of "/news" in the value — confirm the
   * site's hrefs contain it only once, as a leading path segment.
   *
   * @param htmlcode markup to parse, as produced by {@link #connecteol()}
   * @return a new list with the parsed entries, in document order of the {@code li} loop
   */
  public list<news> getnews(string htmlcode) {
    list<news> newslist = new arraylist<news>();
    document doc = jsoup.parse(htmlcode);
    log.d(tag, "解析html中");
    elements lis = doc.getelementsbytag("li");
    log.d(tag, "lis的size " + lis.size());
    for (element li : lis) {
      string newstime = li.getelementsbytag("span").text();
      string newstitle = li.getelementsbytag("a").text();
      string newsurl = li.getelementsbytag("a").attr("href");
      // These three lookups are the core of pulling content out of HTML with jsoup.

      newsurl = newsurl.replace("/news", "http://www.cnwust.com/news");
      // Per the original author, the href taken from the HTML is a relative path;
      // replace is used here to turn it into an absolute URL.

      log.d(tag, newstime);
      log.d(tag, newstitle);
      log.d(tag, newsurl);

      news newst = new news();
      newst.setnewstime(newstime);
      newst.setnewstitle(newstitle);
      newst.setnewsurl(newsurl);
      newslist.add(newst);
    }
    return newslist;
  }
}

news：

/**
 * Plain data holder for one scraped news entry: its title, its publication time text
 * and the URL to open for it.
 *
 * <p>All three values are stored as given; no validation or normalisation is applied,
 * and each stays {@code null} until set.
 *
 * <p>NOTE(review): the class and accessor names are kept exactly as this listing spells
 * them so existing callers keep resolving. They look case-folded from the usual Java
 * spelling ({@code News}, {@code getNewsTitle}, ...) — confirm against the original
 * project. Only the {@code string} type name, which does not exist in Java, has been
 * corrected to {@code String}.
 */
public class news {
  private String newstime;
  private String newsurl;
  private String newstitle;

  /** Creates an entry with all three values unset ({@code null}). */
  public news() {

  }

  /**
   * Creates a fully populated entry.
   *
   * @param newstitle headline text of the entry
   * @param newstime publication time text of the entry
   * @param newsurl URL of the entry
   */
  public news(String newstitle, String newstime, String newsurl) {
    this.newstime = newstime;
    this.newsurl = newsurl;
    this.newstitle = newstitle;
  }

  /** @return the publication time text, or {@code null} if not set */
  public String getnewstime() {
    return newstime;
  }

  /** @param newstime publication time text to store */
  public void setnewstime(String newstime) {
    this.newstime = newstime;
  }

  /** @return the entry URL, or {@code null} if not set */
  public String getnewsurl() {
    return newsurl;
  }

  /** @param newsurl entry URL to store */
  public void setnewsurl(String newsurl) {
    this.newsurl = newsurl;
  }

  /** @return the headline text, or {@code null} if not set */
  public String getnewstitle() {
    return newstitle;
  }

  /** @param newstitle headline text to store */
  public void setnewstitle(String newstitle) {
    this.newstitle = newstitle;
  }

}

activity_main:

<!--
  Main screen layout: a scrollable TextView (tv_htmlcode) pinned to the top that shows
  the raw scraped entries, and a ListView (lv_result) pinned to the bottom that shows
  the news titles.

  Tag and attribute names use the framework's case-sensitive spelling (RelativeLayout,
  paddingBottom, layout_alignParentTop, ...); the all-lower-case forms are not resolved
  by the Android layout inflater.

  NOTE(review): tools:context is kept as written (".newslist"); it is a design-time hint
  only — confirm which activity class it should name.
-->
<RelativeLayout xmlns:android="http://schemas.android.com/apk/res/android"
  xmlns:tools="http://schemas.android.com/tools"
  android:layout_width="match_parent"
  android:layout_height="match_parent"
  android:paddingBottom="@dimen/activity_vertical_margin"
  android:paddingLeft="@dimen/activity_horizontal_margin"
  android:paddingRight="@dimen/activity_horizontal_margin"
  android:paddingTop="@dimen/activity_vertical_margin"
  tools:context=".newslist" >

  <TextView
    android:id="@+id/tv_htmlcode"
    android:layout_width="match_parent"
    android:layout_height="150dp"
    android:layout_above="@+id/lv_result"
    android:layout_alignParentTop="true"
    android:layout_centerHorizontal="true"
    android:scrollbars="vertical" />

  <ListView
    android:id="@+id/lv_result"
    android:layout_width="match_parent"
    android:layout_height="230dp"
    android:layout_alignLeft="@+id/tv_htmlcode"
    android:layout_alignParentBottom="true" >
  </ListView>

</RelativeLayout>

部分新手可能还不太清楚这里是如何解析 HTML 代码的。建议使用 Chrome 浏览器，它可以直接查看网站的源码（部分做了加密或混淆处理的网站可能看不到完整源码）。下面看一下具体使用的截图：

1、首先先要打开到你要获取内容的网站

android Jsoup获取网站内容 android获取新闻标题实例