Java获取HTTP和HTTPS网址对应HTML数据的实例
之前在公司一直用C#做软件开发,近些天有同学需要用Java写一个从指定网址获取信息的程序。这个需求不算难,正好借此复习一下Java的知识。
要求如下:从 https://www.marinetraffic.com/en/ais/details/ships/shipid:650235/mmsi:414726000/vessel:yu%20ming 这个网址对应的页面中,提取下图方框中标记的数据。
程序如下:getwebposition类是主要程序类
package yinhang.wang;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.net.ssl.HttpsURLConnection;
import javax.net.ssl.SSLContext;
import javax.net.ssl.SSLSocketFactory;
import javax.net.ssl.TrustManager;
public class getwebposition {
/**
* @param args
*/
public static void main(string[] args) {
// todo auto-generated method stub
string info = getdatabytwo();
system.out.println(info);
}
// 从指定的url中获取数据
//https://www.marinetraffic.com/en/ais/details/ships/shipid:650235/mmsi:414726000/vessel:yu%20ming
private static string httprequest(string requesturl) {
stringbuffer buffer = null;
bufferedreader bufferedreader = null;
inputstreamreader inputstreamreader = null;
inputstream inputstream = null;
httpsurlconnection httpurlconn = null;
// 建立并向网页发送请求
try {
trustmanager[] tm = { new myx509trustmanager() };
sslcontext sslcontext = sslcontext.getinstance("ssl", "sunjsse");
sslcontext.init(null, tm, new java.security.securerandom());
// 从上述sslcontext对象中得到sslsocketfactory对象
sslsocketfactory ssf = sslcontext.getsocketfactory();
url url = new url(requesturl);
// 描述状态
httpurlconn = (httpsurlconnection) url.openconnection();
httpurlconn.setsslsocketfactory(ssf);
httpurlconn
.setrequestproperty("user-agent", "mozilla/5.0 (windows nt 6.1; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/61.0.3163.100 safari/537.36)");
//防止报403错误。
httpurlconn.setdooutput(true);
httpurlconn.setdoinput(true);
httpurlconn.setusecaches(false);
// 请求的类型
httpurlconn.setrequestmethod("get");
// 获取输入流
inputstream = httpurlconn.getinputstream();
inputstreamreader = new inputstreamreader(inputstream, "utf-8");
bufferedreader = new bufferedreader(inputstreamreader);
// 从输入流读取结果
buffer = new stringbuffer();
string str = null;
while ((str = bufferedreader.readline()) != null) {
buffer.append(str);
}
} catch (exception e) {
e.printstacktrace();
} finally {
// 释放资源
if (bufferedreader != null) {
try {
bufferedreader.close();
} catch (ioexception e) {
e.printstacktrace();
}
}
if (inputstreamreader != null) {
try {
inputstreamreader.close();
} catch (ioexception e) {
e.printstacktrace();
}
}
if (inputstream != null) {
try {
inputstream.close();
} catch (ioexception e) {
e.printstacktrace();
}
}
if (httpurlconn != null) {
httpurlconn.disconnect();
}
}
return buffer.tostring();
}
private static string htmlfiter(string html) {
stringbuffer buffer = new stringbuffer();
string str1 = "";
string str2 = "";
//取出所用的范围,
//pattern p = pattern.compile("(.*)(<div class=\"panel panel-primary no-border vertical-offset-20\">)(.*)(</div>)(.*)");
pattern p = pattern.compile("(.*)(</script>)(.*)(<div class=\"wind_icon wind_low\")(.*)");
matcher m = p.matcher(html);
if (m.matches()) {
str1 = m.group(3);
//取得时间:vessel's local time:
p = pattern.compile("(.*)(time datetime=\")(.*)(\">)(.*)(</time>)(.*)(</span></strong>)(.*)");
m = p.matcher(str1);
if (m.matches()) {
str2 = m.group(5);
string str3 = m.group(7);
buffer.append("\nvessel's local time: ");
buffer.append(str2);
buffer.append(str3);
}
// <a href="/en/ais/home/centerx:120.3903/centery:32.02979/zoom:10/mmsi:414726000/shipid:650235"
// class="details_data_link">32.02979° / 120.3903°</a>
//取得当前经纬度:latitude / longitude:
p = pattern.compile("(.*)(class=\"details_data_link\">)(.*)(</a></strong></span>)(.*)");
m = p.matcher(str1);
if (m.matches()) {
str2 = m.group(3);
buffer.append("\nlatitude / longitude: ");
buffer.append(str2);
}
//取得当前速度航线speed/course:
p = pattern.compile("(.*)(<span><strong>)(.*)(</strong></span>)(.*)");
m = p.matcher(str1);
if (m.matches()) {
str2 = m.group(3);
buffer.append("\nspeed/course: ");
buffer.append(str2);
}
}
return buffer.tostring();
}
//封裝上述两个方法
public static string getdatabytwo(){
//调用第一个方法,获得html字符串
string html = httprequest("https://www.marinetraffic.com/en/ais/details/ships/shipid:650235/mmsi:414726000/vessel:yu%20ming");
//调用第二个方法,过滤掉无用的信息
string result = htmlfiter(html);
return result;
}
}
myx509trustmanager这个类是一个信任所有证书的X509TrustManager实现,作用是在不校验服务器证书的情况下访问HTTPS类型的网站(注意:这样做相当于关闭了证书校验,只适合学习和测试使用)。
package yinhang.wang;
import java.security.cert.certificateexception;
import java.security.cert.x509certificate;
import javax.net.ssl.x509trustmanager;
/**
 * An {@code X509TrustManager} whose check methods are empty, so every client and
 * server certificate chain is accepted without any validation.
 *
 * <p>WARNING: installing this trust manager disables certificate verification for the
 * connection that uses it. It is only suitable for experiments and tests.
 *
 * <p>JDK types are fully qualified so the class does not depend on the file's import lines.
 */
public class myx509trustmanager implements javax.net.ssl.X509TrustManager {

    /**
     * Accepts any client certificate chain.
     *
     * @param chain    the peer certificate chain (not inspected)
     * @param authType the authentication type (not inspected)
     * @throws java.security.cert.CertificateException never thrown by this implementation
     */
    @Override
    public void checkClientTrusted(java.security.cert.X509Certificate[] chain, String authType)
            throws java.security.cert.CertificateException {
        // Intentionally empty: no validation is performed.
    }

    /**
     * Accepts any server certificate chain.
     *
     * @param chain    the peer certificate chain (not inspected)
     * @param authType the key exchange algorithm (not inspected)
     * @throws java.security.cert.CertificateException never thrown by this implementation
     */
    @Override
    public void checkServerTrusted(java.security.cert.X509Certificate[] chain, String authType)
            throws java.security.cert.CertificateException {
        // Intentionally empty: no validation is performed.
    }

    /**
     * Returns the accepted issuers.
     *
     * @return an empty array; the interface contract requires a non-null result
     */
    @Override
    public java.security.cert.X509Certificate[] getAcceptedIssuers() {
        return new java.security.cert.X509Certificate[0];
    }
}
希望对刚开始学习正则表达式和网页数据抓取的小伙伴们有所帮助。
上一篇: IT非常道:云计算与关键业务选型