爬取东方财富的利润表数据记录
程序员文章站
2022-06-16 08:09:54
...
想看看各家上市公司的研发费用,所以去东方财富里面抓取数据。
1抓取准备工作
1首先找到研发费用的数据的key名字
2找去找到是如何访问得到这些数据
查看源代码
得到研发费用的名字是RDEXP,再去看是访问了什么url获取了资源的
从名字看就知道是通过ajax获取数据的,所以我们进去看ajax返回数据有没有复合要求的,找到了
http://f10.eastmoney.com/NewFinanceAnalysis/lrbAjax?companyType=4&reportDateType=0&reportType=1&endDate=&code=SH603893
从参数看code就是上市公司的代码,其他的参数,没关注。
返回的数据是一个json字符串,大家自己发掘。
从哪里获取上市公司的code呢?
同花顺有数据导出excel功能,可以把代码、市盈率、市值等指标导出,这样我们就有了所有的代码了。
2抓取数据代码
//由于同花顺导出来的xls,一直解析不了,不知道哪里有问题,所以我自己重新创建了一个xls,把同花顺的内容拷贝过去
package com.xreport2_demo;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.collections4.Put;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import org.apache.poi.ss.usermodel.BorderStyle;
import org.apache.poi.ss.usermodel.Cell;
import org.apache.poi.ss.usermodel.CellStyle;
import org.apache.poi.ss.usermodel.DateUtil;
import org.apache.poi.ss.usermodel.FillPatternType;
import org.apache.poi.ss.usermodel.Font;
import org.apache.poi.ss.usermodel.HorizontalAlignment;
import org.apache.poi.ss.usermodel.IndexedColors;
import org.apache.poi.ss.usermodel.Row;
import org.apache.poi.ss.usermodel.Sheet;
import org.apache.poi.ss.usermodel.Workbook;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.matrix.ssm.service.impl.UserServiceImpl;
//http://f10.eastmoney.com/NewFinanceAnalysis/lrbAjax?companyType=4&reportDateType=0&reportType=1&endDate=&code=SH603893
public class HttpClientDemo {
private static Logger logger = Logger.getLogger(HttpClientDemo.class);
private static String columns[] = {"空白","代码", "名称","所属行业","TTM市盈率","净利润","市净率","总市值","流通市值"};
private static String columnsNew[] = {"空白 ","代码", "名称","所属行业","TTM市盈率","净利润","市净率","总市值","流通市值","研发费用","研发日期","研发费用与总市值比"};
// private static String columns[] = {"代码"};
public static void main(String[] args) throws Exception {
//由于同花顺导出来的xls,一直解析不了,不知道哪里有问题,所以我自己重新创建了一个xls,把同花顺的内容拷贝过去
//解析出excel中的数据
List<Map<String, String>> list = readExcel("D://data//1.xls", columns);
// int i=0;
for(Map map : list){
// if(i>=10){
// break;
// }
// i++;
//怕东方财富一下给我拉黑
Thread.sleep((int)Math.random()*1000);
//获取研发费用与研发日期,因为年报有的公司没有出
getCostResarch(map);
}
//将list的值写入到新的excel
exportExcel(list,"D://data//研发费用.xls");
}
/**
* 传入单元格中的
* @param map
* @return
*/
public static void getCostResarch(Map<String,String> map){
//获取url
String companyCode = map.get("代码");
String surl = "http://f10.eastmoney.com/NewFinanceAnalysis/lrbAjax?companyType=4&reportDateType=0&reportType=1&endDate=&code="+companyCode;
BufferedReader br = null;
try {
URL url = new URL(surl);
//下载数据
InputStream is = url.openStream();
br = new BufferedReader(new InputStreamReader(is,"utf-8"));
String msg = null;
while (null != (msg = br.readLine())){
String tmp = StringEscapeUtils.unescapeJavaScript(msg);
tmp = tmp.substring(1,tmp.length()-1);
logger.info(msg);
List<Map<String,String>> listObjectFir = (List<Map<String,String>>) JSONArray.parse(tmp);
for(Map<String,String> data : listObjectFir){
if(("2019/12/31 0:00:00").equals(data.get("REPORTDATE"))){
map.put("RDEXP", data.get("RDEXP"));
map.put("REPORTDATE", data.get("REPORTDATE"));
}
//3月份的数据在前面,用年报数据覆盖3月份数据
if(("2020/3/31 0:00:00").equals(data.get("REPORTDATE"))){
map.put("RDEXP", data.get("RDEXP"));
map.put("REPORTDATE", data.get("REPORTDATE"));
}
}
}
} catch (MalformedURLException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (UnsupportedEncodingException e) {
// TODO Auto-generated catch block
e.printStackTrace();
} catch (IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}finally {
if(br!=null){
try {
br.close();
} catch (IOException e) {
e.printStackTrace();
}
}
}
}
/**
* 读取excel
* @param filePath
* @param columns
* @return
*/
public static List<Map<String, String>> readExcel(String filePath,String columns[]) {
Sheet sheet = null;
Row row = null;
Row rowHeader = null;
List<Map<String, String>> list = null;
String cellData = null;
Workbook wb = null;
if (filePath == null) {
return null;
}
String extString = filePath.substring(filePath.lastIndexOf("."));
InputStream is = null;
try {
is = new FileInputStream(filePath);
if (".xls".equals(extString)) {
wb = new HSSFWorkbook(is);
} else if (".xlsx".equals(extString)) {
wb = new XSSFWorkbook(is);
} else {
wb = null;
}
if (wb != null) {
// 用来存放表中数据
list = new ArrayList<Map<String, String>>();
// 获取第一个sheet
sheet = wb.getSheetAt(0);
// 获取最大行数
int rownum = sheet.getPhysicalNumberOfRows();
// 获取第一行
rowHeader = sheet.getRow(0);
row = sheet.getRow(0);
// 获取最大列数
int colnum = row.getPhysicalNumberOfCells();
for (int i = 1; i < rownum; i++) {
Map<String, String> map = new LinkedHashMap<String, String>();
row = sheet.getRow(i);
if (row != null) {
for (int j = 0; j < colnum; j++) {
if(columns[j].equals(getCellFormatValue(rowHeader.getCell(j)))){
cellData = (String) getCellFormatValue(row
.getCell(j));
map.put(columns[j], cellData);
}
}
} else {
break;
}
list.add(map);
}
}
} catch (FileNotFoundException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return list;
}
/** 获取单个单元格数据
* @param cell
* @return
* @author lizixiang ,2018-05-08
*/
private static String getCellFormatValue(Cell cell) {
if(cell==null){
return null;
}
String returnValue = null;
switch (cell.getCellType()) {
case NUMERIC: //数字
Double doubleValue = cell.getNumericCellValue();
// 格式化科学计数法,取一位整数
DecimalFormat df = new DecimalFormat("0");
returnValue = df.format(doubleValue);
break;
case STRING: //字符串
returnValue = cell.getStringCellValue();
break;
case BOOLEAN: //布尔
Boolean booleanValue = cell.getBooleanCellValue();
returnValue = booleanValue.toString();
break;
case BLANK: // 空值
break;
case FORMULA: // 公式
returnValue = cell.getCellFormula();
break;
case ERROR: // 故障
break;
default:
break;
}
return returnValue;
}
// public static Object getCellFormatValue(Cell cell) {
// Object cellValue = null;
// if (cell != null) {
// // 判断cell类型
// switch (cell.getCellType()) {
// case Cell.CELL_TYPE_NUMERIC: {
// cellValue = String.valueOf(cell.getNumericCellValue());
// break;
// }
// case Cell.CELL_TYPE_FORMULA: {
// // 判断cell是否为日期格式
// if (DateUtil.isCellDateFormatted(cell)) {
// // 转换为日期格式YYYY-mm-dd
// cellValue = cell.getDateCellValue();
// } else {
// // 数字
// cellValue = String.valueOf(cell.getNumericCellValue());
// }
// break;
// }
// case Cell.CELL_TYPE_STRING: {
// cellValue = cell.getRichStringCellValue().getString();
// break;
// }
// default:
// cellValue = "";
// }
// } else {
// cellValue = "";
// }
// return cellValue;
// }
/**
* 到处数据
* @param list
*/
public static void exportExcel(List<Map<String, String>> list, String exportFilePath){
// 写入数据到工作簿对象内
Workbook workbook = exportData(list);
// 以文件的形式输出工作簿对象
FileOutputStream fileOut = null;
try {
File exportFile = new File(exportFilePath);
if (!exportFile.exists()) {
exportFile.createNewFile();
}
fileOut = new FileOutputStream(exportFilePath);
workbook.write(fileOut);
fileOut.flush();
} catch (Exception e) {
e.printStackTrace();
} finally {
try {
if (null != fileOut) {
fileOut.close();
}
if (null != workbook) {
workbook.close();
}
} catch (IOException e) {
e.printStackTrace();
}
}
}
/**
* 生成Excel并写入数据信息
*
* @param dataList
* 数据列表
* @return 写入数据后的工作簿对象
*/
public static Workbook exportData(List<Map<String, String>> dataList) {
// 生成xlsx的Excel
// Workbook workbook = new SXSSFWorkbook();
// 如需生成xls的Excel,请使用下面的工作簿对象,注意后续输出时文件后缀名也需更改为xls
Workbook workbook = new HSSFWorkbook();
// 生成Sheet表,写入第一行的列头
Sheet sheet = buildDataSheet(workbook);
// 构建每行的数据内容
int rowNum = 1;
for (Map data :dataList) {
if (data == null) {
continue;
}
// 输出行数据
Row row = sheet.createRow(rowNum++);
convertDataToRow(data, row);
}
return workbook;
}
/**
* 生成sheet表,并写入第一行数据(列头)
*
* @param workbook
* 工作簿对象
* @return 已经写入列头的Sheet
*/
private static Sheet buildDataSheet(Workbook workbook) {
Sheet sheet = workbook.createSheet();
// 设置列头宽度
for (int i = 0; i < columnsNew.length; i++) {
sheet.setColumnWidth(i, 4000);
}
// 设置默认行高
sheet.setDefaultRowHeight((short) 400);
// 构建头单元格样式
CellStyle cellStyle = buildHeadCellStyle(sheet.getWorkbook());
// 写入第一行各列的数据
Row head = sheet.createRow(0);
for (int i = 0; i < columnsNew.length; i++) {
Cell cell = head.createCell(i);
cell.setCellValue(columnsNew[i]);
cell.setCellStyle(cellStyle);
}
return sheet;
}
/**
* 设置第一行列头的样式
*
* @param workbook
* 工作簿对象
* @return 单元格样式对象
*/
private static CellStyle buildHeadCellStyle(Workbook workbook) {
CellStyle style = workbook.createCellStyle();
// 对齐方式设置
style.setAlignment(HorizontalAlignment.CENTER);
// 边框颜色和宽度设置
style.setBorderBottom(BorderStyle.THIN);
style.setBottomBorderColor(IndexedColors.BLACK.getIndex()); // 下边框
style.setBorderLeft(BorderStyle.THIN);
style.setLeftBorderColor(IndexedColors.BLACK.getIndex()); // 左边框
style.setBorderRight(BorderStyle.THIN);
style.setRightBorderColor(IndexedColors.BLACK.getIndex()); // 右边框
style.setBorderTop(BorderStyle.THIN);
style.setTopBorderColor(IndexedColors.BLACK.getIndex()); // 上边框
// 设置背景颜色
style.setFillForegroundColor(IndexedColors.GREY_25_PERCENT.getIndex());
style.setFillPattern(FillPatternType.SOLID_FOREGROUND);
// 粗体字设置
Font font = workbook.createFont();
font.setBold(true);
style.setFont(font);
return style;
}
/**
* 将数据转换成行
*
* @param data
* 源数据
* @param row
* 行对象
* @return
*/
private static void convertDataToRow(Map<String, String> data, Row row) {
int cellNum = 0;
Cell cell;
// 空白
cell = row.createCell(cellNum++);
cell.setCellValue(null == data.get("空白") ? "" : data.get("空白"));
// 代码
cell = row.createCell(cellNum++);
cell.setCellValue(null == data.get("代码") ? "" : data.get("代码"));
// 代码
cell = row.createCell(cellNum++);
cell.setCellValue(null == data.get("名称") ? "" : data.get("名称"));
// 代码
cell = row.createCell(cellNum++);
cell.setCellValue(null == data.get("所属行业") ? "" : data.get("所属行业"));
// 代码
cell = row.createCell(cellNum++);
cell.setCellValue(null == data.get("TTM市盈率") ? "" : data.get("TTM市盈率"));
cell = row.createCell(cellNum++);
cell.setCellValue(null == data.get("市净率") ? "" : data.get("市净率"));
// 代码
cell = row.createCell(cellNum++);
cell.setCellValue(null == data.get("净利润") ? "" : data.get("净利润"));
// 代码
cell = row.createCell(cellNum++);
cell.setCellValue(null == data.get("总市值") ? "" : data.get("总市值"));
// 代码
cell = row.createCell(cellNum++);
cell.setCellValue(null == data.get("流通市值") ? "" : data.get("流通市值"));
// 代码
cell = row.createCell(cellNum++);
cell.setCellValue(null == data.get("RDEXP") ? "" : data.get("RDEXP"));
// 代码
cell = row.createCell(cellNum++);
cell.setCellValue(null == data.get("REPORTDATE") ? "" : data.get("REPORTDATE"));
// 代码
cell = row.createCell(cellNum++);
if(data.get("RDEXP")==null||"".equals(data.get("RDEXP"))){
cell.setCellValue("0");
}else{
try {
cell.setCellValue(Double.parseDouble(data.get("RDEXP"))/Double.parseDouble(data.get("总市值")));
} catch (Exception e) {
cell.setCellValue("出错");
}
}
}
}
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-collections4 -->
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-collections4</artifactId>
<version>4.4</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.poi/poi-ooxml -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml-schemas</artifactId>
<version>4.0.0</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi</artifactId>
<version>4.0.1</version>
</dependency>
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>4.0.1</version>
</dependency>
<dependency>
<groupId>org.apache.xmlbeans</groupId>
<artifactId>xmlbeans</artifactId>
<version>2.6.0</version>
</dependency>
<!-- <dependency> <groupId>com.bstek.ureport</groupId> <artifactId>ureport2-console</artifactId>
<version>2.2.9</version> </dependency> -->
<!-- https://mvnrepository.com/artifact/commons-codec/commons-codec -->
<dependency>
<groupId>commons-codec</groupId>
<artifactId>commons-codec</artifactId>
<version>1.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/cn.bestwu/common-lang -->
<!-- <dependency> <groupId>cn.bestwu</groupId> <artifactId>common-lang</artifactId>
<version>1.1.5</version> </dependency> -->
<!-- https://mvnrepository.com/artifact/commons-lang/commons-lang -->
<dependency>
<groupId>commons-lang</groupId>
<artifactId>commons-lang</artifactId>
<version>2.6</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 -->
<!-- <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-lang3</artifactId>
<version>3.10</version> </dependency> -->
<!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.61</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<!-- https://mvnrepository.com/artifact/commons-httpclient/commons-httpclient -->
<dependency>
<groupId>commons-httpclient</groupId>
<artifactId>commons-httpclient</artifactId>
<version>3.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/commons-logging/commons-logging -->
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging</artifactId>
<version>1.1.1</version>
</dependency>
结果
上一篇: CST—电磁及EMC仿真工具
下一篇: springboot:condition