网页数据抓取之新浪新闻数据
2017-01-22 16:27
176 查看
这里根据自己的需求抓取了部分数据，可以根据自己的需求做相应改动：
package com.atman.baiye.store.utils;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import javapns.json.JSONArray;
import javapns.json.JSONException;
import javapns.json.JSONObject;
import com.atman.baiye.store.domain.AiCommonInfo;
/**
* remark
* dwzhou@atman.com
* 2017年1月19日上午9:57:14
*/
/**
 * Utility for fetching Sina (新浪) news search results for a keyword and
 * mapping each result entry into an {@link AiCommonInfo} record.
 *
 * dwzhou@atman.com
 * 2017年1月19日上午9:57:14
 */
public class GoodsXinLangNewsUtils {

    /** Sina news search API endpoint; the URL-encoded keyword is appended as the {@code q} parameter. */
    public static final String XINLANG_URL = "http://api.search.sina.com.cn/?c=news&t=&q=";

    /**
     * Queries the Sina news search API for the given keyword.
     *
     * @param keyword search keyword; URL-encoded as UTF-8 to match the {@code ie=utf-8} query parameter
     * @param charset charset used by {@code WebHttpClient} to decode the HTTP response body
     * @return the {@code result.list} JSON array from the API response, or an
     *         empty array if the request or JSON parsing fails
     */
    public static JSONArray getArrayData(String keyword, String charset) {
        JSONArray jsonarray = new JSONArray();
        try {
            // Encode with an explicit charset: the single-argument
            // URLEncoder.encode is deprecated and silently uses the platform
            // default encoding, which breaks non-ASCII keywords on some JVMs.
            String url = XINLANG_URL + URLEncoder.encode(keyword, "UTF-8")
                    + "&sort=rel&highlight=1&num=10&ie=utf-8";
            System.out.println("url:" + url);
            String data = WebHttpClient.getBebContentByURL(url, "", false, charset);
            // Response shape: { "result": { "list": [ ... ] } }
            JSONObject jsondata = new JSONObject(data);
            System.out.println(jsondata.get("result").toString());
            jsondata = new JSONObject(jsondata.get("result").toString());
            jsonarray = new JSONArray(jsondata.get("list").toString());
            System.out.println("找到相关新闻" + jsonarray.length() + "篇");
        } catch (JSONException | java.io.UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return jsonarray;
    }

    /**
     * Converts raw search-result JSON entries into {@link AiCommonInfo} objects.
     *
     * @param array   JSON array returned by {@link #getArrayData(String, String)}
     * @param keyword the search keyword, stored on every record
     * @return populated records; empty (never null) if parsing fails
     */
    public static List<AiCommonInfo> getGoodsInfoList(JSONArray array, String keyword) {
        List<AiCommonInfo> list = new ArrayList<AiCommonInfo>();
        try {
            for (int i = 0; i < array.length(); i++) {
                AiCommonInfo aiCommonInfo = new AiCommonInfo();
                aiCommonInfo.setType(1009);
                aiCommonInfo.setKeyword(keyword);
                aiCommonInfo.setSource(4);
                // getJSONObject(i) already yields a JSONObject; no need to
                // round-trip it through toString()/new JSONObject(...).
                JSONObject json = array.getJSONObject(i);
                String title = json.get("origin_title").toString();
                System.out.println("title:" + title);
                String detail_url = json.get("url").toString();
                System.out.println("detail_url:" + detail_url);
                // "imgurl" is optional in the API response; default to "".
                String pic_url = json.has("imgurl") ? json.get("imgurl").toString() : "";
                System.out.println("pic_url:" + pic_url);
                String store_name = json.get("media").toString();
                System.out.println("store_name:" + store_name);
                String intro = json.get("intro").toString();
                System.out.println("intro:" + intro);
                String datetime = json.get("datetime").toString();
                System.out.println("datetime:" + datetime);
                // NOTE(review): intro and datetime are extracted and logged but
                // never stored on aiCommonInfo — confirm whether AiCommonInfo
                // has setters for them and whether they should be persisted.
                aiCommonInfo.setDetailUrl(detail_url);
                aiCommonInfo.setPicUrl(pic_url);
                aiCommonInfo.setStoreName(store_name);
                aiCommonInfo.setTitle(title);
                list.add(aiCommonInfo);
            }
        } catch (JSONException e) {
            e.printStackTrace();
        }
        return list;
    }

    /** Ad-hoc smoke test: fetch and convert results for a sample keyword. */
    public static void main(String[] args) {
        JSONArray data = getArrayData("新闻联播迎新主播", "utf-8");
        getGoodsInfoList(data, "新闻联播迎新主播");
    }
}
package com.atman.baiye.store.utils;
import java.net.URLDecoder;
import java.net.URLEncoder;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.List;
import javapns.json.JSONArray;
import javapns.json.JSONException;
import javapns.json.JSONObject;
import com.atman.baiye.store.domain.AiCommonInfo;
/**
* remark
* dwzhou@atman.com
* 2017年1月19日上午9:57:14
*/
/**
 * Utility for fetching Sina (新浪) news search results for a keyword and
 * mapping each result entry into an {@link AiCommonInfo} record.
 *
 * dwzhou@atman.com
 * 2017年1月19日上午9:57:14
 */
public class GoodsXinLangNewsUtils {

    /** Sina news search API endpoint; the URL-encoded keyword is appended as the {@code q} parameter. */
    public static final String XINLANG_URL = "http://api.search.sina.com.cn/?c=news&t=&q=";

    /**
     * Queries the Sina news search API for the given keyword.
     *
     * @param keyword search keyword; URL-encoded as UTF-8 to match the {@code ie=utf-8} query parameter
     * @param charset charset used by {@code WebHttpClient} to decode the HTTP response body
     * @return the {@code result.list} JSON array from the API response, or an
     *         empty array if the request or JSON parsing fails
     */
    public static JSONArray getArrayData(String keyword, String charset) {
        JSONArray jsonarray = new JSONArray();
        try {
            // Encode with an explicit charset: the single-argument
            // URLEncoder.encode is deprecated and silently uses the platform
            // default encoding, which breaks non-ASCII keywords on some JVMs.
            String url = XINLANG_URL + URLEncoder.encode(keyword, "UTF-8")
                    + "&sort=rel&highlight=1&num=10&ie=utf-8";
            System.out.println("url:" + url);
            String data = WebHttpClient.getBebContentByURL(url, "", false, charset);
            // Response shape: { "result": { "list": [ ... ] } }
            JSONObject jsondata = new JSONObject(data);
            System.out.println(jsondata.get("result").toString());
            jsondata = new JSONObject(jsondata.get("result").toString());
            jsonarray = new JSONArray(jsondata.get("list").toString());
            System.out.println("找到相关新闻" + jsonarray.length() + "篇");
        } catch (JSONException | java.io.UnsupportedEncodingException e) {
            e.printStackTrace();
        }
        return jsonarray;
    }

    /**
     * Converts raw search-result JSON entries into {@link AiCommonInfo} objects.
     *
     * @param array   JSON array returned by {@link #getArrayData(String, String)}
     * @param keyword the search keyword, stored on every record
     * @return populated records; empty (never null) if parsing fails
     */
    public static List<AiCommonInfo> getGoodsInfoList(JSONArray array, String keyword) {
        List<AiCommonInfo> list = new ArrayList<AiCommonInfo>();
        try {
            for (int i = 0; i < array.length(); i++) {
                AiCommonInfo aiCommonInfo = new AiCommonInfo();
                aiCommonInfo.setType(1009);
                aiCommonInfo.setKeyword(keyword);
                aiCommonInfo.setSource(4);
                // getJSONObject(i) already yields a JSONObject; no need to
                // round-trip it through toString()/new JSONObject(...).
                JSONObject json = array.getJSONObject(i);
                String title = json.get("origin_title").toString();
                System.out.println("title:" + title);
                String detail_url = json.get("url").toString();
                System.out.println("detail_url:" + detail_url);
                // "imgurl" is optional in the API response; default to "".
                String pic_url = json.has("imgurl") ? json.get("imgurl").toString() : "";
                System.out.println("pic_url:" + pic_url);
                String store_name = json.get("media").toString();
                System.out.println("store_name:" + store_name);
                String intro = json.get("intro").toString();
                System.out.println("intro:" + intro);
                String datetime = json.get("datetime").toString();
                System.out.println("datetime:" + datetime);
                // NOTE(review): intro and datetime are extracted and logged but
                // never stored on aiCommonInfo — confirm whether AiCommonInfo
                // has setters for them and whether they should be persisted.
                aiCommonInfo.setDetailUrl(detail_url);
                aiCommonInfo.setPicUrl(pic_url);
                aiCommonInfo.setStoreName(store_name);
                aiCommonInfo.setTitle(title);
                list.add(aiCommonInfo);
            }
        } catch (JSONException e) {
            e.printStackTrace();
        }
        return list;
    }

    /** Ad-hoc smoke test: fetch and convert results for a sample keyword. */
    public static void main(String[] args) {
        JSONArray data = getArrayData("新闻联播迎新主播", "utf-8");
        getGoodsInfoList(data, "新闻联播迎新主播");
    }
}
相关文章推荐
- 利用BeautifulSoup抓取新浪网页新闻的内容
- Python爬虫:抓取新浪新闻数据
- Python简单抓取新浪某网页新闻链接及标题
- Jsoup抓取网页数据完成一个简易的Android新闻APP
- Jsoup抓取网页数据完成一个简易的Android新闻APP
- Python爬虫:新浪新闻详情页的数据抓取(函数版)
- 抓取Web网页数据分析(c#)
- 使用System.Text.RegularExpression中的API实现网页数据的抓取
- c#抓取浏览器网页代码数据(winform)
- 抓取Web网页数据分析
- 抓取Web网页数据分析
- 从网页抓取数据的一般方法
- 用XMLHTTP对象抓取网页源代码,拆分数据写入数据库
- .net2.0抓取Web网页数据分析
- XMLHTTP抓取新浪天气新闻
- 使用System.Text.RegularExpression中的API实现网页数据的抓取
- 使用System.Text.RegularExpression中的API实现网页数据的抓取
- asp.net的抓取网页数据源码
- 抓取Web网页数据分析
- 使用HtmlAgilityPack批量抓取网页数据