使用Google接口实时翻译
2015-12-03 15:08
639 查看
在项目中,获取到的数据是英文的,如果想将其翻译成中文,可以直接调用Google翻译接口: https://translate.google.com.hk/translate_a/single?client=t&sl=en&tl=zh-CN&hl=zh-CN&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&dt=at&ie=UTF-8&oe=UTF-8&source=btn&ssel=0&tsel=0&kc=0&tk=470115|78768&q=%22world%22
当然,Google服务需要翻墙,所以要用到代理。代理的获取方式这里暂不细说(可以向代理商购买)。下面提供Java程序与配置文件供参考:
TranslateUtil.java :
package com.ttz.crawl.util;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.regex.Pattern;

import org.apache.log4j.Logger;

import com.ttz.crawl.common.Page;
import com.ttz.crawl.config.CrawlConfig;
import com.ttz.crawl.fetch.FetchRet;
import com.ttz.crawl.fetch.HttpClientFetch;
import com.ttz.crawl.proxy.ProxyPoolMan;

/**
 * Calls the Google Translate web endpoint and extracts the translated text
 * from the response.
 *
 * <p>The request URL is built from the {@code translateUrl} configuration
 * parameter, which is a template whose parts are separated by {@code *}.
 * The template must contain at least four parts:
 * {@code part0 * part1 * part2 * part3}. {@code part1} is URL-encoded and
 * inserted between {@code part0} and {@code part2}; the URL-encoded input
 * text is inserted between {@code part2} and {@code part3}.
 *
 * @author zhaoyuchun
 */
public class TranslateUtil {

    /** Number of {@code *}-separated parts {@link #translate(String)} reads from the template. */
    private static final int URL_TEMPLATE_PARTS = 4;

    /** Pattern handed to the fetcher together with the URL (presumably to validate the response body; confirm in HttpClientFetch). */
    public static Pattern cntValidPatt = null;

    /** Pattern handed to {@code PageExtractUtil.extractPageLabel} to pull the translated text out of the fetched page. */
    public static Pattern cntCHSPatt = null;

    /** Fetcher used to request the translation page. */
    public static HttpClientFetch fetcher = null;

    /** URL template read from the {@code translateUrl} configuration parameter. */
    public static String translateUrl = null;

    public static Logger log = Logger.getLogger(TranslateUtil.class);

    static {
        fetcher = new HttpClientFetch();
        cntValidPatt = Pattern.compile("\\[\\[\\[.*?\\]\\]\\]");
        cntCHSPatt = Pattern.compile("\\[\\[\\[\"“(.*?)”\",");
        translateUrl = CrawlConfig.getParam("translateUrl");
    }

    /**
     * Translates the given text by fetching the configured translation URL.
     *
     * <p>{@code ProxyPoolMan.enable} is set to {@code true} for the duration
     * of the call and set to {@code false} afterwards, on every exit path.
     *
     * @param str the text to translate
     * @return the text extracted from the response, or {@code null} when the
     *         input is {@code null}, the URL template is missing or malformed,
     *         the page cannot be fetched, or any exception occurs
     */
    public static String translate(String str) {
        ProxyPoolMan.enable = true;
        try {
            if (str == null) {
                log.error("can not translate a null text.");
                return null;
            }
            // The template is validated inside the try block so that a bad
            // configuration can never skip the finally clause and leave the
            // proxy flag switched on.
            if (translateUrl == null) {
                log.error("translateUrl is not configured.");
                return null;
            }
            // A negative limit keeps trailing empty parts, so a template that
            // ends with '*' still yields its (empty) last part.
            String[] parts = translateUrl.split("\\*", -1);
            if (parts.length < URL_TEMPLATE_PARTS) {
                log.error("translateUrl must contain at least " + URL_TEMPLATE_PARTS
                        + " parts separated by '*'. translateUrl:" + translateUrl);
                return null;
            }

            String url = parts[0]
                    + URLEncoder.encode(parts[1], "utf-8")
                    + parts[2]
                    + URLEncoder.encode(str, "utf-8")
                    + parts[3];

            FetchRet ret = fetcher.getPageRet(url, cntValidPatt, null, null);
            if (ret == null || ret.page == null) {
                log.error("can not get the translatePage. url:" + url);
                return null;
            }

            // Extract the translated string from the fetched page.
            Page page = ret.page;
            return PageExtractUtil.extractPageLabel(page, cntCHSPatt);
        } catch (Exception e) {
            // Pass the exception as the second argument so log4j records the
            // stack trace; log.error(e) alone would only log e.toString().
            log.error("translate failed. text:" + str, e);
            return null;
        } finally {
            // NOTE(review): the flag is reset to false unconditionally rather
            // than restored to its previous value; confirm this is intended
            // when the proxy pool is enabled elsewhere or translate() is
            // called from several threads.
            ProxyPoolMan.enable = false;
        }
    }

    /**
     * Manual smoke test: translates the word "proxy" and discards the result.
     */
    public static void main(String args[]) throws UnsupportedEncodingException {
        translate("proxy");
    }
}
StockTwits.properties:
#common
home_path = C:\\Users\\zhaoyuchun\\workspace\\91z_2014
db_conf = res/db.properties
log_conf = res/log4j.properties
tair_conf = res/tair.properties
redis_conf = res/redis.properties
enable_encode_to_utf8 = true
socket_port = 50010
site_id = 3
data_name = StockTwits
#以分割不同的url
base_url = http://stocktwits.com/streams/poll?stream=symbol&max=*&stream_id=*&substream=*&item_id=*http://stocktwits.com/symbol/http://stocktwits.com/
fetch_socket_timeout = 30000
fetch_connect_timeout = 10000
#unit: second
sleeptime = 1
#unit: minute
intervals = 60
#stop condition:0--crawl all; 1-- < page_size; 2-- begin_date <= pub_date < end_date; 3--when crawl crawled page; 4--only crawl uncrawled url, only compare url in dedup db
#if thread_no > 1, stop_type 可以有多个值,对应不同的thread,以;分割, e.g. 0;4
stop_type = 0
stock_type = 2
#set value when stop_type == 1
page_size = 1
#set value when stop_type == 2
#date format:yyyy-mm-dd
crawl_begin_date = 2012-02-20
crawl_end_date = 2012-02-21
#cookie setting
enable_cookie = false
#当内存中存储的失败url超过max_capacity之后,会写入文件
max_url_capacity = 1000
max_page_size = 5000
#regex
regex_stream_id = 'streamId' : '(\\d+)'\,
regex_url_id = max_id: (\\d+)\,.*?poll_id: '(\\d+)'\,.*?substream:\\s'(.*?)'\,
regex_list =<li data-src=.*?id.*?(\\d+).*?created_at":"(.*?\\d+.*?) -.*?user_path":"\\\\\\/(.*?)"\,"avatar_url":"(.*?)"[\\s\\S]*?body.*?;:\\\\\\?"([\\s\\S]*?);links[\\s\\S]*?total_likes":(\\d+?)\,(?:[^<]*?replies":(\\d+)\,)?
#proxy
enable_proxy = true
check_file = 10000000
proxy_file = data/proxy.xml
proxy_crawl_properties = res/proxy.properties
#the upper-bound pages to fetch from a host in a WATCH_INTERVAL
max_per_interval = 6000
#fetch density control interval, "ms" as unit
watch_interval = 60000
#about a proxy:in BLOCK_FORBIDEN_TIME ,BLOCKVALUE urls are not fetched,then the proxy is invalid for the host,delete the proxy
#and when the proxy is deleted,delete the current url;
block_forbidden_item = 120000
block_span = 3600000
#about a host:in FORBIDEN_PERIOD ,FORBIDEN_COUNT urls are not fetched,the host is invalid,then delete all the host urls in FORBIDEN_PERIOD
failure_max_count = 2
#forbidden_period = 1000000
failure_watch_interval = 1000000
failure_clear_span = 3600000
proxy_pool_min_size = 500
pool_update_span = 3600000
block_span = 3600000
#about a host:in FORBIDEN_PERIOD ,FORBIDEN_COUNT urls are not fetched,the host is invalid,then delete all the host urls in FORBIDEN_PERIOD
#forbidden_count = 10
failure_value = 2
#forbidden_period = 1000000
failure_watch_interval = 1000000
failure_clear_span = 3600000
proxy_pool_minsize = 10
pool_update_span = 3600000
#dedup
enable_dedup = true
#writer
enable_write = true
save_page_path = ${home_path}/data/page
max_frequent = 1800000
max_per_queue = 5000
min_page_size = 10
#tair setting
space_name = 1000
redis_key = EXTRACT0000
#thread setting
thread_numbers=1
thread_no=1
#host setting
machine_id=1
proxy_url = http://www.yun-daili.com/api.asp?key=20151109200040011100200069397641&getnum=200&area=2&proxytype=1
translateUrl = https://translate.google.com.hk/translate_a/single?client=t&sl=en&tl=zh-CN&hl=zh-CN&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&dt=at&ie=UTF-8&oe=UTF-8&source=btn&ssel=0&tsel=0&kc=0&tk=470115*|*78768&q=%22*%22
相关文章推荐
- java对世界各个时区(TimeZone)的通用转换处理方法(转载)
- java-注解annotation
- java-模拟tomcat服务器
- java-用HttpURLConnection发送Http请求.
- java-WEB中的监听器Lisener
- Android IPC进程间通讯机制
- Android Native 绘图方法
- Android java 与 javascript互访(相互调用)的方法例子
- 介绍一款信息管理系统的开源框架---jeecg
- 聚类算法之kmeans算法java版本
- java实现 PageRank算法
- PropertyChangeListener简单理解
- 插入排序
- 冒泡排序
- 堆排序
- 快速排序
- 二叉查找树
- [原创]java局域网聊天系统