您的位置:首页 > 编程语言 > Go语言


2015-12-03 15:08 639 查看
 在项目中,如果获取到的数据是英文,想将其翻译为中文,可以直接调用 Google 翻译接口: https://translate.google.com.hk/translate_a/single?client=t&sl=en&tl=zh-CN&hl=zh-CN&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&dt=at&ie=UTF-8&oe=UTF-8&source=btn&ssel=0&tsel=0&kc=0&tk=470115|78768&q=%22world%22

TranslateUtil.java  :

package com.ttz.crawl.util;

import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.regex.Pattern;
import org.apache.log4j.Logger;
import com.ttz.crawl.common.Page;
import com.ttz.crawl.config.CrawlConfig;
import com.ttz.crawl.fetch.FetchRet;
import com.ttz.crawl.fetch.HttpClientFetch;
import com.ttz.crawl.proxy.ProxyPoolMan;

* 调用google 翻译接口
* @author zhaoyuchun
public class TranslateUtil
public static Pattern cntValidPatt = null;
public static Pattern cntCHSPatt = null;
public static HttpClientFetch fetcher = null;
public static String translateUrl = null;
public static Logger log = Logger.getLogger(TranslateUtil.class);

fetcher = new HttpClientFetch();
cntValidPatt = Pattern.compile("\\[\\[\\[.*?\\]\\]\\]");
cntCHSPatt = Pattern.compile("\\[\\[\\[\"“(.*?)”\",");
translateUrl = CrawlConfig.getParam("translateUrl");

public static String translate(String str)
ProxyPoolMan.enable = true;
String [] listUrlPattern = translateUrl.split("\\*");
String url = listUrlPattern[0]
+ URLEncoder.encode(listUrlPattern[1],"utf-8")
+ listUrlPattern[2]
+ URLEncoder.encode(str,"utf-8")
+ listUrlPattern[3];
FetchRet ret = fetcher.getPageRet(url, cntValidPatt, null, null);
if(ret == null || ret.page == null)
log.error("can not get the translatePage. url:" + url);
return null;
Page page = ret.page;
str = PageExtractUtil.extractPageLabel(page, cntCHSPatt);
catch (Exception e)
str = null;
ProxyPoolMan.enable = false;
return str;

public static void main(String args[]) throws UnsupportedEncodingException

home_path = C:\\Users\\zhaoyuchun\\workspace\\91z_2014
db_conf = res/db.properties
log_conf = res/log4j.properties
tair_conf = res/tair.properties
redis_conf = res/redis.properties
enable_encode_to_utf8 = true
socket_port = 50010
site_id = 3
data_name = StockTwits

base_url = http://stocktwits.com/streams/poll?stream=symbol&max=*&stream_id=*&substream=*&item_id=*http://stocktwits.com/symbol/http://stocktwits.com/ 
fetch_socket_timeout = 30000
fetch_connect_timeout = 10000
#unit: second
sleeptime = 1
#unit: minute
intervals = 60
#stop condition:0--crawl all; 1-- < page_size; 2-- begin_date <= pub_date < end_date; 3--when crawl crawled page; 4--only crawl uncrawled url, only compare url in dedup db
#if thread_no > 1, stop_type 可以有多个值,对应不同的thread,以;分割, e.g. 0;4
stop_type = 0
stock_type = 2
#set value when stop_type == 1
page_size = 1
#set value when stop_type == 2
#date format:yyyy-mm-dd
crawl_begin_date = 2012-02-20
crawl_end_date = 2012-02-21

#cookie setting
enable_cookie = false
max_url_capacity = 1000
max_page_size = 5000

regex_stream_id = 'streamId' : '(\\d+)'\,
regex_url_id = max_id: (\\d+)\,.*?poll_id: '(\\d+)'\,.*?substream:\\s'(.*?)'\,
regex_list =<li data-src=.*?id.*?(\\d+).*?created_at":"(.*?\\d+.*?) -.*?user_path":"\\\\\\/(.*?)"\,"avatar_url":"(.*?)"[\\s\\S]*?body.*?;:\\\\\\?"([\\s\\S]*?);links[\\s\\S]*?total_likes":(\\d+?)\,(?:[^<]*?replies":(\\d+)\,)?
enable_proxy = true
check_file = 10000000
proxy_file = data/proxy.xml
proxy_crawl_properties = res/proxy.properties
#the upper-bound pages to fetch from a host in a WATCH_INTERVAL
max_per_interval = 6000
#fetch density control interval, "ms" as unit
watch_interval = 60000
#about a proxy: if BLOCKVALUE urls fail to be fetched within BLOCK_FORBIDDEN_TIME, the proxy is invalid for the host, so delete the proxy
#and when the proxy is deleted, delete the current url;
block_forbidden_item = 120000
block_span = 3600000
#about a host:in FORBIDEN_PERIOD ,FORBIDEN_COUNT urls are not fetched,the host is invalid,then delete all the host urls in FORBIDEN_PERIOD
failure_max_count = 2
#forbidden_period = 1000000
failure_watch_interval = 1000000
failure_clear_span = 3600000
proxy_pool_min_size = 500
pool_update_span = 3600000

block_span = 3600000
#about a host:in FORBIDEN_PERIOD ,FORBIDEN_COUNT urls are not fetched,the host is invalid,then delete all the host urls in FORBIDEN_PERIOD
#forbidden_count = 10
failure_value = 2
#forbidden_period = 1000000
failure_watch_interval = 1000000
failure_clear_span = 3600000
proxy_pool_minsize = 10
pool_update_span = 3600000

enable_dedup = true

enable_write = true
save_page_path = ${home_path}/data/page
max_frequent = 1800000
max_per_queue = 5000
min_page_size = 10

#tair setting
space_name = 1000
redis_key  = EXTRACT0000

#thread setting

#host setting

proxy_url = http://www.yun-daili.com/api.asp?key=20151109200040011100200069397641&getnum=200&area=2&proxytype=1
translateUrl = https://translate.google.com.hk/translate_a/single?client=t&sl=en&tl=zh-CN&hl=zh-CN&dt=bd&dt=ex&dt=ld&dt=md&dt=qca&dt=rw&dt=rm&dt=ss&dt=t&dt=at&ie=UTF-8&oe=UTF-8&source=btn&ssel=0&tsel=0&kc=0&tk=470115*|*78768&q=%22*%22
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签:  谷歌 java 翻译 接口