您的位置:首页 > 其它

hive UDF 分享 通过IP地址获取 IDC/省份/城市

2014-05-12 23:25 288 查看
简述:

简单的二分查找算法:根据IP地址定位其所属的IP段,然后获取该IP段的IDC/省份/城市信息。

输入:IP地理信息文件,一般地址库拿到后需要格式化一下,参考:
1. 如果省份是null或者'',且城市是null或者'' => 省份=其他 and 城市=其他
2. 省份非空且为直辖市,但是城市非直辖市 => 城市=直辖市
3. 省份非空且非直辖市,但是城市为空 => 城市=其他
4. 省份或城市中有(、\等非法信息 => 省份=其他 and 城市=其他

/user/hadoop/IP.csv


格式:

编号,开始IP(long),结束IP(long),省份,城市,IDC,开始IP,结束IP
29990,16777472,16778239,福建省,其他,电信,1.0.1.0,1.0.3.255
29991,16779264,16781311,广东省,其他,电信,1.0.8.0,1.0.15.255
29992,16785408,16793599,广东省,其他,电信,1.0.32.0,1.0.63.255


用法 & 输出:

编辑打包或者编译到hive中请参考这篇,这里不再多说:http://my.oschina.net/wangjiankui/blog/64230

get_ip_location_new(visitip,'IDC') //返回IDC信息

get_ip_location_new(visitip,'REGION') //返回省份信息

get_ip_location_new(visitip,'CITY') //返回城市信息

代码:

package com.xxx.hive.udf;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.Text;

/**
 * Hive UDF that maps a dotted-quad IPv4 address to one attribute of the IP
 * range containing it: the IDC, the region (province) or the city.
 *
 * <p>The range table is read once per JVM from {@code /user/hadoop/IP.csv} on
 * the file system named by {@code fs.default.name}. Each usable line has the
 * form {@code id,startIp(long),endIp(long),region,city,idc,...}. Lookups use a
 * binary search over the start/end arrays, so the file must list ranges in
 * ascending order without overlaps.
 *
 * <p>Usage: {@code get_ip_location_new(ip, 'IDC' | 'REGION' | 'CITY')}.
 *
 * <p>NOTE(review): all state is static, unsynchronized and the per-IP caches
 * are unbounded; this presumably relies on one evaluation thread per task and
 * a limited number of distinct IPs - confirm before reusing elsewhere.
 */
public class UDFGetIPLocationNew extends UDF
{
    /** Raw CSV lines of the loaded range table; index i pairs with the two arrays below. */
    public static List<String> map = new ArrayList<String>();
    /** Inclusive numeric start of each range, parallel to {@link #map}. */
    public static long[] start_from_index;
    /** Inclusive numeric end of each range, parallel to {@link #map}. */
    public static long[] end_to_index;
    /** Numeric IP to IDC, filled lazily by successful lookups. */
    public static Map<Long, String> idcCache = new HashMap<Long, String>();
    /** Numeric IP to region, filled lazily by successful lookups. */
    public static Map<Long, String> regionCache = new HashMap<Long, String>();
    /** Numeric IP to city, filled lazily by successful lookups. */
    public static Map<Long, String> cityCache = new HashMap<Long, String>();

    private static final String UNKNOWN_ARGS = "Unknown Args!use(IDC or REGION or CITY)";
    private static final String NOT_FOUND = "Other IDC";

    /* Column positions inside one CSV line. */
    private static final int REGION_FIELD = 3;
    private static final int CITY_FIELD = 4;
    private static final int IDC_FIELD = 5;
    private static final int MIN_FIELDS = 6;

    /**
     * Reads the range table and publishes it to the static fields.
     *
     * <p>Lines that cannot be used (too few columns, non-numeric bounds,
     * start greater than end) are skipped so that {@link #map} and the two
     * bound arrays always stay index-aligned. Nothing is published when the
     * read fails, so a later call retries instead of running on a partial table.
     */
    private void LoadIPLocation()
    {
        Configuration conf = new Configuration();
        String namenode = conf.get("fs.default.name");
        String uri = namenode + "/user/hadoop/IP.csv";

        List<String> lines = new ArrayList<String>();
        List<long[]> ranges = new ArrayList<long[]>();
        FSDataInputStream in = null;
        BufferedReader d = null;
        try
        {
            // The FileSystem handle is deliberately not closed here:
            // FileSystem.get may hand out a shared, cached instance.
            FileSystem fs = FileSystem.get(URI.create(uri), conf);
            in = fs.open(new Path(uri));
            // NOTE(review): decodes with the platform default charset, as before;
            // the file holds Chinese text, so confirm the default matches its encoding.
            d = new BufferedReader(new InputStreamReader(in));
            String s;
            while ((s = d.readLine()) != null)
            {
                long[] range = parseRange(s);
                if (range != null)
                {
                    lines.add(s);
                    ranges.add(range);
                }
            }
        }
        catch (IOException e)
        {
            // Best effort, as before: report and leave the table unloaded.
            e.printStackTrace();
            return;
        }
        finally
        {
            IOUtils.closeStream(d);
            IOUtils.closeStream(in);
        }

        long[] starts = new long[ranges.size()];
        long[] ends = new long[ranges.size()];
        for (int i = 0; i < starts.length; i++)
        {
            long[] range = ranges.get(i);
            starts[i] = range[0];
            ends[i] = range[1];
        }

        // Publish the arrays first: a non-empty map is what marks the table as loaded.
        start_from_index = starts;
        end_to_index = ends;
        map.clear();
        map.addAll(lines);
    }

    /**
     * Extracts the numeric range bounds from one CSV line.
     *
     * @param line raw line from the range file
     * @return {@code {start, end}}, or {@code null} when the line is unusable
     */
    private static long[] parseRange(String line)
    {
        // Same split as the lookup path, so field positions always agree.
        String[] fields = line.split(",");
        if (fields.length < MIN_FIELDS)
        {
            return null;
        }
        try
        {
            long start = Long.parseLong(fields[1].trim());
            long end = Long.parseLong(fields[2].trim());
            return start <= end ? new long[] { start, end } : null;
        }
        catch (NumberFormatException ignored)
        {
            // Header or otherwise malformed line: skip it.
            return null;
        }
    }

    /**
     * Finds the range that contains {@code ip}.
     *
     * @param start inclusive range starts, ascending
     * @param end   inclusive range ends, parallel to {@code start}
     * @param ip    numeric address to locate
     * @return index of the containing range, or -1 when there is none
     *         (including when either array is {@code null})
     */
    public static int binarySearch(long[] start, long[] end, long ip)
    {
        if (start == null || end == null)
        {
            return -1;
        }
        int low = 0;
        int high = Math.min(start.length, end.length) - 1;
        while (low <= high)
        {
            int middle = (low + high) >>> 1; // overflow-safe midpoint
            if (ip < start[middle])
            {
                high = middle - 1;
            }
            else if (ip > end[middle])
            {
                low = middle + 1;
            }
            else
            {
                return middle;
            }
        }
        return -1;
    }

    /**
     * Converts a dotted-quad IPv4 string to its numeric value.
     *
     * @param ip text such as {@code "1.0.1.0"}; exactly four groups of one to
     *           three ASCII digits, each at most 255
     * @return the numeric address, or 0 when {@code ip} is {@code null} or malformed
     */
    public static long ip2long(String ip)
    {
        if (ip == null)
        {
            return 0L;
        }
        long ipNum = 0L;
        int octet = 0;
        int digits = 0;
        int dots = 0;
        for (int i = 0; i < ip.length(); i++)
        {
            char c = ip.charAt(i);
            if (c >= '0' && c <= '9')
            {
                octet = octet * 10 + (c - '0');
                digits++;
                if (digits > 3 || octet > 255)
                {
                    return 0L;
                }
            }
            else if (c == '.')
            {
                if (digits == 0 || dots == 3)
                {
                    return 0L;
                }
                ipNum = (ipNum << 8) | octet;
                dots++;
                octet = 0;
                digits = 0;
            }
            else
            {
                return 0L;
            }
        }
        if (dots != 3 || digits == 0)
        {
            return 0L;
        }
        return (ipNum << 8) | octet;
    }

    /**
     * Looks up one location attribute for an IP address.
     *
     * @param ip    dotted-quad IPv4 address; {@code null} or malformed input is
     *              treated as address 0
     * @param which {@code "IDC"}, {@code "REGION"} or {@code "CITY"}
     * @return the requested attribute, {@code "Other IDC"} when no range
     *         contains the address, or an explanatory message when
     *         {@code which} is not one of the three selectors
     */
    public String evaluate(Text ip, Text which)
    {
        long ipLong = ip2long(ip == null ? null : ip.toString());
        String whichString = which == null ? null : which.toString();

        final int field;
        final Map<Long, String> cache;
        if ("IDC".equals(whichString))
        {
            field = IDC_FIELD;
            cache = idcCache;
        }
        else if ("REGION".equals(whichString))
        {
            field = REGION_FIELD;
            cache = regionCache;
        }
        else if ("CITY".equals(whichString))
        {
            field = CITY_FIELD;
            cache = cityCache;
        }
        else
        {
            return UNKNOWN_ARGS;
        }

        if (map.isEmpty())
        {
            LoadIPLocation();
        }

        // Consult the cache explicitly. A search result of 0 is a real hit on
        // the first range and must not double as a "served from cache" marker.
        Long key = Long.valueOf(ipLong);
        String cached = cache.get(key);
        if (cached != null)
        {
            return cached;
        }

        int ipindex = binarySearch(start_from_index, end_to_index, ipLong);
        if (ipindex == -1)
        {
            return NOT_FOUND;
        }

        String[] location = map.get(ipindex).split(",");
        String value = location[field];
        cache.put(key, value);
        return value;
    }

    /**
     * Manual smoke test: resolves one fixed address for all three selectors
     * and prints the results together with the elapsed time in seconds.
     */
    public static void main(String[] args)
    {
        long startTime = System.currentTimeMillis();
        System.out.println("now:" + startTime);
        UDFGetIPLocationNew getIPLocation = new UDFGetIPLocationNew();
        Text ip = new Text("112.122.64.0");

        System.out.printf("ip = %s, %s, %s, %s\n", new Object[] { ip, getIPLocation.evaluate(ip, new Text("IDC")), getIPLocation.evaluate(ip, new Text("REGION")), getIPLocation.evaluate(ip, new Text("CITY")) });

        long endTime = System.currentTimeMillis();
        System.out.println("over:" + endTime);
        System.out.println("count:" + (endTime - startTime) * 1.0D / 1000.0D);
    }
}

#2015-06-02
补充说明:
程序中的逻辑有些不严谨,照抄请慎重,最好自己梳理一下逻辑后再做修改。
内容来自用户分享和网络整理,不保证内容的准确性,如有侵权内容,可联系管理员处理 点击这里给我发消息
标签: