# R文本挖掘处理
# 2017-11-07 14:27
# 176 查看
# Remove every object from the current environment before the script runs, so
# that nothing left over from an earlier session is picked up by accident.
# NOTE(review): this also wipes the workspace of whoever source()s this file.
rm(list = ls())
# Attach packages.  The order matters: a package attached later masks
# same-named functions from packages attached earlier (MASS is attached after
# dplyr here, and both export a function called select()).
# The code visible in this script calls functions from dplyr (joins, group_by,
# tally, filter, summarise, distinct), data.table (fread), readxl (read_excel),
# ggplot2 and stringr (str_extract); the remaining packages are not referenced
# in the visible code.
library(tidyr)
library(dplyr)
library(data.table)
library(readr)
library(lubridate)
library(ggplot2)
library(readxl)
library(rJava)
library(xlsxjars)
library(xlsx)
library(devtools)
library(recharts)
library(REmap)
library(plotly)
library(stringr)
library(corrplot)
library(psych)
library(lars)
library(car)
library(GGally)
library(MASS)
library(ridge)
# Report the starting directory, switch to the project folder, and list its
# contents.  Every file name used later in the script is relative to this
# folder.
getwd()
setwd("C:\\Users\\Administrator\\Desktop\\外访地址有效性判定")
dir()

# Excel inputs.  Neither object is used by the active (non-commented) code
# further down; they are only referenced in commented-out experiments.
wfdata <- read_excel(path = "waifang.xlsx")
wfpp <- read_excel(path = "匹配数据.xlsx")

# UTF-8 text inputs: dzhi supplies address_type per (address, id_no) and
# visit0 is the table the rest of the script enriches.
dzhi <- fread("dizhi.txt", encoding = "UTF-8")
visit0 <- fread("visit.txt", encoding = "UTF-8")
# data1 <- wfpp %>% group_by(lx) %>% summarise(yx = sum(最终有效),zsh = sum(计数),zhb = yx/zsh)
#
#
# wfpp$age <- 2017-as.numeric(substr(wfpp$shfzhID,7,10))
#
#
# pp1 <- wfpp %>% select()
#
# wfdata1 <- left_join(wfdata,wfpp,)
#
#
# test1 <- wfpp %>% filter(最终有效==1)
#
# test2 <- wfpp %>% filter(最终有效==0)
#
#
# test2 <- distinct(test2)
#
# test2$address <- gsub(pattern="内蒙古自治区", replacement="", test2$address)
#
# test2$address <- gsub(pattern="内蒙古", replacement="", test2$address)
#
#
# test2$address <- gsub(pattern="呼和浩特市", replacement="", test2$address)
#110101195801082086,重复例子
# Attach the address metadata from dzhi to every visit row; a row matches when
# both the address text and the id number agree.
bind1 <- visit0 %>%
  left_join(dzhi, by = c("address", "id_no"))

# ---- Address-type frequency tables (exploratory) ----
# test1: raw address_type counts after the join; test2: the same for dzhi.
test1 <- bind1 %>%
  group_by(address_type) %>%
  tally()
test2 <- dzhi %>%
  group_by(address_type) %>%
  tally()

# Earlier per-category flag experiment, kept for reference:
# test2$单位 <- grepl(pattern = '单位地址|公司|单位',test2$address_type)
# test2$户籍 <- grepl(pattern = '户籍地址|户籍',test2$address_type)
# test2$家庭 <- grepl(pattern = '家庭地址|家庭',test2$address_type)
# test2$通讯 <- grepl(pattern = '通讯',test2$address_type)

# test3: counts after collapsing the raw labels into five canonical types.
# The rules are tried top to bottom and the first match wins; anything that
# matches none of them (including missing values) becomes "其他".
test3 <- test1 %>%
  mutate(
    address_type = case_when(
      grepl("户籍", address_type) ~ "户籍地址",
      grepl("家庭|住宅|居住|新地址|住址", address_type) ~ "家庭地址",
      grepl("公司|单位|工作", address_type) ~ "单位地址",
      grepl("通讯|邮寄", address_type) ~ "通讯地址",
      TRUE ~ "其他"
    )
  ) %>%
  group_by(address_type) %>%
  summarise(sum1 = sum(n))

# Flag, on the raw counts, which labels mention "人行" and which mention "新".
test1$人行 <- grepl("人行", test1$address_type)
test1$新 <- grepl("新", test1$address_type)
# ---- bind1: canonical address type ----
# Collapse the free-text address_type into five canonical labels, stored in a
# new column (newaddress_type) so that the raw text stays available for the
# source flags derived afterwards.
#
# The rules run in order on the column as it is progressively rewritten; none
# of the canonical labels matches a later pattern, so effectively the first
# matching rule wins.  Values matching no rule (including NA) become "其他".
#
# Fix: the second rule previously had a stray "4000" token in the middle of
# its grepl() call, which made the whole script fail to parse.
bind1$newaddress_type <- bind1$address_type
bind1$newaddress_type[grepl("户籍", bind1$newaddress_type)] <- "户籍地址"
bind1$newaddress_type[
  grepl("家庭|住宅|居住|新地址|住址", bind1$newaddress_type)
] <- "家庭地址"
bind1$newaddress_type[
  grepl("公司|单位|工作", bind1$newaddress_type)
] <- "单位地址"
bind1$newaddress_type[grepl("通讯|邮寄", bind1$newaddress_type)] <- "通讯地址"
bind1$newaddress_type[
  !(bind1$newaddress_type %in% c("户籍地址", "家庭地址", "单位地址", "通讯地址"))
] <- "其他"
# ---- bind1: address source flag (bz) ----
# bz is "人行", "新", "人行新" or "无", depending on which of the two markers
# occur in the raw address_type text.
#
# bz is created first (with a placeholder value) so that it is positioned
# before the two helper columns; the positional slice at the end relies on
# that column order.
bind1$bz <- bind1$address_type
bind1$bz1 <- ifelse(grepl("人行", bind1$address_type), "人行", "")
bind1$bz2 <- ifelse(grepl("新", bind1$address_type), "新", "")
bind1$bz <- paste0(bind1$bz1, bind1$bz2)
bind1$bz[bind1$bz == ""] <- "无"

# Keep only the first 73 columns.
# NOTE(review): presumably this is meant to drop the bz1/bz2 helpers; the
# hard-coded width depends on the input files' schema -- confirm.
bind1 <- bind1[,1:73]
# Visits whose first comment says the visited address exists.
a1 <- bind1 %>% filter(comment_a1 == "外访地址存在")

# ---- Split the address text into rough components ----
# Drop the country prefix before extracting anything.
bind1$address <- gsub("中国", "", bind1$address)

# Province: two or three word characters followed by "省" at the very start.
bind1$province <- str_extract(bind1$address, "^\\w{2,3}省")

# City: the first "<word char><any char>市" triple, i.e. at most two
# characters before "市".
# NOTE(review): for longer city names this captures only the tail (the
# follow-up substitution special-cases 呼和浩特) -- confirm the intended pattern.
bind1$city <- str_extract(bind1$address, "\\w.市")
bind1$city <- gsub(".呼和浩特", "呼和浩特", bind1$city)

# Town / district / township.
# Fix: the class used to be written [镇|区|乡], which also matched a literal
# "|"; inside [] the alternatives are simply listed.
bind1$zhen <- str_extract(bind1$address, "\\w.{1,2}[镇区乡]")

# First detail marker (number / room / household / village / road ...),
# together with any literal dots directly in front of it.
# Fix: same stray "|" removed from the character class.
# NOTE(review): "\\.*" matches literal dots only; if "anything up to the
# marker" was intended the pattern should start with ".*" -- confirm.
bind1$detail <- str_extract(bind1$address, "\\.*[号房室户部村路1]")

# Strip punctuation and blanks from the address.
# Fix: the pattern was an alternation with bare, unescaped ")" branches (one
# of them duplicated).  A bracket expression needs no escaping and is accepted
# by every regex engine.  The duplicated ")" suggests a full-width variant was
# lost, so the full-width comma and closing parenthesis are stripped as well.
bind1$address <- gsub("[,,)) .*-]", "", bind1$address)

# Last character of the cleaned address, both as a column and as a free
# vector (mowei) for tabulation.
address_len <- nchar(bind1$address)
bind1$detail1 <- substr(bind1$address, address_len, address_len)
mowei <- substr(bind1$address, address_len, address_len)
# ---- Exploratory summaries by visit outcome (comment_a1) ----
# Rows per outcome.
dizhilxtj <- bind1 %>%
  group_by(comment_a1) %>%
  tally()

# Rows per outcome and last address character, plus each combination's share
# (zhb) of its outcome total.  After the join, n.x is the combination count
# and n.y the outcome total.
test11 <- bind1 %>%
  group_by(comment_a1, detail1) %>%
  tally() %>%
  left_join(dizhilxtj, by = "comment_a1")
test11$zhb <- test11$n.x / test11$n.y

# Frequency table of the last address character.
mowei1 <- data.frame(table(mowei))

# Rows per outcome and canonical address type, shown as a dodged bar chart.
test11_1 <- bind1 %>%
  group_by(comment_a1, newaddress_type) %>%
  tally()
ggplot(test11_1, aes(x = newaddress_type, y = n, fill = comment_a1)) +
  geom_bar(
    stat = "identity",
    width = 0.5,
    position = position_dodge(0.6)
  )

# Rows per combination of the two comment fields.
test11_2 <- bind1 %>%
  group_by(comment_a1, comment_c1) %>%
  tally()
# Number of rows per case, counted before any rows are filtered out.
tbnum <- bind1 %>%
  group_by(case_no) %>%
  tally()
colnames(tbnum) <- c("case_no", "countnum")

# Keep rows with an 18-character id number, derive two features from it, and
# attach the per-case row count.
#   year: 2017 minus the four digits at positions 7-10 of the id (so despite
#         its name it holds an age as of 2017, not a calendar year).
#   sex : parity of the digit at position 17 (1 = odd, 0 = even).
bind1 <- bind1 %>%
  filter(nchar(id_no) == 18) %>%
  mutate(
    year = 2017 - as.numeric(substr(id_no, 7, 10)),
    sex = as.numeric(substr(id_no, 17, 17)) %% 2
  ) %>%
  left_join(tbnum, by = "case_no")

# Address confirmed to exist, but the second comment is none of the three
# listed standard answers.
test11_3 <- bind1 %>%
  filter(
    comment_a1 == "外访地址存在",
    !(comment_c1 %in% c("确认持卡人不在此", "确认持卡人在此", "请选择"))
  )
# ---- Feature x7: is the home address the same as the registered address? ----
# hjshf = 1 when, for an id number, at least one home address string equals
# one of its household-registration address strings; otherwise 0.

# Collapse dzhi's address_type into the five canonical labels, in place.
# Rules are tried top to bottom; unmatched values (including NA) become "其他".
dzhi$address_type <- case_when(
  grepl("户籍", dzhi$address_type) ~ "户籍地址",
  grepl("家庭|住宅|居住|新地址|住址", dzhi$address_type) ~ "家庭地址",
  grepl("公司|单位|工作", dzhi$address_type) ~ "单位地址",
  grepl("通讯|邮寄", dzhi$address_type) ~ "通讯地址",
  TRUE ~ "其他"
)

x7_1 <- dzhi %>% filter(address_type == "家庭地址")
x7_2 <- dzhi %>% filter(address_type == "户籍地址")

# Pair every home address with every registered address of the same id.
# pd is 1 for an exact string match and 0 otherwise; a missing comparison
# (for example no registered address at all) counts as 0.
x7 <- x7_1 %>% left_join(x7_2, by = "id_no")
x7$pd <- as.numeric((x7$address.x == x7$address.y) %in% TRUE)

# One row per id: 1 if any of its pairs matched.
x7result <- x7 %>%
  group_by(id_no) %>%
  summarise(hjshf = as.numeric(sum(pd) > 0)) %>%
  distinct()

# Attach to bind1; ids without any home address get 0.
bind1 <- bind1 %>% left_join(x7result, by = "id_no")
bind1$hjshf <- replace(bind1$hjshf, is.na(bind1$hjshf), 0)
# ---- Feature x11: does the address contain a digit? ----
# 1 = contains a digit, 0 = does not.
# Fix: the pattern was "[0-9]|[0-9]", i.e. the same class twice.  The
# redundant alternative suggests a full-width digit class was lost, so both
# ASCII and full-width digits are matched here.  The logical result is
# converted to 0/1 directly instead of through two subset assignments.
bind1$numyx <- as.numeric(grepl("[0-90-9]", bind1$address))

# ---- Feature x9: urban or rural address ----
# 0 = urban, 1 = rural.  An address is treated as rural when it contains any
# of the village-level markers below.
bind1$cityornot <- as.numeric(grepl("村|屯|队|组|庄", bind1$address))
# ---- Feature x10: does the address city match the city of the id number? ----
# 1 = same, 0 = different or not comparable.
# NOTE(review): the restriction to registered ("户籍地址") rows is commented
# out, so the comparison runs on every address -- confirm that is intended.
# x10 <- bind1 %>% filter(newaddress_type == '户籍地址')

# The first four id digits are the lookup key into datashfzh.  The join
# produces .x/.y suffixed columns, which requires the lookup sheet to carry
# its own city and province columns.
bind1$shfzh4 <- as.numeric(substr(bind1$id_no, 1, 4))
datashfzh <- read_excel("datashfzh.xlsx")
bind1 <- bind1 %>% left_join(datashfzh, by = "shfzh4")

# Normalise both sides: drop "市" from the parsed city, and "省" plus any
# digits from the looked-up province.
bind1$city.x <- gsub("市", "", bind1$city.x)
bind1$province.y <- gsub("省|[0-9]", "", bind1$province.y)
unique(bind1$city.x)

# A comparison involving a missing value counts as "different".
bind1$shhjpd <- as.numeric((bind1$city.x == bind1$city.y) %in% TRUE)
# ---- Feature x8: is the address specific enough? ----
# 1 = specific, 0 = not specific.  An address is not specific when its last
# character is one of the area-level markers below, or when the cleaned
# address is at most 15 characters long.  A missing address length never
# triggers the length rule.
ends_at_area_level <- bind1$detail1 %in% c("村", "组", "楼", "屯", "队")
is_short <- (nchar(bind1$address) <= 15) %in% TRUE
bind1$dizhidetail <- ifelse(ends_at_area_level | is_short, 0, 1)
# ---- Assemble the feature table and write it to CSV ----
# Fix: the table used to be built with cbind() on vectors of mixed type, which
# yields a character matrix, so age, amount and every 0/1 flag were turned
# into text (and a column missing from bind1 was dropped silently, surfacing
# only as a confusing colnames<- length error).  Selecting the columns from
# the data frame keeps each column's own type, and the presence check fails
# early with a clear message.  In the CSV, numeric columns are therefore
# written unquoted.
feature_cols <- c(
  "newaddress_type", "year", "sex", "customer_name", "bz", "case_money",
  "hjshf", "dizhidetail", "cityornot", "shhjpd", "numyx", "address"
)
missing_cols <- setdiff(feature_cols, names(bind1))
if (length(missing_cols) > 0) {
  stop(
    "bind1 is missing feature column(s): ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}

dataresult <- as.data.frame(bind1)[, feature_cols, drop = FALSE]

# Output headers, in the same order as feature_cols.
colnames(dataresult) <- c('地址类型','年龄','性别','客户类型','地址来源','欠款金额','家庭地址与户籍地址是否相同',
'地址是否具体','城市或者农村','身份证与户籍地址是否相同','地址是否含有数字','外访地址')

write.csv(dataresult,'特征数据.csv')
# NOTE: everything from the first library() call through write.csv() above was
# repeated here line for line (same packages, same input files, same output
# file, and the same stray "4000" token inside a grepl() call).  Running it a
# second time would only re-read the inputs, rebuild identical objects and
# overwrite the same CSV, so the duplicate has been removed.  Maintain the
# single copy above.
# 相关文章推荐
# - Python爬虫/文本处理/科学计算/机器学习/数据挖掘兵器谱
# - Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱
# - Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱(转)
# - 【Python】Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱
# - Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱
# - Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱
# - Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱
# - Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱
# - Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱
# - Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱
# - Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱
# - Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱
# - Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱(转)
# - Python的网页爬虫&文本处理&科学计&机器学习&数据挖掘工具集
# - Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器库
# - [resource-]Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器谱
# - R语言做文本挖掘 Part2分词处理
# - python --网页爬虫,文本处理,科学计算,机器学习,数据挖掘资料+附带工具包下载
# - Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器库
# - Python 网页爬虫 & 文本处理 & 科学计算 & 机器学习 & 数据挖掘兵器库