python数据分析序列——DataFrame merge and apply
2017-03-09 09:10
801 查看
import cx_Oracle import pandas as pd import pandas.io.sql as psql
def fetch_all(sql,conn='user/pass@xxx.xx.x.xx:1521/x2'): db=cx_Oracle.connect(conn) df_ora=psql.read_sql(sql,con=db) db.close() return df_ora
sem_edu="user/pass@xxx.xx.x.xxx:1521/x1"
df_edu=fetch_all("select count(*) from ba.ba2010 ",sem_edu)
教育版人员数据总量
df_edu
这里输入代码
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>COUNT(*)</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>20507</td> </tr> </tbody> </table> </div>
df_pro=fetch_all("select count(*) from ba.ba2010 ")
正式版人员数据总量
df_pro
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>COUNT(*)</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>20394</td> </tr> </tbody> </table> </div>
df_edu=fetch_all("select emp_no emp_no1 from ba.ba2010 ",sem_edu)
df_pro=fetch_all("select emp_no emp_no2 from ba.ba2010 ")
df_edu.describe()
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>EMP_NO1</th> </tr> </thead> <tbody> <tr> <th>count</th> <td>20507</td> </tr> <tr> <th>unique</th> <td>20507</td> </tr> <tr> <th>top</th> <td>21206168</td> </tr> <tr> <th>freq</th> <td>1</td> </tr> </tbody> </table> </div>
df_pro.describe()
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>EMP_NO2</th> </tr> </thead> <tbody> <tr> <th>count</th> <td>20394</td> </tr> <tr> <th>unique</th> <td>20394</td> </tr> <tr> <th>top</th> <td>934471</td> </tr> <tr> <th>freq</th> <td>1</td> </tr> </tbody> </table> </div>通过merge操作将两个数据集合进行合并
df_merge=df_edu.merge(df_pro,how='outer',left_on='EMP_NO1',right_on="EMP_NO2")对空值进行填充处理
df_merge=df_merge.fillna('TUTUTU')筛选出两个集合的补集
df_merge_filter=df_merge[(df_merge.EMP_NO1=='TUTUTU') | (df_merge.EMP_NO2=='TUTUTU')]
fetch_all("select * from ba.ba2011 t where t.emp_no = '96128754'")
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>EMP_NO</th> <th>EMP_DESC</th> <th>ORGN_CD</th> <th>JO_NO</th> <th>JBRP</th> <th>JBL</th> <th>TRAD</th> <th>DIR_INDR_ID</th> <th>MH_APLY_PCNT</th> <th>MH_APLY_ST_DT</th> <th>MH_APLY_FN_DT</th> <th>RET_DT</th> <th>TEL_NO</th> <th>RGST_DT</th> <th>RGST_TIME</th> <th>RGST_USER_ID</th> <th>STUS_CD</th> </tr> </thead> <tbody> <tr> <th>0</th> <td>96128754</td> <td>???</td> <td>P35400</td> <td></td> <td>60</td> <td>80</td> <td>41</td> <td>D</td> <td>100</td> <td></td> <td></td> <td></td> <td></td> <td>20170308</td> <td>142650</td> <td>HR</td> <td>0</td> </tr> </tbody> </table> </div>
fetch_all("select * from ba.ba2011 t where t.emp_no = 'A875001'",sem_edu).T
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>0</th> </tr> </thead> <tbody> <tr> <th>EMP_NO</th> <td>A875001</td> </tr> <tr> <th>EMP_DESC</th> <td>???</td> </tr> <tr> <th>ORGN_CD</th> <td>P24307</td> </tr> <tr> <th>JO_NO</th> <td></td> </tr> <tr> <th>JBRP</th> <td>60</td> </tr> <tr> <th>JBL</th> <td>80</td> </tr> <tr> <th>TRAD</th> <td>53</td> </tr> <tr> <th>DIR_INDR_ID</th> <td>D</td> </tr> <tr> <th>MH_APLY_PCNT</th> <td>100</td> </tr> <tr> <th>MH_APLY_ST_DT</th> <td></td> </tr> <tr> <th>MH_APLY_FN_DT</th> <td></td> </tr> <tr> <th>RET_DT</th> <td></td> </tr> <tr> <th>TEL_NO</th> <td></td> </tr> <tr> <th>RGST_DT</th> <td>20170308</td> </tr> <tr> <th>RGST_TIME</th> <td>171936</td> </tr> <tr> <th>RGST_USER_ID</th> <td>IDM</td> </tr> <tr> <th>STUS_CD</th> <td>0</td> </tr> </tbody> </table> </div>
in_lizhidan="""988150 988156 988181 988158 988187 988001 988169 88893 85505 8Z0023 8Z0027 8Z0014 50223005 84571 84292 88531 85306 85520 8A0486 88840 922272 916800 922452 922604 922584 922220 922155 922601 919146 971222 971127 971310 971433 971385 971498 971446 971483 975326 975130 975347 939990 975229 975143 975028 88881 20001043 8A0874 21210014 J966334 J919047 81939 8A0455 919142 8A0041 8A0870 974332 953430 974711 974712 962077 978137 908439 908270 908205 978044 8A0563 88410 88444 89911 89943 89793 82976 8A0320 50207224 88253 953874 974572 974699 974412 974532 974702 953981 86980 88987 50223344 961789 961512 961400 961780 961717 961649 961735 961782 961550 961541""".split("\n")
df_merge_filter['IN_LIZHI']='N'
/usr/local/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy if __name__ == '__main__':
def has_lizhi(row): if row[0] in in_lizhidan: row[2]='Y' return row
df_lizhi=df_merge_filter.apply(has_lizhi,axis=1) df_lizhi
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>EMP_NO1</th> <th>EMP_NO2</th> <th>IN_LIZHI</th> </tr> </thead> <tbody> <tr> <th>10146</th> <td>94527750</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>17858</th> <td>94506706</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20395</th> <td>A920314</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20396</th> <td>85505</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20397</th> <td>86980</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20398</th> <td>84292</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20399</th> <td>85306</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20400</th> <td>84571</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20401</th> <td>82976</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20402</th> <td>916800</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20403</th> <td>81939</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20404</th> <td>85520</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20405</th> <td>953430</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20406</th> <td>962077</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20407</th> <td>922155</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20408</th> <td>922220</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20409</th> <td>922272</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20410</th> <td>J966334</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20411</th> <td>971127</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20412</th> <td>953874</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20413</th> <td>971222</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20414</th> <td>88253</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20415</th> <td>988158</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20416</th> <td>939990</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20417</th> <td>988169</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20418</th> <td>988001</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20419</th> <td>89793</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20420</th> <td>961717</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20421</th> <td>974332</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20422</th> <td>961550</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>...</th> <td>...</td> <td>...</td> <td>...</td> </tr> <tr> <th>20478</th> <td>919146</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20479</th> <td>88893</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20480</th> <td>A810695</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20481</th> <td>978044</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20482</th> <td>88881</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20483</th> <td>961735</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20484</th> <td>975347</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20485</th> <td>88987</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20486</th> <td>A910397</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20487</th> <td>971483</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20488</th> <td>A920302</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20489</th> <td>8A0874</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20490</th> <td>8Z0027</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20491</th> <td>8Z0023</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20492</th> <td>961780</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20493</th> <td>975326</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20494</th> <td>974699</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20495</th> <td>974711</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20496</th> <td>974712</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20497</th> <td>8Z0014</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20498</th> <td>974702</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20499</th> <td>971498</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20500</th> <td>A836893</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20501</th> <td>A800547</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20502</th> <td>21210014</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20503</th> <td>961541</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20504</th> <td>961789</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20505</th> <td>961782</td> <td>TUTUTU</td> <td>Y</td> </tr> <tr> <th>20506</th> <td>96128754</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20507</th> <td>TUTUTU</td> <td>A875001</td> <td>N</td> </tr> </tbody> </table> <p>115 rows × 3 columns</p> </div>
df_lizhi[df_lizhi.IN_LIZHI=='N']
<div> <table border="1" class="dataframe"> <thead> <tr style="text-align: right;"> <th></th> <th>EMP_NO1</th> <th>EMP_NO2</th> <th>IN_LIZHI</th> </tr> </thead> <tbody> <tr> <th>10146</th> <td>94527750</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>17858</th> <td>94506706</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20395</th> <td>A920314</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20424</th> <td>A920146</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20425</th> <td>91026733</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20426</th> <td>A920096</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20432</th> <td>A970202</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20445</th> <td>50226793</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20449</th> <td>A920031</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20452</th> <td>A350225</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20453</th> <td>A300117</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20465</th> <td>A920089</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20466</th> <td>A920128</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20480</th> <td>A810695</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20486</th> <td>A910397</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20488</th> <td>A920302</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20500</th> <td>A836893</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20501</th> <td>A800547</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20506</th> <td>96128754</td> <td>TUTUTU</td> <td>N</td> </tr> <tr> <th>20507</th> <td>TUTUTU</td> <td>A875001</td> <td>N</td> </tr> </tbody> </table> </div>
相关文章推荐
- 用python做数据分析4|pandas库介绍之DataFrame基本操作
- 用python做数据分析4|pandas库介绍之DataFrame基本操作
- 用python做数据分析4|pandas库介绍之DataFrame基本操作 by 是蓝先生
- 利用python进行数据分析之dataframe的常见操作
- 数据结构之--series,DataFrame.use python and pandas for data mining
- 用python做数据分析4|pandas库介绍之DataFrame基本操作
- 用python做数据分析4|pandas库介绍之DataFrame基本操作
- 量化投资与数据分析一: 如何用PYTHON下载WIND数据并转化成dataframe格式 分享
- python - pandas dataFrame merge 数据合并
- 利用Python进行数据分析(8) pandas基础: Series和DataFrame的基本操作
- 用python做数据分析4|pandas库介绍之DataFrame基本操作
- 用python做数据分析|pandas库:DataFrame基本操作
- 用python做数据分析4|pandas库介绍之DataFrame基本操作
- #######用python做数据分析4|pandas库介绍之DataFrame基本操作#######
- 利用python进行数据分析——data = {i : randn() for i in range(7))
- python pandas.DataFrame选取、修改数据最好用.loc,.iloc,.ix
- python 把几个DataFrame合并成一个DataFrame——merge,append,join,conca
- PyOdps DataFrame来临,数据分析从未如此简单!
- 利用python进行数据分析—数据清洗记录3,map,apply,
- 使用 MySQLdb 从数据库拉回数据,并将数据存入 python的DataFrame中