pandas常用操作
2015-07-20 21:01
471 查看
In [6]: dates = pd.date_range(’20130101’,periods=6) In [8]: df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list(’ABCD’)) In [9]: df Out[9]: A B C D 2013-01-01 0.469112 -0.282863 -1.509059 -1.135632 2013-01-02 1.212112 -0.173215 0.119209 -1.044236 2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 2013-01-04 0.721555 -0.706771 -1.039575 0.271860 2013-01-05 -0.424972 0.567020 0.276232 -1.087401 2013-01-06 -0.673690 0.113648 -1.478427 0.524988 df.values df.sort_index(axis=1, ascending=False) df.sort(columns=’B’) df[’A’] df[0:3] df[’20130102’:’20130104’] df.loc[:,[’A’,’B’]] df.loc[’20130102’:’20130104’,[’A’,’B’]] df.iloc[3] df.iloc[3:5,0:2] df.iloc[[1,2,4],[0,2]] df1 = df.reindex(index=dates[0:4],columns=list(df.columns) + [’E’]) df.mean() df.mean(1) df.apply(lambda x: x.max() - x.min())
In [77]: left = pd.DataFrame({’key’: [’foo’, ’foo’], ’lval’: [1, 2]}) In [78]: right = pd.DataFrame({’key’: [’foo’, ’foo’], ’rval’: [4, 5]}) In [79]: left Out[79]: key lval 0 foo 1 1 foo 2 In [80]: right Out[80]: key rval 0 foo 4 1 foo 5 In [81]: pd.merge(left, right, on=’key’) Out[81]: key lval rval 0 foo 1 4 1 foo 1 5 2 foo 2 4 3 foo 2 5
df.append(s, ignore_index=True)
In [87]: df Out[87]: A B C D 0 foo one -1.202872 -0.055224 1 bar one -1.814470 2.395985 2 foo two 1.018601 1.552825 3 bar three -0.595447 0.166599 4 foo two 1.395433 0.047609 5 bar two -0.392670 -0.136473 6 foo one 0.007207 -0.561757 7 foo three 1.928123 -1.623033 In [88]: df.groupby(’A’).sum() Out[88]: C D A bar -2.802588 2.42611 foo 3.146492 -0.63958 In [89]: df.groupby([’A’,’B’]).sum() Out[89]: C D A B bar one -1.814470 2.395985 three -0.595447 0.166599 two -0.392670 -0.136473 foo one -1.195665 -0.616981 three 1.928123 -1.623033 two 2.414034 1.600434
In [101]: df Out[101]: A B C D E 0 one A foo 1.418757 -0.179666 1 one B foo -1.879024 1.291836 2 two C foo 0.536826 -0.009614 3 three A bar 1.006160 0.392149 4 one B bar -0.029716 0.264599 5 one C bar -1.146178 -0.057409 6 two A foo 0.100900 -1.425638 7 three B foo -1.035018 1.024098 8 one C foo 0.314665 -0.106062 9 one A bar -0.773723 1.824375 10 two B bar -1.170653 0.595974 11 three C bar 0.648740 1.167115 In [102]: pd.pivot_table(df, values=’D’, index=[’A’, ’B’], columns=[’C’]) Out[102]: C bar foo A B one A -0.773723 1.418757 B -0.029716 -1.879024 C -1.146178 0.314665 three A 1.006160 NaN B NaN -1.035018 C 0.648740 NaN two A NaN 0.100900 B -1.170653 NaN C NaN 0.536826
# List to hold file names FileNames = [] # Your path will be different, please modify the path below. #os.chdir()函数功能:改变当前路径到指定路径 os.chdir(r"C:\Users\david\notebooks\pandas") # Find any file that ends with ".xlsx" for files in os.listdir("."): if files.endswith(".xlsx"): FileNames.append(files) FileNames def GetFile(fnombre): # Path to excel file # Your path will be different, please modify the path below. location = r'C:\Users\david\notebooks\pandas\\' + fnombre # Parse the excel file # 0 = first sheet df = pd.read_excel(location, 0) # Tag record to file name df['File'] = fnombre # Make the "File" column the index of the df return df.set_index(['File']) # Create a list of dataframes df_list = [GetFile(fname) for fname in FileNames] # Combine all of the dataframes into one big_df = pd.concat(df_list)
df One_X One_Y Two_X Two_Y row 0 1.1 1.2 1.11 1.22 0 1 1.1 1.2 1.11 1.22 1 2 1.1 1.2 1.11 1.22 2 In [60]: df = df.set_index(’row’);df Out[60]: One_X One_Y Two_X Two_Y row 0 1.1 1.2 1.11 1.22 1 1.1 1.2 1.11 1.22 2 1.1 1.2 1.11 1.22 In [61]: df.columns = pd.MultiIndex.from_tuples([tuple(c.split(’_’)) for c in df.columns]);df Out[61]: One Two X Y X Y row 0 1.1 1.2 1.11 1.22 1 1.1 1.2 1.11 1.22 2 1.1 1.2 1.11 1.22 In [62]: df = df.stack(0).reset_index(1);df Out[62]: level_1 X Y row 0 One 1.10 1.20 0 Two 1.11 1.22 1 One 1.10 1.20 1 Two 1.11 1.22 2 One 1.10 1.20 2 Two 1.11 1.22 In [63]: df.columns = [’Sample’,’All_X’,’All_Y’];df Out[63]: Sample All_X All_Y row 0 One 1.10 1.20 0 Two 1.11 1.22 1 One 1.10 1.20 1 Two 1.11 1.22 2 One 1.10 1.20 2 Two 1.11 1.22 In [64]: cols = pd.MultiIndex.from_tuples([ (x,y) for x in [’A’,’B’,’C’] for y in [’O’,’I’]]) In [65]: df = pd.DataFrame(np.random.randn(2,6),index=[’n’,’m’],columns=cols); df Out[65]: A B C O I O I O I n 1.920906 -0.388231 -2.314394 0.665508 0.402562 0.399555 m -1.765956 0.850423 0.388054 0.992312 0.744086 -0.739776 In [72]: index = list(itertools.product([’Ada’,’Quinn’,’Violet’],[’Comp’,’Math’,’Sci’])) In [73]: headr = list(itertools.product([’Exams’,’Labs’],[’I’,’II’])) In [74]: indx = pd.MultiIndex.from_tuples(index,names=[’Student’,’Course’]) In [75]: cols = pd.MultiIndex.from_tuples(headr) #Notice these are un-named In [76]: data = [[70+x+y+(x*y)%3 for x in range(4)] for y in range(9)] In [77]: df = pd.DataFrame(data,indx,cols); df Out[77]: Exams Labs I II I II Student Course Ada Comp 70 71 72 73 Math 71 73 75 74 Sci 72 75 75 75 Quinn Comp 73 74 75 76 Math 74 76 78 77 Sci 75 78 78 78 Violet Comp 76 77 78 79 Math 77 79 81 80 Sci 78 81 81 81 In [78]: All = slice(None) In [79]: df.loc[’Violet’] Out[79]: Exams Labs I II I II Course Comp 76 77 78 79 Math 77 79 81 80 Sci 78 81 81 81 In [80]: df.loc[(All,’Math’),All] Out[80]: Exams Labs I II I II Student Course Ada Mat 11ee2 h 71 73 75 74 Quinn Math 74 76 78 77 Violet Math 77 79 81 80 In [81]: df.loc[(slice(’Ada’,’Quinn’),’Math’),All] Out[81]: Exams Labs I II I II Student Course Ada Math 71 73 75 74 Quinn Math 74 76 78 77 In [82]: df.loc[(All,’Math’),(’Exams’)] Out[82]: I II Student Course Ada Math 71 73 Quinn Math 74 76 Violet Math 77 79 In [83]: df.loc[(All,’Math’),(All,’II’)] Out[83]: Exams Labs II II Student Course Ada Math 73 74 Quinn Math 76 77 Violet Math 79 80
df = pd.DataFrame({’animal’: ’cat dog cat fish dog cat cat’.split(), ....: ’size’: list(’SSMMMLL’), ....: ’weight’: [8, 10, 11, 1, 20, 12, 12], ....: ’adult’ : [False] * 5 + [True] * 2}); df adult animal size weight 0 False cat S 8 1 False dog S 10 2 False cat M 11 3 False fish M 1 4 False dog M 20 5 True cat L 12 6 True cat L 12 #List the size of the animals with the highest weight. In [90]: df.groupby(’animal’).apply(lambda subf: subf[’size’][subf[’weight’].idxmax()]) Out[90]: animal cat L dog M fish M dtype: object In [91]: gb = df.groupby([’animal’]) In [92]: gb.get_group(’cat’) Out[92]: adult animal size weight 0 False cat S 8 2 False cat M 11 5 True cat L 12 6 True cat L 12 Apply to different items in a group In [93]: def GrowUp(x): ....: avg_weight = sum(x[x.size == ’S’].weight * 1.5) ....: avg_weight += sum(x[x.size == ’M’].weight * 1.25) ....: avg_weight += sum(x[x.size == ’L’].weight) ....: avg_weight = avg_weight / len(x) ....: return pd.Series([’L’,avg_weight,True], index=[’size’, ’weight’, ’adult’]) ....: In [94]: expected_df = gb.apply(GrowUp) In [95]: expected_df Out[95]: size weight adult animal cat L 12.4375 True dog L 20.0000 True fish L 1.2500 True Create a value counts column and reassign back to the DataFrame In [115]: df = pd.DataFrame({’Color’: ’Red Red Red Blue’.split(), .....: ’Value’: [100, 150, 50, 50]}); df .....: Out[115]: Color Value 0 Red 100 1 Red 150 2 Red 50 3 Blue 50 In [116]: df[’Counts’] = df.groupby([’Color’]).transform(len) In [117]: df Out[117]: Color Value Counts 0 Red 100 3 1 Red 150 3 2 Red 50 3 3 Blue 50 1 Rolling Apply to multiple columns where function returns a Scalar (Volume Weighted Average Price) In [138]: rng = pd.date_range(start = ’2014-01-01’,periods = 100) In [139]: df = pd.DataFrame({’Open’ : np.random.randn(len(rng)), .....: ’Close’ : np.random.randn(len(rng)),’Volume’ : np.random.randint(100,2000,len(rng))}, index=rng) df Close Open Volume 2014-01-01 1.550590 0.458513 1371 2014-01-02 -0.818812 -0.508850 1433 2014-01-03 1.160619 0.257610 645 2014-01-04 0.081521 -1.773393 878 2014-01-05 1.083284 -0.560676 1143 2014-01-06 -0.518721 0.284174 1088 2014-01-07 0.140661 1.146889 1722 ... ... ... ... 2014-04-04 0.458193 -0.669474 1768 2014-04-05 0.108502 -1.616315 836 2014-04-06 1.418082 -1.294906 694 2014-04-07 0.486530 1.171647 796 2014-04-08 0.181885 0.501639 265 2014-04-09 -0.707238 -0.361868 1293 2014-04-10 1.211432 1.564429 1088 In [140]: def vwap(bars): return ((bars.Close*bars.Volume).sum()/bars.Volume.sum()).round(2) In [141]: window = 5 In [142]: s = pd.concat([ (pd.Series(vwap(df.iloc[i:i+window]), index=[df.index[i+window]])) for i in range(95)]) Out[142]: 2014-01-06 0.55 2014-01-07 0.06 2014-01-08 0.32 2014-01-09 0.03 2014-01-10 0.08 ... 2014-04-05 0.48 2014-04-06 0.54 2014-04-07 0.46 2014-04-08 0.45 2014-04-09 0.53 2014-04-10 0.15 Length: 95
Replacing some values with mean of the rest of a group In [100]: df = pd.DataFrame({’A’ : [1, 1, 2, 2], ’B’ : [1, -1, 1, 2]}) In [101]: gb = df.groupby(’A’) In [102]: def replace(g): .....: mask = g < 0 .....: g.loc[mask] = g[~mask].mean() .....: return g In [103]: gb.transform(replace) Out[103]: B 0 1 1 1 2 1 3 2 Sort groups by aggregated data In [104]: df = pd.DataFrame({’code’: [’foo’, ’bar’, ’baz’] * 2, .....: ’data’: [0.16, -0.21, 0.33, 0.45, -0.59, 0.62], .....: ’flag’: [False, True] * 3}) .....: In [105]: code_groups = df.groupby(’code’) In [106]: agg_n_sort_order = code_groups[[’data’]].transform(sum).sort(’data’) In [107]: sorted_df = df.ix[agg_n_sort_order.index] In [108]: sorted_df Out[108]: code data flag 1 bar -0.21 True 4 bar -0.59 False 0 foo 0.16 False 3 foo 0.45 True 2 baz 0.33 False 5 baz 0.62 True Create multiple aggregated columns In [109]: rng = pd.date_range(start="2014-10-07",periods=10,freq=’2min’) In [110]: ts = pd.Series(data = list(range(10)), index = rng) In [111]: def MyCust(x): .....: if len(x) > 2: .....: return x[1] * 1.234 .....: return pd.NaT In [112]: mhc = {’Mean’ : np.mean, ’Max’ : np.max, ’Custom’ : MyCust} In [113]: ts.resample("5min",how = mhc) Out[113]: Max Custom Mean 2014-10-07 00:00:00 2 1.234 1.0 2014-10-07 00:05:00 4 NaN 3.5 2014-10-07 00:10:00 7 7.404 6.0 2014-10-07 00:15:00 9 NaN 8.5 In [114]: ts Out[114]: 2014-10-07 00:00:00 0 2014-10-07 00:02:00 1 2014-10-07 00:04:00 2 2014-10-07 00:06:00 3 2014-10-07 00:08:00 4 2014-10-07 00:10:00 5 2014-10-07 00:12:00 6 2014-10-07 00:14:00 7 2014-10-07 00:16:00 8 2014-10-07 00:18:00 9 Freq: 2T, dtype: int64 Create a value counts column and reassign back to the DataFrame In [115]: df = pd.DataFrame({’Color’: ’Red Red Red Blue’.split(), .....: ’Value’: [100, 150, 50, 50]}); df .....: Out[115]: Color Value 0 Red 100 1 Red 150 2 Red 50 3 Blue 50 In [116]: df[’Counts’] = df.groupby([’Color’]).transform(len) In [117]: df Out[117]: Color Value Counts 0 Red 100 3 1 Red 150 3 2 Red 50 3 3 Blue 50 1
In [30]: i = pd.date_range(’20000101’,periods=10000) In [31]: df = pd.DataFrame(dict(year = i.year, month = i.month, day = i.day)) In [32]: df.head() Out[32]: day month year 0 1 1 2000 1 2 1 2000 2 3 1 2000 3 4 1 2000 4 5 1 2000 In [34]: ds = df.apply(lambda x: "%04d%02d%02d" % (x[’year’],x[’month’],x[’day’]),axis=1) In [35]: ds.head() Out[35]: 0 20000101 1 20000102 2 20000103 3 20000104 4 20000105 dtype: object
In [180]: def expand_grid(data_dict): .....: rows = itertools.product(*data_dict.values()) .....: return pd.DataFrame.from_records(rows, columns=data_dict.keys()) In [181]: df = expand_grid( .....: {’height’: [60, 70], .....: ’weight’: [100, 140, 180], .....: ’sex’: [’Male’, ’Female’]}) Out[182]: sex weight height 0 Male 100 60 1 Male 100 70 2 Male 140 60 3 Male 140 70 4 Male 180 60 5 Male 180 70 6 Female 100 60 7 Female 100 70 8 Female 140 60 9 Female 140 70 10 Female 180 60 11 Female 180 70
In [43]: DataFrame(data) Out[43]: A B C 0 1 2 Hello 1 2 3 World In [44]: DataFrame(data, index=[’first’, ’second’]) Out[44]: A B C first 1 2 Hello second 2 3 World In [45]: DataFrame(data, columns=[’C’, ’A’, ’B’]) Out[45]: C A B 0 Hello 1 2 1 World 2 3
相关文章推荐
- js动画2 多物体运动
- poj3735 Training little cats
- Android AIDL跨进程通讯
- 编写一个函数将数组中的数字前后颠倒,并输出到屏幕上来。
- Find Minimum in Rotated Sorted Array
- POJ 3304 Segment
- 实习第四天——安卓开发中遇到的问题
- 一致性哈希算法
- 转:我们是否应该把后端构建为API
- ubuntu中文论坛
- linux在下面APK反编译软件和过程的描述
- HDUOJ1091
- C语言用结构体模拟类的功能
- HDU 1735 字数统计
- [DFS] AOJ 0118 Property Distribution
- strstr函数的被查找字符串与查找到的字符串是互相影响的,同时可以只用i--控制循环不变
- 银联加密算法
- [c] HDOJ1053 哈夫曼树的应用
- 1015. Reversible Primes (20)
- GNU LIBC源码学习之strcmp