>>> df.iloc[3:5,0:2] A B2013-01-04 0.721555 -0.7067712013-01-05 -0.424972 0.567020>>> df.iloc[[1,2,4],[0,2]] A C2013-01-02 1.212112 0.1192092013-01-03 -0.861849 -0.4949292013-01-05 -0.424972 0.276232
関係演算子を使った指定
数値の大小で選択
>>> df[df.A > 0] A B C D2013-01-01 0.469112 -0.282863 -1.509059 -1.1356322013-01-02 1.212112 -0.173215 0.119209 -1.0442362013-01-04 0.721555 -0.706771 -1.039575 0.271860>>> df[df > 0] A B C D2013-01-01 0.469112 NaN NaN NaN2013-01-02 1.212112 NaN 0.119209 NaN2013-01-03 NaN NaN NaN 1.0718042013-01-04 0.721555 NaN NaN 0.2718602013-01-05 NaN 0.567020 0.276232 NaN2013-01-06 NaN 0.113648 NaN 0.524988
フィルタを使った指定
文字列でフィルタをかける(isin)。文字通りexcelのフィルタのような処理ができます。
>>> df2 = df.copy()>>> df2['E'] = ['one', 'one','two','three','four','three']>>> df2 A B C D E2013-01-01 0.469112 -0.282863 -1.509059 -1.135632 one2013-01-02 1.212112 -0.173215 0.119209 -1.044236 one2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 two2013-01-04 0.721555 -0.706771 -1.039575 0.271860 three2013-01-05 -0.424972 0.567020 0.276232 -1.087401 four2013-01-06 -0.673690 0.113648 -1.478427 0.524988 three#ここまでデータ準備>>> df2[df2['E'].isin(['two','four'])] A B C D E2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 two2013-01-05 -0.424972 0.567020 0.276232 -1.087401 four
>>> df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])>>> df1.loc[dates[0]:dates[1],'E'] = 1>>> df1 A B C D F E2013-01-01 0.000000 0.000000 -1.509059 5 NaN 1.02013-01-02 1.212112 -0.173215 0.119209 5 1.0 1.02013-01-03 -0.861849 -2.104569 -0.494929 5 2.0 NaN2013-01-04 0.721555 -0.706771 -1.039575 5 3.0 NaN#ここまでデータ準備>>> df1.dropna(how='any') A B C D F E2013-01-02 1.212112 -0.173215 0.119209 5 1.0 1.0
欠損値を指定値で埋めて出力
>>> df1.fillna(value=5) A B C D F E2013-01-01 0.000000 0.000000 -1.509059 5 5.0 1.02013-01-02 1.212112 -0.173215 0.119209 5 1.0 1.02013-01-03 -0.861849 -2.104569 -0.494929 5 2.0 5.02013-01-04 0.721555 -0.706771 -1.039575 5 3.0 5.0
欠損値であるかないかbooleanで判定
>>> pd.isna(df1) A B C D F E2013-01-01 False False False False True False2013-01-02 False False False False False False2013-01-03 False False False False False True2013-01-04 False False False False False True
>>> df.sub(s, axis='index') A B C D F2013-01-01 NaN NaN NaN NaN NaN2013-01-02 NaN NaN NaN NaN NaN2013-01-03 -1.861849 -3.104569 -1.494929 4.0 1.02013-01-04 -2.278445 -3.706771 -4.039575 2.0 0.02013-01-05 -5.424972 -4.432980 -4.723768 0.0 -1.02013-01-06 NaN NaN NaN NaN NaN
>>> left = pd.DataFrame({'key': ['foo', 'bar'], 'lval': [1, 2]})>>> right = pd.DataFrame({'key': ['foo', 'bar'], 'rval': [4, 5]})>>> left key lval0 foo 11 bar 2>>> right key rval0 foo 41 bar 5>>> pd.merge(left, right, on='key') key lval rval0 foo 1 41 bar 2 5
データのグループ化
データの集計
>>> df = pd.DataFrame({ 'A' : ['foo', 'bar', 'foo', 'bar','foo', 'bar', 'foo', 'foo'], 'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'], 'C' : np.random.randn(8), 'D' : np.random.randn(8)})>>> df A B C D0 foo one -1.202872 -0.0552241 bar one -1.814470 2.3959852 foo two 1.018601 1.5528253 bar three -0.595447 0.1665994 foo two 1.395433 0.0476095 bar two -0.392670 -0.1364736 foo one 0.007207 -0.5617577 foo three 1.928123 -1.623033#ここまでデータ準備>>> df.groupby('A').sum() #A列の文字列で数値をグループ化 C DA bar -2.802588 2.42611foo 3.146492 -0.63958>>> df.groupby(['A','B']).sum() #A、B列の両方の文字列で数値をグループ化 C DA B bar one -1.814470 2.395985 three -0.595447 0.166599 two -0.392670 -0.136473foo one -1.195665 -0.616981 three 1.928123 -1.623033 two 2.414034 1.600434
>>> tuples = list(zip(*[ ['bar', 'bar', 'baz', 'baz','foo', 'foo', 'qux', 'qux'], ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two'] ]))>>> index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])>>> df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])>>> df2 = df[:4]>>> df2 A Bfirst second bar one 0.029399 -0.542108 two 0.282696 -0.087302baz one -1.575170 1.771208 two 0.816482 1.100230>>> stacked = df2.stack()>>> stackedfirst second bar one A 0.029399 B -0.542108 two A 0.282696 B -0.087302baz one A -1.575170 B 1.771208 two A 0.816482 B 1.100230dtype: float64>>> stacked.unstack() A Bfirst second bar one 0.029399 -0.542108 two 0.282696 -0.087302baz one -1.575170 1.771208 two 0.816482 1.100230
>>> df = pd.DataFrame({ 'A' : ['one', 'one', 'two', 'three'] * 3, 'B' : ['A', 'B', 'C'] * 4, 'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2, 'D' : np.random.randn(12), 'E' : np.random.randn(12)})>>> df A B C D E0 one A foo 1.418757 -0.1796661 one B foo -1.879024 1.2918362 two C foo 0.536826 -0.0096143 three A bar 1.006160 0.3921494 one B bar -0.029716 0.2645995 one C bar -1.146178 -0.0574096 two A foo 0.100900 -1.4256387 three B foo -1.035018 1.0240988 one C foo 0.314665 -0.1060629 one A bar -0.773723 1.82437510 two B bar -1.170653 0.59597411 three C bar 0.648740 1.167115#ここまでデータ準備>>> pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])C bar fooA B one A -0.773723 1.418757 B -0.029716 -1.879024 C -1.146178 0.314665three A 1.006160 NaN B NaN -1.035018 C 0.648740 NaNtwo A NaN 0.100900 B -1.170653 NaN C NaN 0.536826