登录        注册
 

使用Pandas总结

后端 root 59浏览 0评论

title: 关于Pandas基本操作总结
date: 2016-08-11 13:20:15
tags:
  - 数据分析
  - 十分钟入门Pandas
categories: Pandas


In [1]:

%matplotlib inline

In [2]:

import pandas as pd

In [3]:

import numpy as np

In [4]:

import matplotlib.pyplot as plt

In [5]:

s = pd.Series([1,3,5,np.nan,6,8])

In [6]:

s

Out[6]:

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [7]:

dates = pd.date_range('20130101', periods=6)

In [8]:

dates

Out[8]:

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

In [10]:

df

Out[10]:

A B C D
2013-01-01 -1.234543 -0.181094 0.602767 0.024092
2013-01-02 -1.012344 -1.747663 -0.446165 -0.308272
2013-01-03 0.108684 -0.698606 0.294223 0.779937
2013-01-04 0.870050 -0.162659 -1.842476 -0.828424
2013-01-05 1.586294 -0.369570 0.887738 -0.067130
2013-01-06 0.603919 -0.292847 -0.351932 -0.546226

In [11]:

df2 = pd.DataFrame({'A' : 1., 
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo'}) 

In [12]:

df2

Out[12]:

A B C D E F
0 1.0 2013-01-02 1.0 3 test foo
1 1.0 2013-01-02 1.0 3 train foo
2 1.0 2013-01-02 1.0 3 test foo
3 1.0 2013-01-02 1.0 3 train foo

In [13]:

df2.dtypes

Out[13]:

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [14]:

df.head()

Out[14]:

A B C D
2013-01-01 -1.234543 -0.181094 0.602767 0.024092
2013-01-02 -1.012344 -1.747663 -0.446165 -0.308272
2013-01-03 0.108684 -0.698606 0.294223 0.779937
2013-01-04 0.870050 -0.162659 -1.842476 -0.828424
2013-01-05 1.586294 -0.369570 0.887738 -0.067130

In [15]:

df.tail(3)

Out[15]:

A B C D
2013-01-04 0.870050 -0.162659 -1.842476 -0.828424
2013-01-05 1.586294 -0.369570 0.887738 -0.067130
2013-01-06 0.603919 -0.292847 -0.351932 -0.546226

In [16]:

df.index

Out[16]:

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [17]:

df.columns

Out[17]:

Index([u'A', u'B', u'C', u'D'], dtype='object')

In [18]:

df.values

Out[18]:

array([[-1.23454325, -0.18109375,  0.60276697,  0.02409219],
       [-1.01234439, -1.74766336, -0.44616522, -0.30827222],
       [ 0.10868385, -0.69860582,  0.29422346,  0.77993654],
       [ 0.87005021, -0.16265896, -1.84247599, -0.82842427],
       [ 1.58629419, -0.36956975,  0.88773803, -0.06712956],
       [ 0.60391916, -0.29284674, -0.35193192, -0.54622645]])

In [19]:

df.describe()

Out[19]:

A B C D
count 6.000000 6.000000 6.000000 6.000000
mean 0.153677 -0.575406 -0.142641 -0.157671
std 1.100644 0.606204 0.983212 0.555681
min -1.234543 -1.747663 -1.842476 -0.828424
25% -0.732087 -0.616347 -0.422607 -0.486738
50% 0.356302 -0.331208 -0.028854 -0.187701
75% 0.803517 -0.209032 0.525631 0.001287
max 1.586294 -0.162659 0.887738 0.779937

In [20]:

df.T

Out[20]:

2013-01-01 00:00:00 2013-01-02 00:00:00 2013-01-03 00:00:00 2013-01-04 00:00:00 2013-01-05 00:00:00 2013-01-06 00:00:00
A -1.234543 -1.012344 0.108684 0.870050 1.586294 0.603919
B -0.181094 -1.747663 -0.698606 -0.162659 -0.369570 -0.292847
C 0.602767 -0.446165 0.294223 -1.842476 0.887738 -0.351932
D 0.024092 -0.308272 0.779937 -0.828424 -0.067130 -0.546226

In [21]:

df.sort_index(axis=1, ascending=False)

Out[21]:

D C B A
2013-01-01 0.024092 0.602767 -0.181094 -1.234543
2013-01-02 -0.308272 -0.446165 -1.747663 -1.012344
2013-01-03 0.779937 0.294223 -0.698606 0.108684
2013-01-04 -0.828424 -1.842476 -0.162659 0.870050
2013-01-05 -0.067130 0.887738 -0.369570 1.586294
2013-01-06 -0.546226 -0.351932 -0.292847 0.603919

In [22]:

df.sort_values(by='B')

Out[22]:

A B C D
2013-01-02 -1.012344 -1.747663 -0.446165 -0.308272
2013-01-03 0.108684 -0.698606 0.294223 0.779937
2013-01-05 1.586294 -0.369570 0.887738 -0.067130
2013-01-06 0.603919 -0.292847 -0.351932 -0.546226
2013-01-01 -1.234543 -0.181094 0.602767 0.024092
2013-01-04 0.870050 -0.162659 -1.842476 -0.828424

In [23]:

df['A']

Out[23]:

2013-01-01   -1.234543
2013-01-02   -1.012344
2013-01-03    0.108684
2013-01-04    0.870050
2013-01-05    1.586294
2013-01-06    0.603919
Freq: D, Name: A, dtype: float64

In [24]:

df[0:3]

Out[24]:

A B C D
2013-01-01 -1.234543 -0.181094 0.602767 0.024092
2013-01-02 -1.012344 -1.747663 -0.446165 -0.308272
2013-01-03 0.108684 -0.698606 0.294223 0.779937

In [25]:

df['20130102':'20130104']

Out[25]:

A B C D
2013-01-02 -1.012344 -1.747663 -0.446165 -0.308272
2013-01-03 0.108684 -0.698606 0.294223 0.779937
2013-01-04 0.870050 -0.162659 -1.842476 -0.828424

In [26]:

df.loc[dates[0]]

Out[26]:

A   -1.234543
B   -0.181094
C    0.602767
D    0.024092
Name: 2013-01-01 00:00:00, dtype: float64

In [27]:

df.loc[:,['A','B']]

Out[27]:

A B
2013-01-01 -1.234543 -0.181094
2013-01-02 -1.012344 -1.747663
2013-01-03 0.108684 -0.698606
2013-01-04 0.870050 -0.162659
2013-01-05 1.586294 -0.369570
2013-01-06 0.603919 -0.292847

In [28]:

df.loc['20130102':'20130104',['A','B']]

Out[28]:

A B
2013-01-02 -1.012344 -1.747663
2013-01-03 0.108684 -0.698606
2013-01-04 0.870050 -0.162659

In [29]:

df.loc['20130102',['A','B']]

Out[29]:

A   -1.012344
B   -1.747663
Name: 2013-01-02 00:00:00, dtype: float64

In [30]:

df.loc[dates[0],'A']

Out[30]:

-1.2345432480819052

In [31]:

df.at[dates[0],'A']

Out[31]:

-1.2345432480819052

In [32]:

df.iloc[3]

Out[32]:

A    0.870050
B   -0.162659
C   -1.842476
D   -0.828424
Name: 2013-01-04 00:00:00, dtype: float64

In [33]:

df.iloc[3:5,0:2]

Out[33]:

A B
2013-01-04 0.870050 -0.162659
2013-01-05 1.586294 -0.369570

In [34]:

df.iloc[[1,2,4],[0,2]]

Out[34]:

A C
2013-01-02 -1.012344 -0.446165
2013-01-03 0.108684 0.294223
2013-01-05 1.586294 0.887738

In [35]:

df.iloc[1:3,:]

Out[35]:

A B C D
2013-01-02 -1.012344 -1.747663 -0.446165 -0.308272
2013-01-03 0.108684 -0.698606 0.294223 0.779937

In [36]:

df.iloc[:,1:3]

Out[36]:

B C
2013-01-01 -0.181094 0.602767
2013-01-02 -1.747663 -0.446165
2013-01-03 -0.698606 0.294223
2013-01-04 -0.162659 -1.842476
2013-01-05 -0.369570 0.887738
2013-01-06 -0.292847 -0.351932

In [37]:

df.iloc[1,1]

Out[37]:

-1.7476633559261565

In [38]:

df.iat[1,1]

Out[38]:

-1.7476633559261565

In [39]:

df[df.A>0]

Out[39]:

A B C D
2013-01-03 0.108684 -0.698606 0.294223 0.779937
2013-01-04 0.870050 -0.162659 -1.842476 -0.828424
2013-01-05 1.586294 -0.369570 0.887738 -0.067130
2013-01-06 0.603919 -0.292847 -0.351932 -0.546226

In [40]:

df[df > 0]

Out[40]:

A B C D
2013-01-01 NaN NaN 0.602767 0.024092
2013-01-02 NaN NaN NaN NaN
2013-01-03 0.108684 NaN 0.294223 0.779937
2013-01-04 0.870050 NaN NaN NaN
2013-01-05 1.586294 NaN 0.887738 NaN
2013-01-06 0.603919 NaN NaN NaN

In [41]:

df2 = df.copy()

In [42]:

df2['E'] = ['one','one','two','three','four','three']

In [43]:

df2

Out[43]:

A B C D E
2013-01-01 -1.234543 -0.181094 0.602767 0.024092 one
2013-01-02 -1.012344 -1.747663 -0.446165 -0.308272 one
2013-01-03 0.108684 -0.698606 0.294223 0.779937 two
2013-01-04 0.870050 -0.162659 -1.842476 -0.828424 three
2013-01-05 1.586294 -0.369570 0.887738 -0.067130 four
2013-01-06 0.603919 -0.292847 -0.351932 -0.546226 three

In [44]:

df2[df2['E'].isin(['two','four'])]

Out[44]:

A B C D E
2013-01-03 0.108684 -0.698606 0.294223 0.779937 two
2013-01-05 1.586294 -0.369570 0.887738 -0.067130 four

In [45]:

s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))

In [46]:

s1

Out[46]:

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

In [47]:

df['F'] = s1

In [48]:

df.at[dates[0],'A'] = 0

In [49]:

df.iat[0,1] = 0

In [50]:

df.loc[:,'D'] = np.array([5] * len(df))

In [51]:

df

Out[51]:

A B C D F
2013-01-01 0.000000 0.000000 0.602767 5 NaN
2013-01-02 -1.012344 -1.747663 -0.446165 5 1.0
2013-01-03 0.108684 -0.698606 0.294223 5 2.0
2013-01-04 0.870050 -0.162659 -1.842476 5 3.0
2013-01-05 1.586294 -0.369570 0.887738 5 4.0
2013-01-06 0.603919 -0.292847 -0.351932 5 5.0

In [52]:

df2 = df.copy()

In [53]:

df2[df2 > 0] = -df2

In [54]:

df2

Out[54]:

A B C D F
2013-01-01 0.000000 0.000000 -0.602767 -5 NaN
2013-01-02 -1.012344 -1.747663 -0.446165 -5 -1.0
2013-01-03 -0.108684 -0.698606 -0.294223 -5 -2.0
2013-01-04 -0.870050 -0.162659 -1.842476 -5 -3.0
2013-01-05 -1.586294 -0.369570 -0.887738 -5 -4.0
2013-01-06 -0.603919 -0.292847 -0.351932 -5 -5.0

In [55]:

df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])

In [56]:

df1.loc[dates[0]:dates[1],'E'] = 1

In [57]:

df1

Out[57]:

A B C D F E
2013-01-01 0.000000 0.000000 0.602767 5 NaN 1.0
2013-01-02 -1.012344 -1.747663 -0.446165 5 1.0 1.0
2013-01-03 0.108684 -0.698606 0.294223 5 2.0 NaN
2013-01-04 0.870050 -0.162659 -1.842476 5 3.0 NaN

In [58]:

df1.dropna(how='any')

Out[58]:

A B C D F E
2013-01-02 -1.012344 -1.747663 -0.446165 5 1.0 1.0

In [59]:

df1.fillna(value=5)

Out[59]:

A B C D F E
2013-01-01 0.000000 0.000000 0.602767 5 5.0 1.0
2013-01-02 -1.012344 -1.747663 -0.446165 5 1.0 1.0
2013-01-03 0.108684 -0.698606 0.294223 5 2.0 5.0
2013-01-04 0.870050 -0.162659 -1.842476 5 3.0 5.0

In [60]:

pd.isnull(df1)

Out[60]:

A B C D F E
2013-01-01 False False False False True False
2013-01-02 False False False False False False
2013-01-03 False False False False False True
2013-01-04 False False False False False True

In [61]:

df.mean()

Out[61]:

A    0.359434
B   -0.545224
C   -0.142641
D    5.000000
F    3.000000
dtype: float64

In [62]:

df.mean(1)

Out[62]:

2013-01-01    1.400692
2013-01-02    0.558765
2013-01-03    1.340860
2013-01-04    1.372983
2013-01-05    2.220892
2013-01-06    1.991828
Freq: D, dtype: float64

In [63]:

s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)

In [64]:

s

Out[64]:

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [65]:

df.sub(s, axis='index')

Out[65]:

A B C D F
2013-01-01 NaN NaN NaN NaN NaN
2013-01-02 NaN NaN NaN NaN NaN
2013-01-03 -0.891316 -1.698606 -0.705777 4.0 1.0
2013-01-04 -2.129950 -3.162659 -4.842476 2.0 0.0
2013-01-05 -3.413706 -5.369570 -4.112262 0.0 -1.0
2013-01-06 NaN NaN NaN NaN NaN

In [66]:

df.apply(np.cumsum)

Out[66]:

A B C D F
2013-01-01 0.000000 0.000000 0.602767 5 NaN
2013-01-02 -1.012344 -1.747663 0.156602 10 1.0
2013-01-03 -0.903661 -2.446269 0.450825 15 3.0
2013-01-04 -0.033610 -2.608928 -1.391651 20 6.0
2013-01-05 1.552684 -2.978498 -0.503913 25 10.0
2013-01-06 2.156603 -3.271345 -0.855845 30 15.0

In [67]:

df.apply(lambda x: x.max() - x.min())

Out[67]:

A    2.598639
B    1.747663
C    2.730214
D    0.000000
F    4.000000
dtype: float64

In [68]:

s = pd.Series(np.random.randint(0, 7, size=10))

In [69]:

s

Out[69]:

0    1
1    3
2    1
3    6
4    6
5    6
6    2
7    6
8    1
9    2
dtype: int32

In [70]:

s.value_counts()

Out[70]:

6    4
1    3
2    2
3    1
dtype: int64

In [71]:

s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])

In [72]:

s.str.lower()

Out[72]:

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

In [73]:

df = pd.DataFrame(np.random.randn(10, 4))

In [74]:

df

Out[74]:

0 1 2 3
0 0.087190 0.378057 1.230033 -1.916034
1 1.845394 -0.493289 1.341151 -0.312285
2 -1.414530 0.567152 -1.266626 -0.824010
3 0.085407 1.743999 1.059148 0.967338
4 1.968624 -0.456669 -1.170885 0.144196
5 -0.095139 -0.307934 1.317358 -0.744105
6 -1.557237 -1.185375 0.840714 1.070566
7 0.049833 -0.967053 1.201201 -0.469980
8 0.782234 0.027085 0.573083 1.116846
9 1.293508 1.188724 -1.044702 -0.973416

In [75]:

pieces = [df[:3], df[3:7], df[7:]]

In [76]:

pd.concat(pieces)

Out[76]:

0 1 2 3
0 0.087190 0.378057 1.230033 -1.916034
1 1.845394 -0.493289 1.341151 -0.312285
2 -1.414530 0.567152 -1.266626 -0.824010
3 0.085407 1.743999 1.059148 0.967338
4 1.968624 -0.456669 -1.170885 0.144196
5 -0.095139 -0.307934 1.317358 -0.744105
6 -1.557237 -1.185375 0.840714 1.070566
7 0.049833 -0.967053 1.201201 -0.469980
8 0.782234 0.027085 0.573083 1.116846
9 1.293508 1.188724 -1.044702 -0.973416

In [77]:

left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})

In [78]:

right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})

In [79]:

left

Out[79]:

key lval
0 foo 1
1 foo 2

In [80]:

right

Out[80]:

key rval
0 foo 4
1 foo 5

In [81]:

pd.merge(left, right, on='key')

Out[81]:

key lval rval
0 foo 1 4
1 foo 1 5
2 foo 2 4
3 foo 2 5

In [82]:

df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])

In [83]:

df

Out[83]:

A B C D
0 -1.137633 1.190761 -1.589958 0.274725
1 -1.182860 -0.964490 0.701884 -0.209471
2 1.708852 -0.926927 -1.044458 -1.501952
3 0.956593 0.605129 0.929932 0.382138
4 0.443152 0.795685 -1.092606 0.384814
5 0.570427 0.134011 0.425411 0.672966
6 0.557559 -1.400583 1.328377 -1.237753
7 2.263220 0.883696 1.772972 -1.388512

In [84]:

s = df.iloc[3]

In [85]:

df.append(s, ignore_index=True)

Out[85]:

A B C D
0 -1.137633 1.190761 -1.589958 0.274725
1 -1.182860 -0.964490 0.701884 -0.209471
2 1.708852 -0.926927 -1.044458 -1.501952
3 0.956593 0.605129 0.929932 0.382138
4 0.443152 0.795685 -1.092606 0.384814
5 0.570427 0.134011 0.425411 0.672966
6 0.557559 -1.400583 1.328377 -1.237753
7 2.263220 0.883696 1.772972 -1.388512
8 0.956593 0.605129 0.929932 0.382138

In [86]:

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                           'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})

In [87]:

df

Out[87]:

A B C D
0 foo one 0.135346 -0.916001
1 bar one -0.907362 0.550130
2 foo two -1.758074 0.264998
3 bar three 0.153849 0.566296
4 foo two -1.033947 0.551321
5 bar two 0.494596 -0.859861
6 foo one 0.347631 0.106605
7 foo three 0.785117 -0.138477

In [88]:

df.groupby('A').sum()

Out[88]:

C D
A
bar -0.258916 0.256565
foo -1.523927 -0.131554

In [89]:

df.groupby(['A','B']).sum()

Out[89]:

C D
A B
bar one -0.907362 0.550130
three 0.153849 0.566296
two 0.494596 -0.859861
foo one 0.482977 -0.809396
three 0.785117 -0.138477
two -2.792021 0.816319

In [90]:

tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 
                     'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                      'one', 'two', 'one', 'two']]))

In [91]:

index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

In [92]:

df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])

In [93]:

df2 = df[:4]

In [94]:

df2

Out[94]:

A B
first second
bar one -0.723323 0.071984
two -1.068435 -0.485561
baz one 0.018726 -0.379056
two 2.051579 -0.789019

In [95]:

stacked = df2.stack()

In [96]:

stacked

Out[96]:

first  second   
bar    one     A   -0.723323
               B    0.071984
       two     A   -1.068435
               B   -0.485561
baz    one     A    0.018726
               B   -0.379056
       two     A    2.051579
               B   -0.789019
dtype: float64

In [97]:

stacked.unstack()

Out[97]:

A B
first second
bar one -0.723323 0.071984
two -1.068435 -0.485561
baz one 0.018726 -0.379056
two 2.051579 -0.789019

In [98]:

stacked.unstack(1)

Out[98]:

second one two
first
bar A -0.723323 -1.068435
B 0.071984 -0.485561
baz A 0.018726 2.051579
B -0.379056 -0.789019

In [99]:

stacked.unstack(0)

Out[99]:

first bar baz
second
one A -0.723323 0.018726
B 0.071984 -0.379056
two A -1.068435 2.051579
B -0.485561 -0.789019

In [100]:

df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})

In [101]:

df

Out[101]:

A B C D E
0 one A foo -0.435141 -0.012559
1 one B foo 1.419821 1.676063
2 two C foo -0.844238 -0.248897
3 three A bar 0.291280 1.768196
4 one B bar -0.559366 -1.223321
5 one C bar -1.111149 0.599756
6 two A foo -0.980251 0.168256
7 three B foo 0.596242 -0.515708
8 one C foo -0.445593 0.528068
9 one A bar 0.344460 -0.199001
10 two B bar -1.455970 -0.209612
11 three C bar 0.406563 0.340673

In [102]:

pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

Out[102]:

C bar foo
A B
one A 0.344460 -0.435141
B -0.559366 1.419821
C -1.111149 -0.445593
three A 0.291280 NaN
B NaN 0.596242
C 0.406563 NaN
two A NaN -0.980251
B -1.455970 NaN
C NaN -0.844238

In [103]:

rng = pd.date_range('1/1/2012', periods=100, freq='S')

In [104]:

ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

In [105]:

ts.resample('5Min').sum()

Out[105]:

2012-01-01    26177
Freq: 5T, dtype: int32

In [106]:

rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')

In [107]:

ts = pd.Series(np.random.randn(len(rng)), rng)

In [108]:

ts

Out[108]:

2012-03-06    1.964786
2012-03-07   -0.086269
2012-03-08    0.017538
2012-03-09   -0.128088
2012-03-10   -0.657546
Freq: D, dtype: float64

In [109]:

ts_utc = ts.tz_localize('UTC')

In [110]:

ts_utc

Out[110]:

2012-03-06 00:00:00+00:00    1.964786
2012-03-07 00:00:00+00:00   -0.086269
2012-03-08 00:00:00+00:00    0.017538
2012-03-09 00:00:00+00:00   -0.128088
2012-03-10 00:00:00+00:00   -0.657546
Freq: D, dtype: float64

In [111]:

ts_utc.tz_convert('US/Eastern')

Out[111]:

2012-03-05 19:00:00-05:00    1.964786
2012-03-06 19:00:00-05:00   -0.086269
2012-03-07 19:00:00-05:00    0.017538
2012-03-08 19:00:00-05:00   -0.128088
2012-03-09 19:00:00-05:00   -0.657546
Freq: D, dtype: float64

In [112]:

rng = pd.date_range('1/1/2012', periods=5, freq='M')

In [113]:

ts = pd.Series(np.random.randn(len(rng)), index=rng)

In [114]:

ts

Out[114]:

2012-01-31    0.558833
2012-02-29   -0.213529
2012-03-31   -0.934557
2012-04-30   -1.941257
2012-05-31   -0.705337
Freq: M, dtype: float64

In [115]:

ps = ts.to_period()

In [116]:

ps

Out[116]:

2012-01    0.558833
2012-02   -0.213529
2012-03   -0.934557
2012-04   -1.941257
2012-05   -0.705337
Freq: M, dtype: float64

In [117]:

ps.to_timestamp()

Out[117]:

2012-01-01    0.558833
2012-02-01   -0.213529
2012-03-01   -0.934557
2012-04-01   -1.941257
2012-05-01   -0.705337
Freq: MS, dtype: float64

In [118]:

prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')

In [119]:

ts = pd.Series(np.random.randn(len(prng)), prng)

In [120]:

ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9

In [121]:

ts.head()

Out[121]:

1990-03-01 09:00    0.612225
1990-06-01 09:00    0.125501
1990-09-01 09:00   -0.647144
1990-12-01 09:00    0.858475
1991-03-01 09:00    0.408605
Freq: H, dtype: float64

In [122]:

df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})

In [123]:

df["grade"] = df["raw_grade"].astype("category")

In [124]:

df["grade"]

Out[124]:

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]

In [125]:

df["grade"].cat.categories = ["very good", "good", "very bad"]

In [126]:

df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])

In [127]:

df["grade"]

Out[127]:

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]

In [128]:

df.sort_values(by="grade")

Out[128]:

id raw_grade grade
5 6 e very bad
1 2 b good
2 3 b good
0 1 a very good
3 4 a very good
4 5 a very good

In [129]:

df.groupby("grade").size()

Out[129]:

grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64

In [130]:

ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))

In [131]:

ts = ts.cumsum()

In [132]:

ts.plot()

Out[132]:

<matplotlib.axes._subplots.AxesSubplot at 0x8a81b00>

In [133]:

df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])

In [134]:

df = df.cumsum()

In [135]:

plt.figure(); df.plot(); plt.legend(loc='best')

Out[135]:

<matplotlib.legend.Legend at 0x8cebf98>

In [136]:

df.to_csv('foo.csv')

In [137]:

pd.read_csv('foo.csv')

Out[137]:

Unnamed: 0 A B C D
0 2000-01-01 -0.030936 0.895485 0.860234 0.436905
1 2000-01-02 0.462136 1.529305 1.465795 0.275320
2 2000-01-03 1.347457 2.146237 1.025922 1.807516
3 2000-01-04 1.762626 1.617238 2.269617 2.750168
4 2000-01-05 0.752935 -0.573293 2.496633 2.555544
5 2000-01-06 0.559531 -1.075933 3.604460 2.045109
6 2000-01-07 2.864227 2.116665 4.646330 0.552505
7 2000-01-08 2.790815 1.112905 6.876355 0.842887
8 2000-01-09 2.781517 -0.038389 7.562739 -0.677093
9 2000-01-10 3.277815 0.783239 8.625772 -1.210071
10 2000-01-11 2.671833 1.628914 9.013839 -0.878836
11 2000-01-12 3.358392 1.286629 7.869067 0.927850
12 2000-01-13 2.934755 1.281981 8.026394 0.944987
13 2000-01-14 3.855528 1.044585 7.611520 -0.030036
14 2000-01-15 3.197158 1.342844 7.764532 0.003429
15 2000-01-16 3.397053 -0.159531 8.506764 0.788280
16 2000-01-17 2.604527 0.237801 9.856556 1.040767
17 2000-01-18 5.084828 0.499864 9.658846 2.005989
18 2000-01-19 4.106095 -1.350089 9.143522 1.686732
19 2000-01-20 4.175043 -1.190392 8.058163 0.787240
20 2000-01-21 5.185731 -0.777054 8.812265 0.419670
21 2000-01-22 4.371896 0.233903 8.661171 0.965343
22 2000-01-23 4.828027 -1.879995 9.411272 0.913708
23 2000-01-24 5.439256 -1.489195 9.325441 -0.105687
24 2000-01-25 4.692312 -2.057943 11.866773 0.281276
25 2000-01-26 3.076197 -1.366478 13.091266 1.299899
26 2000-01-27 3.061897 -1.074740 12.495540 2.413354
27 2000-01-28 3.331420 -1.146924 12.470048 4.507037
28 2000-01-29 2.539132 -2.027100 13.117951 4.890443
29 2000-01-30 3.063721 -0.814003 13.669964 4.517990
... ... ... ... ... ...
970 2002-08-28 0.284504 -22.271339 15.981331 15.047098
971 2002-08-29 1.179390 -21.201687 15.868212 14.459094
972 2002-08-30 -1.064233 -22.113170 15.675313 14.426811
973 2002-08-31 -0.204647 -21.770136 16.291319 16.018532
974 2002-09-01 1.565074 -22.972256 16.085625 17.164091
975 2002-09-02 1.594709 -22.544796 15.194863 16.747902
976 2002-09-03 2.058587 -23.616482 16.108197 15.722705
977 2002-09-04 2.204938 -24.498805 17.115751 16.182108
978 2002-09-05 2.544514 -25.655447 17.757754 13.716535
979 2002-09-06 0.042641 -26.231512 16.933801 14.481075
980 2002-09-07 0.151732 -23.564705 18.606103 15.288614
981 2002-09-08 -1.103293 -22.145037 19.017814 14.003808
982 2002-09-09 -0.886503 -23.226622 18.262366 14.134913
983 2002-09-10 -0.460123 -24.301707 18.189121 12.835480
984 2002-09-11 -0.828844 -25.875458 19.843507 12.759591
985 2002-09-12 -0.517894 -25.686486 21.261322 11.394933
986 2002-09-13 0.184588 -25.469125 21.141841 11.325127
987 2002-09-14 1.841196 -23.770084 21.204105 11.307507
988 2002-09-15 3.223569 -23.631714 20.283344 11.293488
989 2002-09-16 3.391174 -22.155609 19.469558 11.086509
990 2002-09-17 2.899235 -21.962757 20.295803 10.123460
991 2002-09-18 1.712088 -20.568953 21.282493 10.251396
992 2002-09-19 3.281950 -21.733617 20.914229 8.739577
993 2002-09-20 2.324456 -21.464629 20.747934 8.104013
994 2002-09-21 3.534397 -21.065582 18.900106 9.929498
995 2002-09-22 2.547043 -21.121919 19.291717 9.657817
996 2002-09-23 3.967868 -20.993894 19.146004 8.970358
997 2002-09-24 3.223028 -21.249086 19.858325 9.268188
998 2002-09-25 3.895291 -21.572320 21.571366 10.031764
999 2002-09-26 3.295146 -20.708933 22.339290 10.840932

1000 rows × 5 columns

In [138]:

df.to_hdf('foo.h5','df')

In [139]:

pd.read_hdf('foo.h5','df')

Out[139]:

A B C D
2000-01-01 -0.030936 0.895485 0.860234 0.436905
2000-01-02 0.462136 1.529305 1.465795 0.275320
2000-01-03 1.347457 2.146237 1.025922 1.807516
2000-01-04 1.762626 1.617238 2.269617 2.750168
2000-01-05 0.752935 -0.573293 2.496633 2.555544
2000-01-06 0.559531 -1.075933 3.604460 2.045109
2000-01-07 2.864227 2.116665 4.646330 0.552505
2000-01-08 2.790815 1.112905 6.876355 0.842887
2000-01-09 2.781517 -0.038389 7.562739 -0.677093
2000-01-10 3.277815 0.783239 8.625772 -1.210071
2000-01-11 2.671833 1.628914 9.013839 -0.878836
2000-01-12 3.358392 1.286629 7.869067 0.927850
2000-01-13 2.934755 1.281981 8.026394 0.944987
2000-01-14 3.855528 1.044585 7.611520 -0.030036
2000-01-15 3.197158 1.342844 7.764532 0.003429
2000-01-16 3.397053 -0.159531 8.506764 0.788280
2000-01-17 2.604527 0.237801 9.856556 1.040767
2000-01-18 5.084828 0.499864 9.658846 2.005989
2000-01-19 4.106095 -1.350089 9.143522 1.686732
2000-01-20 4.175043 -1.190392 8.058163 0.787240
2000-01-21 5.185731 -0.777054 8.812265 0.419670
2000-01-22 4.371896 0.233903 8.661171 0.965343
2000-01-23 4.828027 -1.879995 9.411272 0.913708
2000-01-24 5.439256 -1.489195 9.325441 -0.105687
2000-01-25 4.692312 -2.057943 11.866773 0.281276
2000-01-26 3.076197 -1.366478 13.091266 1.299899
2000-01-27 3.061897 -1.074740 12.495540 2.413354
2000-01-28 3.331420 -1.146924 12.470048 4.507037
2000-01-29 2.539132 -2.027100 13.117951 4.890443
2000-01-30 3.063721 -0.814003 13.669964 4.517990
... ... ... ... ...
2002-08-28 0.284504 -22.271339 15.981331 15.047098
2002-08-29 1.179390 -21.201687 15.868212 14.459094
2002-08-30 -1.064233 -22.113170 15.675313 14.426811
2002-08-31 -0.204647 -21.770136 16.291319 16.018532
2002-09-01 1.565074 -22.972256 16.085625 17.164091
2002-09-02 1.594709 -22.544796 15.194863 16.747902
2002-09-03 2.058587 -23.616482 16.108197 15.722705
2002-09-04 2.204938 -24.498805 17.115751 16.182108
2002-09-05 2.544514 -25.655447 17.757754 13.716535
2002-09-06 0.042641 -26.231512 16.933801 14.481075
2002-09-07 0.151732 -23.564705 18.606103 15.288614
2002-09-08 -1.103293 -22.145037 19.017814 14.003808
2002-09-09 -0.886503 -23.226622 18.262366 14.134913
2002-09-10 -0.460123 -24.301707 18.189121 12.835480
2002-09-11 -0.828844 -25.875458 19.843507 12.759591
2002-09-12 -0.517894 -25.686486 21.261322 11.394933
2002-09-13 0.184588 -25.469125 21.141841 11.325127
2002-09-14 1.841196 -23.770084 21.204105 11.307507
2002-09-15 3.223569 -23.631714 20.283344 11.293488
2002-09-16 3.391174 -22.155609 19.469558 11.086509
2002-09-17 2.899235 -21.962757 20.295803 10.123460
2002-09-18 1.712088 -20.568953 21.282493 10.251396
2002-09-19 3.281950 -21.733617 20.914229 8.739577
2002-09-20 2.324456 -21.464629 20.747934 8.104013
2002-09-21 3.534397 -21.065582 18.900106 9.929498
2002-09-22 2.547043 -21.121919 19.291717 9.657817
2002-09-23 3.967868 -20.993894 19.146004 8.970358
2002-09-24 3.223028 -21.249086 19.858325 9.268188
2002-09-25 3.895291 -21.572320 21.571366 10.031764
2002-09-26 3.295146 -20.708933 22.339290 10.840932

1000 rows × 4 columns

In [140]:

df.to_excel('foo.xlsx', sheet_name='Sheet1')

In [141]:

pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])

Out[141]:

A B C D
2000-01-01 -0.030936 0.895485 0.860234 0.436905
2000-01-02 0.462136 1.529305 1.465795 0.275320
2000-01-03 1.347457 2.146237 1.025922 1.807516
2000-01-04 1.762626 1.617238 2.269617 2.750168
2000-01-05 0.752935 -0.573293 2.496633 2.555544
2000-01-06 0.559531 -1.075933 3.604460 2.045109
2000-01-07 2.864227 2.116665 4.646330 0.552505
2000-01-08 2.790815 1.112905 6.876355 0.842887
2000-01-09 2.781517 -0.038389 7.562739 -0.677093
2000-01-10 3.277815 0.783239 8.625772 -1.210071
2000-01-11 2.671833 1.628914 9.013839 -0.878836
2000-01-12 3.358392 1.286629 7.869067 0.927850
2000-01-13 2.934755 1.281981 8.026394 0.944987
2000-01-14 3.855528 1.044585 7.611520 -0.030036
2000-01-15 3.197158 1.342844 7.764532 0.003429
2000-01-16 3.397053 -0.159531 8.506764 0.788280
2000-01-17 2.604527 0.237801 9.856556 1.040767
2000-01-18 5.084828 0.499864 9.658846 2.005989
2000-01-19 4.106095 -1.350089 9.143522 1.686732
2000-01-20 4.175043 -1.190392 8.058163 0.787240
2000-01-21 5.185731 -0.777054 8.812265 0.419670
2000-01-22 4.371896 0.233903 8.661171 0.965343
2000-01-23 4.828027 -1.879995 9.411272 0.913708
2000-01-24 5.439256 -1.489195 9.325441 -0.105687
2000-01-25 4.692312 -2.057943 11.866773 0.281276
2000-01-26 3.076197 -1.366478 13.091266 1.299899
2000-01-27 3.061897 -1.074740 12.495540 2.413354
2000-01-28 3.331420 -1.146924 12.470048 4.507037
2000-01-29 2.539132 -2.027100 13.117951 4.890443
2000-01-30 3.063721 -0.814003 13.669964 4.517990
... ... ... ... ...
2002-08-28 0.284504 -22.271339 15.981331 15.047098
2002-08-29 1.179390 -21.201687 15.868212 14.459094
2002-08-30 -1.064233 -22.113170 15.675313 14.426811
2002-08-31 -0.204647 -21.770136 16.291319 16.018532
2002-09-01 1.565074 -22.972256 16.085625 17.164091
2002-09-02 1.594709 -22.544796 15.194863 16.747902
2002-09-03 2.058587 -23.616482 16.108197 15.722705
2002-09-04 2.204938 -24.498805 17.115751 16.182108
2002-09-05 2.544514 -25.655447 17.757754 13.716535
2002-09-06 0.042641 -26.231512 16.933801 14.481075
2002-09-07 0.151732 -23.564705 18.606103 15.288614
2002-09-08 -1.103293 -22.145037 19.017814 14.003808
2002-09-09 -0.886503 -23.226622 18.262366 14.134913
2002-09-10 -0.460123 -24.301707 18.189121 12.835480
2002-09-11 -0.828844 -25.875458 19.843507 12.759591
2002-09-12 -0.517894 -25.686486 21.261322 11.394933
2002-09-13 0.184588 -25.469125 21.141841 11.325127
2002-09-14 1.841196 -23.770084 21.204105 11.307507
2002-09-15 3.223569 -23.631714 20.283344 11.293488
2002-09-16 3.391174 -22.155609 19.469558 11.086509
2002-09-17 2.899235 -21.962757 20.295803 10.123460
2002-09-18 1.712088 -20.568953 21.282493 10.251396
2002-09-19 3.281950 -21.733617 20.914229 8.739577
2002-09-20 2.324456 -21.464629 20.747934 8.104013
2002-09-21 3.534397 -21.065582 18.900106 9.929498
2002-09-22 2.547043 -21.121919 19.291717 9.657817
2002-09-23 3.967868 -20.993894 19.146004 8.970358
2002-09-24 3.223028 -21.249086 19.858325 9.268188
2002-09-25 3.895291 -21.572320 21.571366 10.031764
2002-09-26 3.295146 -20.708933 22.339290 10.840932

1000 rows × 4 columns

转载请注明: 三木先生 » 使用Pandas总结

喜欢 (2) or 分享 (0)

您的支持是博主写作最大的动力,如果您喜欢我的文章,感觉我的文章对您有帮助,请狠狠点击下面的“喜欢”按钮。

发表我的评论

表情

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址
(0)个小伙伴在吐槽