%matplotlib inline

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

s = pd.Series([1,3,5,np.nan,6,8])

s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

dates = pd.date_range('20130101', periods=6)

dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

df

df2 = pd.DataFrame({'A' : 1., 
                    'B' : pd.Timestamp('20130102'),
                    'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D' : np.array([3] * 4,dtype='int32'),
                    'E' : pd.Categorical(["test","train","test","train"]),
                    'F' : 'foo'})

df2

df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

df.head()

df.tail(3)

df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

df.columns

Index([u'A', u'B', u'C', u'D'], dtype='object')

df.values

array([[-1.23454325, -0.18109375,  0.60276697,  0.02409219],
       [-1.01234439, -1.74766336, -0.44616522, -0.30827222],
       [ 0.10868385, -0.69860582,  0.29422346,  0.77993654],
       [ 0.87005021, -0.16265896, -1.84247599, -0.82842427],
       [ 1.58629419, -0.36956975,  0.88773803, -0.06712956],
       [ 0.60391916, -0.29284674, -0.35193192, -0.54622645]])

df.describe()

df.T

df.sort_index(axis=1, ascending=False)

df.sort_values(by='B')

df['A']

2013-01-01   -1.234543
2013-01-02   -1.012344
2013-01-03    0.108684
2013-01-04    0.870050
2013-01-05    1.586294
2013-01-06    0.603919
Freq: D, Name: A, dtype: float64

df[0:3]

df['20130102':'20130104']

df.loc[dates[0]]

A   -1.234543
B   -0.181094
C    0.602767
D    0.024092
Name: 2013-01-01 00:00:00, dtype: float64

df.loc[:,['A','B']]

df.loc['20130102':'20130104',['A','B']]

df.loc['20130102',['A','B']]

A   -1.012344
B   -1.747663
Name: 2013-01-02 00:00:00, dtype: float64

df.loc[dates[0],'A']

-1.2345432480819052

df.at[dates[0],'A']

-1.2345432480819052

df.iloc[3]

A    0.870050
B   -0.162659
C   -1.842476
D   -0.828424
Name: 2013-01-04 00:00:00, dtype: float64

df.iloc[3:5,0:2]

df.iloc[[1,2,4],[0,2]]

df.iloc[1:3,:]

df.iloc[:,1:3]

df.iloc[1,1]

-1.7476633559261565

df.iat[1,1]

-1.7476633559261565

df[df.A>0]

df[df > 0]

df2 = df.copy()

df2['E'] = ['one','one','two','three','four','three']

df2

df2[df2['E'].isin(['two','four'])]

s1 = pd.Series([1,2,3,4,5,6], index=pd.date_range('20130102', periods=6))

s1

2013-01-02    1
2013-01-03    2
2013-01-04    3
2013-01-05    4
2013-01-06    5
2013-01-07    6
Freq: D, dtype: int64

df['F'] = s1

df.at[dates[0],'A'] = 0

df.iat[0,1] = 0

df.loc[:,'D'] = np.array([5] * len(df))

df

df2 = df.copy()

df2[df2 > 0] = -df2

df2

df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E'])

df1.loc[dates[0]:dates[1],'E'] = 1

df1

df1.dropna(how='any')

df1.fillna(value=5)

pd.isnull(df1)

df.mean()

A    0.359434
B   -0.545224
C   -0.142641
D    5.000000
F    3.000000
dtype: float64

df.mean(1)

2013-01-01    1.400692
2013-01-02    0.558765
2013-01-03    1.340860
2013-01-04    1.372983
2013-01-05    2.220892
2013-01-06    1.991828
Freq: D, dtype: float64

s = pd.Series([1,3,5,np.nan,6,8], index=dates).shift(2)

s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

df.sub(s, axis='index')

df.apply(np.cumsum)

df.apply(lambda x: x.max() - x.min())

A    2.598639
B    1.747663
C    2.730214
D    0.000000
F    4.000000
dtype: float64

s = pd.Series(np.random.randint(0, 7, size=10))

s

0    1
1    3
2    1
3    6
4    6
5    6
6    2
7    6
8    1
9    2
dtype: int32

s.value_counts()

6    4
1    3
2    2
3    1
dtype: int64

s = pd.Series(['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat'])

s.str.lower()

0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

df = pd.DataFrame(np.random.randn(10, 4))

df

pieces = [df[:3], df[3:7], df[7:]]

pd.concat(pieces)

left = pd.DataFrame({'key': ['foo', 'foo'], 'lval': [1, 2]})

right = pd.DataFrame({'key': ['foo', 'foo'], 'rval': [4, 5]})

left

right

pd.merge(left, right, on='key')

df = pd.DataFrame(np.random.randn(8, 4), columns=['A','B','C','D'])

df

s = df.iloc[3]

df.append(s, ignore_index=True)

df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
                          'foo', 'bar', 'foo', 'foo'],
                   'B' : ['one', 'one', 'two', 'three',
                           'two', 'two', 'one', 'three'],
                   'C' : np.random.randn(8),
                   'D' : np.random.randn(8)})

df

df.groupby('A').sum()

df.groupby(['A','B']).sum()

tuples = list(zip(*[['bar', 'bar', 'baz', 'baz', 
                     'foo', 'foo', 'qux', 'qux'],
                    ['one', 'two', 'one', 'two',
                      'one', 'two', 'one', 'two']]))

index = pd.MultiIndex.from_tuples(tuples, names=['first', 'second'])

df = pd.DataFrame(np.random.randn(8, 2), index=index, columns=['A', 'B'])

df2 = df[:4]

df2

stacked = df2.stack()

stacked

first  second   
bar    one     A   -0.723323
               B    0.071984
       two     A   -1.068435
               B   -0.485561
baz    one     A    0.018726
               B   -0.379056
       two     A    2.051579
               B   -0.789019
dtype: float64

stacked.unstack()

stacked.unstack(1)

stacked.unstack(0)

df = pd.DataFrame({'A' : ['one', 'one', 'two', 'three'] * 3,
                   'B' : ['A', 'B', 'C'] * 4,
                   'C' : ['foo', 'foo', 'foo', 'bar', 'bar', 'bar'] * 2,
                   'D' : np.random.randn(12),
                   'E' : np.random.randn(12)})

df

pd.pivot_table(df, values='D', index=['A', 'B'], columns=['C'])

rng = pd.date_range('1/1/2012', periods=100, freq='S')

ts = pd.Series(np.random.randint(0, 500, len(rng)), index=rng)

ts.resample('5Min').sum()

2012-01-01    26177
Freq: 5T, dtype: int32

rng = pd.date_range('3/6/2012 00:00', periods=5, freq='D')

ts = pd.Series(np.random.randn(len(rng)), rng)

ts

2012-03-06    1.964786
2012-03-07   -0.086269
2012-03-08    0.017538
2012-03-09   -0.128088
2012-03-10   -0.657546
Freq: D, dtype: float64

ts_utc = ts.tz_localize('UTC')

ts_utc

2012-03-06 00:00:00+00:00    1.964786
2012-03-07 00:00:00+00:00   -0.086269
2012-03-08 00:00:00+00:00    0.017538
2012-03-09 00:00:00+00:00   -0.128088
2012-03-10 00:00:00+00:00   -0.657546
Freq: D, dtype: float64

ts_utc.tz_convert('US/Eastern')

2012-03-05 19:00:00-05:00    1.964786
2012-03-06 19:00:00-05:00   -0.086269
2012-03-07 19:00:00-05:00    0.017538
2012-03-08 19:00:00-05:00   -0.128088
2012-03-09 19:00:00-05:00   -0.657546
Freq: D, dtype: float64

rng = pd.date_range('1/1/2012', periods=5, freq='M')

ts = pd.Series(np.random.randn(len(rng)), index=rng)

ts

2012-01-31    0.558833
2012-02-29   -0.213529
2012-03-31   -0.934557
2012-04-30   -1.941257
2012-05-31   -0.705337
Freq: M, dtype: float64

ps = ts.to_period()

ps

2012-01    0.558833
2012-02   -0.213529
2012-03   -0.934557
2012-04   -1.941257
2012-05   -0.705337
Freq: M, dtype: float64

ps.to_timestamp()

2012-01-01    0.558833
2012-02-01   -0.213529
2012-03-01   -0.934557
2012-04-01   -1.941257
2012-05-01   -0.705337
Freq: MS, dtype: float64

prng = pd.period_range('1990Q1', '2000Q4', freq='Q-NOV')

ts = pd.Series(np.random.randn(len(prng)), prng)

ts.index = (prng.asfreq('M', 'e') + 1).asfreq('H', 's') + 9

ts.head()

1990-03-01 09:00    0.612225
1990-06-01 09:00    0.125501
1990-09-01 09:00   -0.647144
1990-12-01 09:00    0.858475
1991-03-01 09:00    0.408605
Freq: H, dtype: float64

df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})

df["grade"] = df["raw_grade"].astype("category")

df["grade"]

0    a
1    b
2    b
3    a
4    a
5    e
Name: grade, dtype: category
Categories (3, object): [a, b, e]

df["grade"].cat.categories = ["very good", "good", "very bad"]

df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"])

df["grade"]

0    very good
1         good
2         good
3    very good
4    very good
5     very bad
Name: grade, dtype: category
Categories (5, object): [very bad, bad, medium, good, very good]

df.sort_values(by="grade")

df.groupby("grade").size()

grade
very bad     1
bad          0
medium       0
good         2
very good    3
dtype: int64

ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))

ts = ts.cumsum()

ts.plot()

<matplotlib.axes._subplots.AxesSubplot at 0x8a81b00>

df = pd.DataFrame(np.random.randn(1000, 4), index=ts.index,
                  columns=['A', 'B', 'C', 'D'])

df = df.cumsum()

plt.figure(); df.plot(); plt.legend(loc='best')

<matplotlib.legend.Legend at 0x8cebf98>

<matplotlib.figure.Figure at 0x8d0d208>

df.to_csv('foo.csv')

pd.read_csv('foo.csv')

df.to_hdf('foo.h5','df')

pd.read_hdf('foo.h5','df')

df.to_excel('foo.xlsx', sheet_name='Sheet1')

pd.read_excel('foo.xlsx', 'Sheet1', index_col=None, na_values=['NA'])

	A	B	C	D
count	6.000000	6.000000	6.000000	6.000000
mean	0.153677	-0.575406	-0.142641	-0.157671
std	1.100644	0.606204	0.983212	0.555681
min	-1.234543	-1.747663	-1.842476	-0.828424
25%	-0.732087	-0.616347	-0.422607	-0.486738
50%	0.356302	-0.331208	-0.028854	-0.187701
75%	0.803517	-0.209032	0.525631	0.001287
max	1.586294	-0.162659	0.887738	0.779937

	A	B	C	D	F
2013-01-01	0.000000	0.000000	-0.602767	-5	NaN
2013-01-02	-1.012344	-1.747663	-0.446165	-5	-1.0
2013-01-03	-0.108684	-0.698606	-0.294223	-5	-2.0
2013-01-04	-0.870050	-0.162659	-1.842476	-5	-3.0
2013-01-05	-1.586294	-0.369570	-0.887738	-5	-4.0
2013-01-06	-0.603919	-0.292847	-0.351932	-5	-5.0

	0	1	2	3
0	0.087190	0.378057	1.230033	-1.916034
1	1.845394	-0.493289	1.341151	-0.312285
2	-1.414530	0.567152	-1.266626	-0.824010
3	0.085407	1.743999	1.059148	0.967338
4	1.968624	-0.456669	-1.170885	0.144196
5	-0.095139	-0.307934	1.317358	-0.744105
6	-1.557237	-1.185375	0.840714	1.070566
7	0.049833	-0.967053	1.201201	-0.469980
8	0.782234	0.027085	0.573083	1.116846
9	1.293508	1.188724	-1.044702	-0.973416

	0	1	2	3
0	0.087190	0.378057	1.230033	-1.916034
1	1.845394	-0.493289	1.341151	-0.312285
2	-1.414530	0.567152	-1.266626	-0.824010
3	0.085407	1.743999	1.059148	0.967338
4	1.968624	-0.456669	-1.170885	0.144196
5	-0.095139	-0.307934	1.317358	-0.744105
6	-1.557237	-1.185375	0.840714	1.070566
7	0.049833	-0.967053	1.201201	-0.469980
8	0.782234	0.027085	0.573083	1.116846
9	1.293508	1.188724	-1.044702	-0.973416

	A	B	C	D
0	-1.137633	1.190761	-1.589958	0.274725
1	-1.182860	-0.964490	0.701884	-0.209471
2	1.708852	-0.926927	-1.044458	-1.501952
3	0.956593	0.605129	0.929932	0.382138
4	0.443152	0.795685	-1.092606	0.384814
5	0.570427	0.134011	0.425411	0.672966
6	0.557559	-1.400583	1.328377	-1.237753
7	2.263220	0.883696	1.772972	-1.388512

	A	B	C	D
2013-01-01	-1.234543	-0.181094	0.602767	0.024092
2013-01-02	-1.012344	-1.747663	-0.446165	-0.308272
2013-01-03	0.108684	-0.698606	0.294223	0.779937
2013-01-04	0.870050	-0.162659	-1.842476	-0.828424
2013-01-05	1.586294	-0.369570	0.887738	-0.067130
2013-01-06	0.603919	-0.292847	-0.351932	-0.546226

	A	B	C	D	E	F
0	1.0	2013-01-02	1.0	3	test	foo
1	1.0	2013-01-02	1.0	3	train	foo
2	1.0	2013-01-02	1.0	3	test	foo
3	1.0	2013-01-02	1.0	3	train	foo

	A	B	C	D	F	E
2013-01-01	False	False	False	False	True	False
2013-01-02	False	False	False	False	False	False
2013-01-03	False	False	False	False	False	True
2013-01-04	False	False	False	False	False	True

	A	B	C	D
0	foo	one	0.135346	-0.916001
1	bar	one	-0.907362	0.550130
2	foo	two	-1.758074	0.264998
3	bar	three	0.153849	0.566296
4	foo	two	-1.033947	0.551321
5	bar	two	0.494596	-0.859861
6	foo	one	0.347631	0.106605
7	foo	three	0.785117	-0.138477

		A	B
first	second
bar	one	-0.723323	0.071984
bar	two	-1.068435	-0.485561
baz	one	0.018726	-0.379056
baz	two	2.051579	-0.789019

	A	B	C	D	E
0	one	A	foo	-0.435141	-0.012559
1	one	B	foo	1.419821	1.676063
2	two	C	foo	-0.844238	-0.248897
3	three	A	bar	0.291280	1.768196
4	one	B	bar	-0.559366	-1.223321
5	one	C	bar	-1.111149	0.599756
6	two	A	foo	-0.980251	0.168256
7	three	B	foo	0.596242	-0.515708
8	one	C	foo	-0.445593	0.528068
9	one	A	bar	0.344460	-0.199001
10	two	B	bar	-1.455970	-0.209612
11	three	C	bar	0.406563	0.340673

	id	raw_grade	grade
5	6	e	very bad
1	2	b	good
2	3	b	good
0	1	a	very good
3	4	a	very good
4	5	a	very good

	Unnamed: 0	A	B	C	D
0	2000-01-01	-0.030936	0.895485	0.860234	0.436905
1	2000-01-02	0.462136	1.529305	1.465795	0.275320
2	2000-01-03	1.347457	2.146237	1.025922	1.807516
3	2000-01-04	1.762626	1.617238	2.269617	2.750168
4	2000-01-05	0.752935	-0.573293	2.496633	2.555544
5	2000-01-06	0.559531	-1.075933	3.604460	2.045109
6	2000-01-07	2.864227	2.116665	4.646330	0.552505
7	2000-01-08	2.790815	1.112905	6.876355	0.842887
8	2000-01-09	2.781517	-0.038389	7.562739	-0.677093
9	2000-01-10	3.277815	0.783239	8.625772	-1.210071
10	2000-01-11	2.671833	1.628914	9.013839	-0.878836
11	2000-01-12	3.358392	1.286629	7.869067	0.927850
12	2000-01-13	2.934755	1.281981	8.026394	0.944987
13	2000-01-14	3.855528	1.044585	7.611520	-0.030036
14	2000-01-15	3.197158	1.342844	7.764532	0.003429
15	2000-01-16	3.397053	-0.159531	8.506764	0.788280
16	2000-01-17	2.604527	0.237801	9.856556	1.040767
17	2000-01-18	5.084828	0.499864	9.658846	2.005989
18	2000-01-19	4.106095	-1.350089	9.143522	1.686732
19	2000-01-20	4.175043	-1.190392	8.058163	0.787240
20	2000-01-21	5.185731	-0.777054	8.812265	0.419670
21	2000-01-22	4.371896	0.233903	8.661171	0.965343
22	2000-01-23	4.828027	-1.879995	9.411272	0.913708
23	2000-01-24	5.439256	-1.489195	9.325441	-0.105687
24	2000-01-25	4.692312	-2.057943	11.866773	0.281276
25	2000-01-26	3.076197	-1.366478	13.091266	1.299899
26	2000-01-27	3.061897	-1.074740	12.495540	2.413354
27	2000-01-28	3.331420	-1.146924	12.470048	4.507037
28	2000-01-29	2.539132	-2.027100	13.117951	4.890443
29	2000-01-30	3.063721	-0.814003	13.669964	4.517990
...	...	...	...	...	...
970	2002-08-28	0.284504	-22.271339	15.981331	15.047098
971	2002-08-29	1.179390	-21.201687	15.868212	14.459094
972	2002-08-30	-1.064233	-22.113170	15.675313	14.426811
973	2002-08-31	-0.204647	-21.770136	16.291319	16.018532
974	2002-09-01	1.565074	-22.972256	16.085625	17.164091
975	2002-09-02	1.594709	-22.544796	15.194863	16.747902
976	2002-09-03	2.058587	-23.616482	16.108197	15.722705
977	2002-09-04	2.204938	-24.498805	17.115751	16.182108
978	2002-09-05	2.544514	-25.655447	17.757754	13.716535
979	2002-09-06	0.042641	-26.231512	16.933801	14.481075
980	2002-09-07	0.151732	-23.564705	18.606103	15.288614
981	2002-09-08	-1.103293	-22.145037	19.017814	14.003808
982	2002-09-09	-0.886503	-23.226622	18.262366	14.134913
983	2002-09-10	-0.460123	-24.301707	18.189121	12.835480
984	2002-09-11	-0.828844	-25.875458	19.843507	12.759591
985	2002-09-12	-0.517894	-25.686486	21.261322	11.394933
986	2002-09-13	0.184588	-25.469125	21.141841	11.325127
987	2002-09-14	1.841196	-23.770084	21.204105	11.307507
988	2002-09-15	3.223569	-23.631714	20.283344	11.293488
989	2002-09-16	3.391174	-22.155609	19.469558	11.086509
990	2002-09-17	2.899235	-21.962757	20.295803	10.123460
991	2002-09-18	1.712088	-20.568953	21.282493	10.251396
992	2002-09-19	3.281950	-21.733617	20.914229	8.739577
993	2002-09-20	2.324456	-21.464629	20.747934	8.104013
994	2002-09-21	3.534397	-21.065582	18.900106	9.929498
995	2002-09-22	2.547043	-21.121919	19.291717	9.657817
996	2002-09-23	3.967868	-20.993894	19.146004	8.970358
997	2002-09-24	3.223028	-21.249086	19.858325	9.268188
998	2002-09-25	3.895291	-21.572320	21.571366	10.031764
999	2002-09-26	3.295146	-20.708933	22.339290	10.840932

	A	B	C	D	F
2013-01-01	0.000000	0.000000	0.602767	5	NaN
2013-01-02	-1.012344	-1.747663	0.156602	10	1.0
2013-01-03	-0.903661	-2.446269	0.450825	15	3.0
2013-01-04	-0.033610	-2.608928	-1.391651	20	6.0
2013-01-05	1.552684	-2.978498	-0.503913	25	10.0
2013-01-06	2.156603	-3.271345	-0.855845	30	15.0

	key	lval
0	foo	1
1	foo	2

	key	rval
0	foo	4
1	foo	5

	C	D
A
bar	-0.258916	0.256565
foo	-1.523927	-0.131554