# Pandas Tricks
## Avoid int-to-float conversion when using df.T.to_dict().values()

`df.T.to_dict()` transposes the DataFrame first, which coerces every column to a common dtype, so integer columns come back as floats. Instead of `list(df.T.to_dict().values())`, use `[{c: getattr(r, c) for c in df} for r in df.itertuples()]`, which preserves each column's original dtype:
```python
import pandas as pd

df = pd.DataFrame({'a': [3, 2], 'b': [42.0, 3.14]})
result = [{col: getattr(row, col) for col in df} for row in df.itertuples()]
print(result)
# [{'a': 3, 'b': 42.0}, {'a': 2, 'b': 3.1400000000000001}]
```
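For contrast, here is what the transpose-based version returns with the same DataFrame; `df.T` forces a common dtype, so the integer column comes back as float:

```python
import pandas as pd

df = pd.DataFrame({'a': [3, 2], 'b': [42.0, 3.14]})
coerced = list(df.T.to_dict().values())
print(coerced)
# [{'a': 3.0, 'b': 42.0}, {'a': 2.0, 'b': 3.14}]  # note the 'a' values became floats
```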
## Read Excel

```python
xl = pd.ExcelFile(fn)
# read sheets as DataFrames
NaN_List = ['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan']  # Nashville term is NA
df_Node = xl.parse("Node", converters={'milepost': str}, keep_default_na=False, na_values=NaN_List)
df_Segment = xl.parse("Segment", converters={'startNode': str, 'endNode': str}, keep_default_na=False, na_values=NaN_List)
```
## Set display format

```python
import pandas as pd

pd.set_option('display.width', 200)
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 15)
pd.set_option('display.precision', 3)
```
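To change display options only temporarily, `pd.option_context` works as a context manager and restores the previous settings on exit:

```python
import pandas as pd

# settings apply only inside the with-block
with pd.option_context('display.max_rows', 10, 'display.precision', 2):
    print(pd.DataFrame({'x': range(100)}))
```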
## Iterate DataFrame by rows

```python
for i, r in df.iterrows():
    if pd.isnull(r['OSRM_meter']):
        url = createURL(r)
```
## New calculated column

```python
df.loc[:, 'diff'] = df.apply(lambda r: r['c1'] - r['c2'], axis=1)
```
## Filter by conditions

```python
cond1 = df['Project'].map(lambda x: x in [1, 2, 3])
cond2 = df['type'].map(lambda x: x == 'TODO')
df[cond1 & cond2]['SQL']
```
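The same filter written with vectorized operations (`isin` and a plain comparison), which avoids the per-element lambdas:

```python
df.loc[df['Project'].isin([1, 2, 3]) & (df['type'] == 'TODO'), 'SQL']
```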
## Get row index of filtered dataset

```python
cond1 = df['entry_ts'].map(lambda x: x <= aTime)
cond2 = df['next_entry_ts'].map(lambda x: x > aTime)
s = df[cond1 & cond2]
idx = s.index.tolist()
```
## Add row index as new column

```python
g.loc[:, 'seq'] = pd.Series([i + 1 for i in range(len(g))], index=g.index)
```
## Select rows whose field value is not in a list / dict / Series

```python
aList = 'EGKNTUVW'
df = df[~df.t1.isin(list(aList))]  # isin() accepts any iterable of values
```
## Read from Excel

```python
xl = pd.ExcelFile(fn)
df = xl.parse('sheet1')
```
## Read from CSV

```python
df = pd.read_csv(fn, index_col=False)
```
## Parse datetime while reading from CSV

```python
from datetime import datetime

parseFunc = lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
df = pd.read_csv(fn, parse_dates=['startDate'], date_parser=parseFunc)
```
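In pandas 2.0+, `date_parser` is deprecated in favor of `date_format`; assuming the same column and format string:

```python
df = pd.read_csv(fn, parse_dates=['startDate'], date_format='%Y-%m-%d %H:%M:%S')
```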
## Read DataFrame from Stored Procedure

```python
import pandas as pd
import pandas.io.sql as psql
import pyodbc

connStr = 'DRIVER={};SERVER={};DATABASE={};UID={};PWD={}'.format(
    '{SQL Server}',
    'MS_SQLServer_name',
    'DB_name',
    'username',
    'password'
)
conn = pyodbc.connect(connStr)

# SP with no parameters
df = psql.read_sql("{call p_GetOPRP_TP_05}", conn)

# SP with parameters
df = psql.read_sql("{{call LOR_SIM.dbo.p_getTM2({0}, '{1}')}}".format(
    param.SCENARIO_ID,
    subdiv
), conn)
```
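String-formatting raw values into the SQL text is injection-prone; `read_sql` also accepts a `params` argument that binds values through the driver's `?` placeholders. A sketch assuming the same stored procedure and that the ODBC driver accepts parameter markers in the call escape:

```python
df = psql.read_sql("{call LOR_SIM.dbo.p_getTM2(?, ?)}", conn,
                   params=[param.SCENARIO_ID, subdiv])
```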
## groupby(), get_group(), get group size

```python
grp = df.groupby(['TRAIN_ID'])
g = grp.get_group('Q123')
sizeDict = grp.size().to_dict()
```
## Iterate groups and sort by columns

```python
for tid in df_tm6_grp.groups.keys():
    g = df_tm6_grp.get_group(tid)
    g = g.sort_values(by=['ON_STATION_TS'], ascending=[True])

df.sort_values(by=['trainID', 'stationSeq'], ascending=[True, True], inplace=True)
```
## Groupby aggregate

Group by two columns (RouteID and Date), then apply the `sum` function on the Volume column:

```python
tmp = df[['RouteID', 'Date', 'Volume']].groupby(['RouteID', 'Date']).agg({'Volume': ['sum']}).reset_index(inplace=False)
```
Note: the dict-of-dict renaming style below throws a warning, so it should be avoided (see the named-aggregation replacement after the block):

    FutureWarning: using a dict with renaming is deprecated and will be removed in a future version

```python
tmp = df.groupby(['start_MP', 'end_MP']).agg({
    'runTime': {
        'count_all': 'count',
        'count_distinct': 'nunique',
        'mean': 'mean',
        'median': 'median',
        'std': 'std',
        'min': 'min',
        'max': 'max',
    }
})
```
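Since pandas 0.25, named aggregation produces the same renamed columns without the warning; a sketch using the same columns as above:

```python
tmp = df.groupby(['start_MP', 'end_MP']).agg(
    count_all=('runTime', 'count'),
    count_distinct=('runTime', 'nunique'),
    mean=('runTime', 'mean'),
    median=('runTime', 'median'),
    std=('runTime', 'std'),
    min=('runTime', 'min'),
    max=('runTime', 'max'),
)
```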
## Groupby, aggregate, then to_dict

```python
tmp = df[['RouteID', 'Date', 'V']].groupby(['RouteID', 'Date']).agg({'V': ['sum']}).to_dict(orient='index')
```
## Rename agg columns

This uses the same deprecated dict-of-dict renaming style noted above; a named-aggregation version follows the block.

```python
grp = df.groupby(['DC_NBR', 'QUAD_ID']).agg({
    'ASSOCIATE_ID': {'count_': 'nunique'},  # distinct count
    'LANE_ID': {'count_': 'count'},
    'VOLUME_PCT': {'sum_': 'sum'},
}).reset_index(inplace=False)
grp.columns = [''.join(reversed(col)).strip() for col in grp.columns.values]
```
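The named-aggregation equivalent yields flat column names directly, so the reversed-join flattening step is unnecessary (a sketch assuming the same source columns):

```python
grp = df.groupby(['DC_NBR', 'QUAD_ID']).agg(
    count_ASSOCIATE_ID=('ASSOCIATE_ID', 'nunique'),  # distinct count
    count_LANE_ID=('LANE_ID', 'count'),
    sum_VOLUME_PCT=('VOLUME_PCT', 'sum'),
).reset_index()
```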
## Pivot table / calculate multiple statistics of the same column

```python
pt = df.groupby(['t4'])[['runDuration']].agg(['mean', 'std', 'count']).reset_index(inplace=False)

# Flatten the MultiIndex column names
pt.columns = [''.join(col).strip() for col in pt.columns.values]
print(pt)
```
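`pd.pivot_table` computes the same per-group statistics; a minimal sketch assuming the same `t4` and `runDuration` columns (the result again has MultiIndex columns to flatten):

```python
pt = pd.pivot_table(df, index='t4', values='runDuration',
                    aggfunc=['mean', 'std', 'count']).reset_index()
```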
## Join two DataFrames

```python
df = df.merge(df1, on=['id', 'type'], how='left')
```
## Rename columns

```python
df = df.rename(columns={
    'latitude': 'lat',
    'longitude': 'lon',
    'before': 'after',
})
```
## Concat two or more DataFrames

```python
df = pd.concat([df1, df2, df3])  # df1, df2, df3 must have the same columns
```
## Convert MultiIndex into columns

```python
df.reset_index(inplace=True)
```
## DataFrame to list of dicts

REF: https://stackoverflow.com/questions/29815129/pandas-dataframe-to-list-of-dictionaries

Use `df.T.to_dict().values()` or `df.to_dict('records')` (faster):

```python
In [1]: df
Out[1]:
   customer  item1   item2   item3
0         1  apple   milk    tomato
1         2  water   orange  potato
2         3  juice   mango   chips

In [2]: df.T.to_dict().values()
Out[2]:
[{'customer': 1.0, 'item1': 'apple', 'item2': 'milk', 'item3': 'tomato'},
 {'customer': 2.0, 'item1': 'water', 'item2': 'orange', 'item3': 'potato'},
 {'customer': 3.0, 'item1': 'juice', 'item2': 'mango', 'item3': 'chips'}]

In [20]: %timeit df.T.to_dict().values()
1000 loops, best of 3: 395 µs per loop

In [21]: %timeit df.to_dict('records')
10000 loops, best of 3: 53 µs per loop
```
## DataFrame to dict of dict

```python
>>> df
   dc_id  store_id  osrm_miles
0     12      6000       123.4
1     12      6001       234.5
2     23      5801       167.5

grp = df.groupby(['dc_id'])
result = {
    int(dc_id): grp.get_group(dc_id)[['store_id', 'osrm_miles']].set_index('store_id').T.to_dict('list')
    for dc_id in grp.groups.keys()
}  # {dc_id: {store_id: [osrm_miles]}}
```
## DataFrame to dict

```python
xingDict = df.set_index('street')[['lat', 'lon']].to_dict(orient='index')
```

Ref: https://stackoverflow.com/questions/26716616/convert-a-pandas-dataframe-to-a-dictionary
```python
>>> df
  ID  A  B  C
0  p  1  3  2
1  q  4  3  2
2  r  4  0  9
>>> df.set_index('ID').T.to_dict('list')
{'p': [1, 3, 2], 'q': [4, 3, 2], 'r': [4, 0, 9]}

>>> df = pd.DataFrame({'a': ['red', 'yellow', 'blue'], 'b': [0.5, 0.25, 0.125]})
>>> df
        a      b
0     red  0.500
1  yellow  0.250
2    blue  0.125
>>> df.to_dict('dict')
{'a': {0: 'red', 1: 'yellow', 2: 'blue'},
 'b': {0: 0.5, 1: 0.25, 2: 0.125}}
>>> df.to_dict('list')
{'a': ['red', 'yellow', 'blue'],
 'b': [0.5, 0.25, 0.125]}
>>> df.to_dict('series')
{'a': 0       red
      1    yellow
      2      blue
 Name: a, dtype: object,
 'b': 0    0.500
      1    0.250
      2    0.125
 Name: b, dtype: float64}
>>> df.to_dict('split')
{'columns': ['a', 'b'],
 'data': [['red', 0.5], ['yellow', 0.25], ['blue', 0.125]],
 'index': [0, 1, 2]}
>>> df.to_dict('records')
[{'a': 'red', 'b': 0.5},
 {'a': 'yellow', 'b': 0.25},
 {'a': 'blue', 'b': 0.125}]
>>> df.to_dict('index')
{0: {'a': 'red', 'b': 0.5},
 1: {'a': 'yellow', 'b': 0.25},
 2: {'a': 'blue', 'b': 0.125}}
```
## Convert from string to numeric

```python
laneDF[['QUAD_ID', 'LANE_ID', 'STORE_NBR']] = laneDF[['QUAD_ID', 'LANE_ID', 'STORE_NBR']].apply(pd.to_numeric)
```
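If some cells may fail to parse, `pd.to_numeric(..., errors='coerce')` turns unparseable values into NaN instead of raising:

```python
laneDF['STORE_NBR'] = pd.to_numeric(laneDF['STORE_NBR'], errors='coerce')
```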
## Convert list of lists to DataFrame

```python
import json

import pandas as pd

jsonStr = """
{
    "scheduleType": 1,
    "schedule": {
        "dcNumber": 123,
        "user": "abc",
        "nbrOfAssociates": 123,
        "laneInfo": [
            ["QUADRANT_ID", "LANE_ID", "STORE_NBR", "LANE_OPEN"],
            ["1", "1", "1", "Y"],
            ["2", "1", "2", "N"]
        ]
    }
}
"""
upData = json.loads(jsonStr)
s = upData['schedule']
headers = s['laneInfo'].pop(0)  # remove the top row and use it as column titles
laneDF = pd.DataFrame(s['laneInfo'], columns=headers)
```
## iloc() and loc()

Note: in pandas 0.20.0 and above, `ix` is deprecated and the use of `loc` and `iloc` is encouraged instead.

- `loc` gets rows (or columns) with particular labels from the index.
- `iloc` gets rows (or columns) at particular positions in the index (so it only takes integers).
- `ix` usually tries to behave like `loc` but falls back to behaving like `iloc` if a label is not present in the index.
```python
>>> import numpy as np
>>> s = pd.Series(np.nan, index=[49, 48, 1, 2, 3])
>>> s
49   NaN
48   NaN
1    NaN
2    NaN
3    NaN
dtype: float64
>>> s.iloc[:2]  # slice the first 2 rows by position
49   NaN
48   NaN
dtype: float64
>>> s.loc[:2]   # slice up to and including label 2
49   NaN
48   NaN
1    NaN
2    NaN
dtype: float64
```

To get a row by position and a column by name on a DataFrame, chain the two lookups, e.g. `df['lat'].iloc[i]`; mixing a positional row with a column name inside a single `iloc` call raises an error.
## Filter

```python
# take rows where row['X'] is not NaN
df = df[~pd.isna(df['X'])]
```
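The same filter reads more directly with `Series.notna()`:

```python
df = df[df['X'].notna()]
```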