Nathan Cheever
Data Scientist, AdvancedMD
1000x Faster Data Manipulation
vectorizing with Pandas and NumPy
A little about me
Data Scientist at AdvancedMD
I 💖 NLP
utahdatascientist.org
Have two kids
Claims to fame
What we’re going to talk about
Strategies for manipulating data in pandas
Follow along notebook on the repo here:
bit.ly/37IM2Kg
Is that speed gain possible?
Actual results may vary!
Data size
Time
For Loop
Vectorization
What is vectorization?
It’s catching on
Sofia Heisler’s PyCon Talk
Python for loop over rows
Looping with .iterrows()
.apply() method
Pandas series vectorization
NumPy array vectorization
bit.ly/35yZajt
>>> df['status'] = df.apply(set_lead_status, axis=1)
# 4.8 s
def set_lead_status(row):
    """Resolve one row's lead status.

    Falls back to the status recorded at lead time when the current
    status column holds the literal string 'None'.
    """
    current = row['current_status']
    return row['status_at_time_of_lead'] if current == 'None' else current
>>> df['status'] = set_lead_status(df['cs'], df['st'])
def set_lead_status(col1, col2):
    """Return col2 when col1 is the literal string 'None'; otherwise col1."""
    if col1 != 'None':
        return col1
    return col2
def set_lead_status(row):
    """Row-wise status pick: use the 'st' column when 'cs' holds the
    literal string 'None', otherwise keep 'cs'."""
    current = row['cs']
    if current != 'None':
        return current
    return row['st']
# ValueError!: The truth value of a Series is ambiguous.
Vectorizing logical statements?
numpy.where()
vectorizing if/else
Syntax
np.where(
conditional statement → bool array,
series/array/function()/scalar if True,
series/array/function()/scalar if False
)
np.where(
df['status_at_time_of_lead'] == 'None',
df['current_status'],
df['status_at_time_of_lead']
)
>>> df['status'] = np.where(^^^)
np.where(
df['status_at_time_of_lead'] == 'None',
df['current_status'],
'NA'
)
>>> df['status'] = np.where(^^^)
>>> df['status'] = np.where(^^^)
np.where(
df['status_at_time_of_lead'].values == 'None',
df['current_status'].values,
df['status_at_time_of_lead'].values
)
df.apply method
NumPy Vectorized
4.8 s
5.8 ms
830x faster!
n = 273,399
What about
numpy.vectorize()?
>>> vectfunc = np.vectorize(set_lead_status)
>>> df['status'] = vectfunc(
df['current_status'],
df['status_at_time_of_lead']
)
# 96 ms
def set_lead_status(col1, col2):
    """Return col2 when col1 is the sentinel string '- None -'; otherwise col1.

    Scalar version used with np.vectorize in the slides.
    """
    return col2 if col1 == '- None -' else col1
What about multiple conditions?
if...elif...elif...else
def lead_category(row):
    """Classify a lead row into one of four buckets, or 'NA'.

    NOTE(review): list1 and list2 are module-level globals defined
    elsewhere — confirm they are in scope before calling.
    """
    if row['date_col1'] == row['date_col2']:
        return 'New Lead'
    status = row['norm_status']
    if status.startswith('CLI'):
        return 'Client Lead'
    if status in list2:
        return 'MTouch Lead'
    if status in list1:
        return 'EMTouch Lead'
    return 'NA'
21
def lead_category(row):
    """Map a lead row to its category label ('NA' when nothing matches).

    NOTE(review): list1 and list2 are module-level globals defined
    elsewhere — confirm they are in scope before calling.
    """
    if row['date_col1'] == row['date_col2']:
        return 'New Lead'
    checks = (
        (lambda s: s.startswith('CLI'), 'Client Lead'),
        (lambda s: s in list2, 'MTouch Lead'),
        (lambda s: s in list1, 'EMTouch Lead'),
    )
    for predicate, label in checks:
        if predicate(row['norm_status']):
            return label
    return 'NA'
df['lead_category'] = \
np.where(df['date_col1'] == df['date_col2'], 'New Lead',
np.where(df['norm_status'].str.startswith('CLI'), 'Client Lead',
np.where(df['norm_status'].isin(list2), 'MTouch Lead',
np.where(df['norm_status'].isin(list1), 'EMTouch Lead',
'NA'))))
numpy.select()
vectorizing if...elif...else
conditions = [
df['date_col1'] == df['date_col2'],
df['norm_status'].str.startswith('CLI'),
df['norm_status'].isin(multi_touch_leads),
df['norm_status'].isin(eng_multit_leads)
]
choices = [
'New Lead',
'Client Lead',
'MTouch Lead',
'EMTouch Lead'
]
>>> df['leadcat'] = np.select(conditions, choices, default='NA')
df.apply method
NumPy Vectorized
6.9 s
82 ms
84x faster!
n = 273,399
def sub_conditional(row):
    """Bucket a lead row by activity flag, provider count, and data quality.

    Active rows ('Inactive' == 'No') are sized by provider count;
    inactive rows are first checked for duplicates, then bad-lead status.
    """
    if row['Inactive'] == 'No':
        providers = row['Providers']
        if providers == 0:
            return 'active_no_providers'
        return 'active_small' if providers < 5 else 'active_normal'
    if row['duplicate_leads']:
        return 'is_dup'
    return 'active_bad' if row['bad_leads'] else 'active_good'
conditions = [
((df['Inactive'] == 'No') & (df['Providers'] == 0)),
((df['Inactive'] == 'No') & (df['Providers'] < 5)),
df['Inactive'] == 'No',
df['duplicate_leads'],
df['bad_leads'],
]
choices = [
'active_no_providers',
'active_small',
'active_normal',
'is_dup',
'active_bad',
]
>>> df['lead_type'] = np.select(conditions, choices, default='NA')
df.apply method
NumPy Vectorized
5.7 s
35.8 ms
160x faster!
n = 273,399
What about more complicated things?
working with
Strings
# Patterns hoisted to module level; identical to the inline originals.
_NONPAID_RE = re.compile(r'non.*?paid', re.I)
_PAID_RE = re.compile(r'Buyerzone|^paid\s+', re.I)


def find_paid_nonpaid(s):
    """Label a lead-source string as 'non-paid', 'paid', or 'NA'.

    'non...paid' (case-insensitive) wins over the paid patterns.
    """
    if _NONPAID_RE.search(s):
        return 'non-paid'
    if _PAID_RE.search(s):
        return 'paid'
    return 'NA'
>>> df['is_paid'] = df['Lead Source'].apply(find_paid_nonpaid)
Using pandas .str methods
conditions = [
df['Lead Source'].str.contains(r'non.*?paid', na=False),
df['Lead Source'].str.contains(r'Buyerzone|^paid\s+', na=False)
]
choices = ['non-paid', 'paid']
>>> df['is_paid'] = np.select(conditions, choices, default='NA')
Using np.vectorize
>>> vect_str = np.vectorize(find_paid_nonpaid)
>>> df['is_paid'] = vect_str(df['Lead Source'])
df.apply method
Using np.vectorize
pandas .str methods
480 ms
535 ms
493 ms
Dictionary lookups
def a_dict_lookup(row):
    """Label large accounts (more than 7 providers) 'Upmarket'; otherwise
    look the row's 'Category' up in channel_dict (None when missing).

    NOTE(review): channel_dict is a module-level global defined elsewhere
    — confirm it is in scope before calling.
    """
    if row['Providers'] > 7:
        return 'Upmarket'
    return channel_dict.get(row['Category'])
>>> df['dict_lookup'] = df.apply(a_dict_lookup, axis=1)
Using pandas .map method
>>> df['dict_lookup1'] = np.where(
df['Providers'] > 7,
'Upmarket',
df['Category'].map(channel_dict)
)
df.apply method
NumPy Vectorized
5.2 s
17.5 ms
294x faster!
n = 273,399
Dates
def weeks_to_complete(row):
    """Elapsed weeks for one row.

    Uses Date Created minus Start Date; when 'Start Date' is missing
    (NaT/None), falls back to Original Date Created minus Date Created.
    """
    if pd.isnull(row['Start Date']):
        delta = row['Original Date Created'] - row['Date Created']
    else:
        delta = row['Date Created'] - row['Start Date']
    return delta.days / 7
>>> df['weeks_to_complete'] = df.apply(weeks_to_complete, axis=1)
Using .dt accessor
x = np.where(
df['Start Date'].isnull().values,
(df['Original Date Created'].values - df['Date Created']).dt.days / 7,
(df['Date Created'].values - df['Start Date']).dt.days / 7
)
>>> df['weeks_to_complete1'] = x
Using .dt accessor
x = np.where(df['Start Date'].isnull().values,
(df['Original Date Created'].values - df['Date Created']).dt.days / 7,
(df['Date Created'].values - df['Start Date']).dt.days / 7)
>>> df['weeks_to_complete1'] = x
ndarray type casting
y = np.where(df['Start Date'].isnull().values,
((df['Original Date Created'].values - df['Date Created'].values) \ .astype('timedelta64[D]') / np.timedelta64(1, 'D')) / 7,
((df['Date Created'].values - df['Start Date'].values) \ .astype('timedelta64[D]') / np.timedelta64(1, 'D')) / 7)
>>> df['weeks_to_complete2'] = y
df.apply method
Pandas .dt
accessor
NumPy type casting
12.7 s
19 ms
7.9 ms
1,606x faster!
Using values from other rows
def really_slow(df):
    """Flag each row: 1, unless the previous row has the same 'ID' and is
    more than 5 days older (then 0). The first row gets NaN.

    Deliberately slow iterrows/iloc demo from the talk.
    NOTE(review): the iterrows label is reused as an .iloc position, so
    this assumes a default RangeIndex (0..n-1) — confirm callers
    reset_index first.
    """
    flags = []
    for pos, _ in df.iterrows():
        if not pos > 0:
            flags.append(np.nan)
            continue
        same_id = df.iloc[pos]['ID'] == df.iloc[pos - 1]['ID']
        if same_id and (df.iloc[pos]['Date'] - df.iloc[pos - 1]['Date']).days > 5:
            flags.append(0)
        else:
            flags.append(1)
    return flags
=IF(A2=A1, IF(L2-L1 > 5, 0, 1), 1)
prev_id = df['ID'].shift(1).fillna(0).astype(int)
prev_date = df['Date'].shift(1).fillna(pd.Timestamp('1900'))
conditions = [
((df['Internal ID'].values == prev_id) &
(df['Date Created'] - prev_date).astype('timedelta64[D]') < 5),
df['Internal ID'].values == prev_id
]
choices = [0, 1]
>>> df['time_col1'] = np.select(conditions, choices, default=1)
df.iterrows method
NumPy Vectorized
2min 19s
11.1 ms
n = 273,399
that’s 12,523x faster!
What about more complicated...er things?
Dask
Parallel .apply()
Scaling beyond a single machine OR
more effectively parallelizing on your machine
from multiprocessing import Pool
def p_apply(df, func, cores=4):
    """Apply *func* to *df* in parallel: split the frame into `cores`
    chunks, run *func* on each chunk in a worker process, and return
    the re-concatenated result.

    *func* must be picklable (defined at module top level).
    Fix: the pool was created with the undefined name `n_cores`
    (NameError at runtime); it now uses the `cores` parameter.
    The pool is also closed/joined in a finally block so workers are
    reaped even if func raises.
    """
    df_split = np.array_split(df, cores)
    pool = Pool(cores)
    try:
        df = pd.concat(pool.map(func, df_split))
    finally:
        pool.close()
        pool.join()
    return df
>>> df = p_apply(df, func=some_big_function)
https://bit.ly/2RnlEit
Takeaways
Vectorization can speed things up immensely!
np.where → one logical condition
np.select → 2+ logical conditions
If your logic is too much for vectorization, there are other alternatives
Really know your logic and write test cases before optimizing.
Actual results may vary!
Thanks!