Nathan Cheever
Data Scientist, AdvancedMD
Faster Data Manipulation
vectorizing with Pandas and NumPy
A little about me
Is that speed gain possible?
Actual results may vary!
Data size
Time
For Loop
Vectorization
What is vectorization?
It’s catching on
Sofia Heisler’s PyCon Talk
Python for loop over rows
Looping with .iterrows()
.apply() method
Pandas series vectorization
NumPy array vectorization
youtu.be/HN5d490_KKk
>>> df['status'] = df.apply(set_lead_status, axis=1)
# 8.1 s
def set_lead_status(row):
    """Pick the lead's effective status for one row.

    Falls back to the status captured at lead time when the current
    status column holds the placeholder string 'None'.
    """
    current = row['current_status']
    return row['status_at_time_of_lead'] if current == 'None' else current
>>> df['status'] = set_lead_status(df['cs'], df['st'])
def set_lead_status(col1, col2):
    """Return col2 when col1 is the placeholder 'None'; otherwise col1.

    Written for scalar arguments — passing whole Series here is what
    the surrounding slides show going wrong.
    """
    if col1 != 'None':
        return col1
    return col2
def set_lead_status(row):
    """Same placeholder-fallback logic, keyed on short column names.

    'cs' = current status, 'st' = status at time of lead (presumably —
    abbreviations match the call on the previous slide).
    """
    status = row['cs']
    return row['st'] if status == 'None' else status

# ValueError!: The truth value of a Series is ambiguous.
Vectorizing logical statements?
numpy.where()
vectorizing if/else
Syntax
np.where(
conditional statement → bool array,
series/array/function()/scalar if True,
series/array/function()/scalar if False
)
np.where(
df['status_at_time_of_lead'] == 'None',
df['current_status'],
df['status_at_time_of_lead']
)
>>> df['status'] = np.where(^^^)
np.where(
df['status_at_time_of_lead'] == 'None',
df['current_status'],
'NA'
)
>>> df['status'] = np.where(^^^)
>>> df['status'] = np.where(^^^)
np.where(
df['status_at_time_of_lead'].values == 'None',
df['current_status'].values,
df['status_at_time_of_lead'].values
)
df.apply method
NumPy Vectorized
8.1 s
8.8 ms
920x faster!
n = 273,399
What about
numpy.vectorize()?
>>> vectfunc = np.vectorize(set_lead_status)
>>> df['status'] = vectfunc(
df['current_status'],
df['status_at_time_of_lead']
)
# 137 ms
def set_lead_status(col1, col2):
    """Substitute col2 when col1 holds the placeholder '- None -'.

    NOTE(review): this variant checks '- None -' where earlier slides
    check 'None' — presumably the raw export's missing-value marker.
    """
    return col2 if col1 == '- None -' else col1
What about multiple conditions?
if...elif...elif...else
def lead_category(row):
    """Bucket one lead row into a category; checks are ordered, first
    match wins. list1/list2 are module-level lookup collections."""
    if row['date_col1'] == row['date_col2']:
        return 'New Lead'
    status = row['norm_status']
    if status.startswith('CLI'):
        return 'Client Lead'
    if status in list2:
        return 'MTouch Lead'
    if status in list1:
        return 'EMTouch Lead'
    return 'NA'
20
def lead_category(row):
    """Classify a lead row (duplicate slide of the same function).

    Order matters: the date check shadows every status check, and the
    'CLI' prefix check shadows the list memberships.
    """
    if row['date_col1'] == row['date_col2']:
        return 'New Lead'
    elif row['norm_status'].startswith('CLI'):
        return 'Client Lead'
    elif row['norm_status'] in list2:
        return 'MTouch Lead'
    elif row['norm_status'] in list1:
        return 'EMTouch Lead'
    return 'NA'
df['lead_category'] = \
np.where(df['date_col1'] == df['date_col2'], 'New Lead',
np.where(df['norm_status'].str.startswith('CLI'), 'Client Lead',
np.where(df['norm_status'].isin(list2), 'MTouch Lead',
np.where(df['norm_status'].isin(list1), 'EMTouch Lead',
'NA'))))
numpy.select()
vectorizing if...elif...else
conditions = [
df['date_col1'] == df['date_col2'],
df['norm_status'].str.startswith('CLI'),
df['norm_status'].isin(multi_touch_leads),
df['norm_status'].isin(eng_multit_leads)
]
choices = [
'New Lead',
'Client Lead',
'MTouch Lead',
'EMTouch Lead'
]
>>> df['leadcat'] = np.select(conditions, choices, default='NA')
df.apply method
NumPy Vectorized
12.5 s
140 ms
89x faster!
n = 273,399
def sub_conditional(row):
    """Nested lead triage.

    Active leads ('Inactive' == 'No') split by provider count;
    otherwise duplicates are flagged first, then bad vs. good leads.
    """
    if row['Inactive'] == 'No':
        providers = row['Providers']
        if providers == 0:
            return 'active_no_providers'
        return 'active_small' if providers < 5 else 'active_normal'
    if row['duplicate_leads']:
        return 'is_dup'
    return 'active_bad' if row['bad_leads'] else 'active_good'
conditions = [
((df['Inactive'] == 'No') & (df['Providers'] == 0)),
((df['Inactive'] == 'No') & (df['Providers'] < 5)),
df['Inactive'] == 'No',
df['duplicate_leads'],
df['bad_leads'],
]
choices = [
'active_no_providers',
'active_small',
'active_normal',
'is_dup',
'active_bad',
]
>>> df['lead_type'] = np.select(conditions, choices, default='NA')
df.apply method
NumPy Vectorized
9.4 s
24.8 ms
380x faster!
n = 273,399
What about more complicated things?
working with
Strings
def find_paid_nonpaid(s):
    """Tag a lead-source string as 'non-paid', 'paid', or 'NA'.

    The non-paid pattern is tried first so strings like
    'Non-Paid Search' are not swallowed by the paid match.
    Both searches are case-insensitive.
    """
    if re.search(r'non.*?paid', s, re.I):
        return 'non-paid'
    if re.search(r'Buyerzone|^paid\s+', s, re.I):
        return 'paid'
    return 'NA'
>>> df['is_paid'] = df['Lead Source'].apply(find_paid_nonpaid)
Using pandas .str methods
conditions = [
df['Lead Source'].str.contains(r'non.*?paid', na=False),
df['Lead Source'].str.contains(r'Buyerzone|^paid\s+', na=False)
]
choices = ['non-paid', 'paid']
>>> df['is_paid'] = np.select(conditions, choices, default='NA')
Using np.vectorize
>>> vect_str = np.vectorize(find_paid_nonpaid)
>>> df['is_paid'] = vect_str(df['Lead Source'])
df.apply method
Using np.vectorize
pandas .str methods
540 ms
563 ms
752 ms
Dictionary lookups
def a_dict_lookup(row):
    """Channel for one lead row.

    Accounts with more than 7 providers are 'Upmarket'; everything
    else is mapped through the module-level channel_dict (None when
    the category is missing from the dict).
    """
    if row['Providers'] > 7:
        return 'Upmarket'
    return channel_dict.get(row['Category'])
>>> df['dict_lookup'] = df.apply(a_dict_lookup, axis=1)
Using pandas .map method
>>> df['dict_lookup1'] = np.where(
df['Providers'] > 7,
'Upmarket',
df['Category'].map(channel_dict)
)
df.apply method
NumPy Vectorized
7.84 s
26.6 ms
295x faster!
n = 273,399
Dates
def weeks_to_complete(row):
    """Weeks between the relevant creation dates for one row.

    With no Start Date, measures Original Date Created minus Date
    Created (operand order preserved from the original slide);
    otherwise measures Date Created minus Start Date.
    """
    if pd.isnull(row['Start Date']):
        delta = row['Original Date Created'] - row['Date Created']
    else:
        delta = row['Date Created'] - row['Start Date']
    return delta.days / 7
>>> df['weeks_to_complete'] = df.apply(weeks_to_complete, axis=1)
Using .dt accessor
x = np.where(df['Start Date'].isnull().values,
(df['Original Date Created'].values - df['Date Created']).dt.days / 7,
(df['Date Created'].values - df['Start Date']).dt.days / 7)
>>> df['weeks_to_complete1'] = x
Using .dt accessor
x = np.where(df['Start Date'].isnull().values,
(df['Original Date Created'].values - df['Date Created']).dt.days / 7,
(df['Date Created'].values - df['Start Date']).dt.days / 7)
>>> df['weeks_to_complete1'] = x
ndarray type casting
# ndarray-level version: subtract raw NumPy arrays, cast the gap to
# whole days, divide by a 1-day unit to get a float day count, then /7.
# Fix 1: the original referenced df['StartDate'] (missing space),
# inconsistent with df['Start Date'] used everywhere else.
# Fix 2: repaired the garbled "\ ." line-continuation residue.
y = np.where(
    df['Start Date'].isnull().values,
    ((df['Original Date Created'].values - df['Date Created'].values)
     .astype('timedelta64[D]') / np.timedelta64(1, 'D')) / 7,
    ((df['Date Created'].values - df['Start Date'].values)
     .astype('timedelta64[D]') / np.timedelta64(1, 'D')) / 7,
)
>>> df['weeks_to_complete2'] = y
df.apply method
Pandas .dt
accessor
NumPy type casting
20.5 s
24 ms
12.8 ms
1,601x faster!
Using values from other rows
def really_slow(df):
    """Row-by-row baseline using .iterrows() plus positional lookups.

    First row gets NaN. Later rows get 0 when they share an ID with the
    previous row AND were created more than 5 days after it, else 1.
    Assumes a default RangeIndex so the iterrows label doubles as a
    position for .iloc — TODO confirm at the call site.
    """
    flags = []
    for pos, _ in df.iterrows():
        if pos == 0:
            flags.append(np.nan)
            continue
        cur, prev = df.iloc[pos], df.iloc[pos - 1]
        if cur['ID'] == prev['ID'] and (cur['Date'] - prev['Date']).days > 5:
            flags.append(0)
        else:
            flags.append(1)
    return flags
=IF(A2=A1, IF(L2-L1 < 5, 0, 1), 1)
# Vectorized equivalent: compare each row against the previous row via
# .shift(1) instead of iterating.
prev_id = df['ID'].shift(1).fillna(0).astype(int)
prev_date = df['Date'].shift(1).fillna(pd.Timestamp('1900'))
conditions = [
    # Fix 1: use the same columns the shifted series were built from
    # (the original mixed df['Internal ID']/df['Date Created'] with
    # prev_id/prev_date built from df['ID']/df['Date']).
    # Fix 2: parenthesize the day comparison — `&` binds tighter than
    # `<`, so the original `(a == b) & gap < 5` parsed as
    # `((a == b) & gap) < 5`.
    ((df['ID'].values == prev_id) &
     ((df['Date'] - prev_date).astype('timedelta64[D]') < 5)),
    df['ID'].values == prev_id
]
choices = [0, 1]
>>> df['time_col1'] = np.select(conditions, choices, default=1)
df.iterrows method
NumPy Vectorized
3min 17s
17.9 ms
n = 273,399
that’s 11,006x faster!
What about more complicated...er things?
Parallel .apply()
Dask
from multiprocessing import Pool
def p_apply(df, func, cores=4):
    """Apply `func` to `df` in parallel.

    Splits the frame into `cores` chunks, maps `func` over a process
    pool, and concatenates the per-chunk results back together.
    `func` must be picklable (module-level) and must accept/return
    something pd.concat can stitch.

    Bug fix: the original called Pool(n_cores) — an undefined name;
    the parameter is `cores`. Also wrapped the map in try/finally so
    worker processes are reaped even if `func` raises.
    """
    df_split = np.array_split(df, cores)
    pool = Pool(cores)
    try:
        df = pd.concat(pool.map(func, df_split))
    finally:
        pool.close()
        pool.join()
    return df
>>> df = p_apply(df, func=some_big_function)
Scaling beyond a single machine OR
more effectively parallelizing on your machine
Takeaways
Vectorization can speed things up immensely!
np.where → one logical condition
np.select → 2+ logical conditions
If your logic is too much for vectorization, there are other alternatives
Really know your logic and write test cases before optimizing.
Actual results may vary!
Thanks!