import dask.dataframe as dd
def your_func(row):
#do something
return row
ddf = dd.from_pandas(df, npartitions=30) # find your own number of partitions
ddf_update = ddf.apply(your_func, axis=1).compute()
在我的10万张唱片里,没有戴斯克:
CPU 时间: 用户6分钟32秒,系统: 100毫秒,总计: 6分钟32秒
墙壁时间: 6分32秒
戴斯克:
CPU 时间: user 5.19 s,sys: 784 ms,total: 5.98 s
墙壁时间: 1分3秒
from joblib import parallel_backend, Parallel, delayed, effective_n_jobs
from sklearn.utils import gen_even_slices
from sklearn.utils.validation import _num_samples
def parallel_apply(df, func, n_jobs= -1, **kwargs):
""" Pandas apply in parallel using joblib.
Uses sklearn.utils to partition input evenly.
Args:
df: Pandas DataFrame, Series, or any other object that supports slicing and apply.
func: Callable to apply
n_jobs: Desired number of workers. Default value -1 means use all available cores.
**kwargs: Any additional parameters will be supplied to the apply function
Returns:
Same as for normal Pandas DataFrame.apply()
"""
if effective_n_jobs(n_jobs) == 1:
return df.apply(func, **kwargs)
else:
ret = Parallel(n_jobs=n_jobs)(
delayed(type(df).apply)(df[s], func, **kwargs)
for s in gen_even_slices(_num_samples(df), effective_n_jobs(n_jobs)))
return pd.concat(ret)
用法: result = parallel_apply(my_dataframe, my_func)
import numpy as np
import multiprocessing as mp
dfs = np.array_split(df, 8000) # divide the dataframe as desired
def f_app(df):
return df.apply(myfunc, axis=1)
with mp.Pool(mp.cpu_count()) as pool:
res = pd.concat(pool.map(f_app, dfs))
import multiprocessing as mp
def f(df):
""" the function that you want to apply to each column """
column_name = df.columns[0] # this is the same as the original column name
# do something what you need to do to that column
return df
# Here I just make a list of all the columns. If you don't use .to_frame()
# it will pass series type instead of a dataframe
dfs = [df[column].to_frame() for column in df.columns]
with mp.Pool(mp.cpu_num) as pool:
processed_df = pd.concat(pool.map(f, dfs), axis=1)