def apply(dfg):
# do stuff
return dfg
c = d6tstack.combine_csv.CombinerCSV([bigfile.csv], apply_after_read=apply, sep=',', chunksize=1e6)
# or
c = d6tstack.combine_csv.CombinerCSV(glob.glob('*.csv'), apply_after_read=apply, chunksize=1e6)
# output to various formats, automatically chunked to reduce memory consumption
c.to_csv_combine(filename='out.csv')
c.to_parquet_combine(filename='out.pq')
c.to_psql_combine('postgresql+psycopg2://usr:pwd@localhost/db', 'tablename') # fast for postgres
c.to_mysql_combine('mysql+mysqlconnector://usr:pwd@localhost/db', 'tablename') # fast for mysql
c.to_sql_combine('postgresql+psycopg2://usr:pwd@localhost/db', 'tablename') # slow but flexible
import pandas as pd
import pickle
in_path = "" #Path where the large file is
out_path = "" #Path to save the pickle files to
chunk_size = 400000 #size of chunks relies on your available memory
separator = "~"
reader = pd.read_csv(in_path,sep=separator,chunksize=chunk_size,
low_memory=False)
for i, chunk in enumerate(reader):
out_file = out_path + "/data_{}.pkl".format(i+1)
with open(out_file, "wb") as f:
pickle.dump(chunk,f,pickle.HIGHEST_PROTOCOL)
在下一步中,读入pickle并将每个pickle附加到所需的数据框架中。
import glob
pickle_path = "" #Same Path as out_path i.e. where the pickle files are
data_p_files=[]
for name in glob.glob(pickle_path + "/data_*.pkl"):
data_p_files.append(name)
df = pd.DataFrame([])
for i in range(len(data_p_files)):
df = df.append(pd.read_pickle(data_p_files[i]),ignore_index=True)
TextFileReader = pd.read_csv(path, chunksize=1000) # the number of rows per chunk
dfList = []
for df in TextFileReader:
dfList.append(df)
df = pd.concat(dfList,sort=False)
chunkTemp = []
queryTemp = []
query = pd.DataFrame()
for chunk in pd.read_csv(file, header=0, chunksize=<your_chunksize>, iterator=True, low_memory=False):
#REPLACING BLANK SPACES AT COLUMNS' NAMES FOR SQL OPTIMIZATION
chunk = chunk.rename(columns = {c: c.replace(' ', '') for c in chunk.columns})
#YOU CAN EITHER:
#1)BUFFER THE CHUNKS IN ORDER TO LOAD YOUR WHOLE DATASET
chunkTemp.append(chunk)
#2)DO YOUR PROCESSING OVER A CHUNK AND STORE THE RESULT OF IT
query = chunk[chunk[<column_name>].str.startswith(<some_pattern>)]
#BUFFERING PROCESSED DATA
queryTemp.append(query)
#! NEVER DO pd.concat OR pd.DataFrame() INSIDE A LOOP
print("Database: CONCATENATING CHUNKS INTO A SINGLE DATAFRAME")
chunk = pd.concat(chunkTemp)
print("Database: LOADED")
#CONCATENATING PROCESSED DATA
query = pd.concat(queryTemp)
print(query)
The maximum value of UNSIGNED CHAR = 255
The minimum value of SHORT INT = -32768
The maximum value of SHORT INT = 32767
The minimum value of INT = -2147483648
The maximum value of INT = 2147483647
The minimum value of CHAR = -128
The maximum value of CHAR = 127
The minimum value of LONG = -9223372036854775808
The maximum value of LONG = 9223372036854775807