import pandas as pd
try:
from StringIO import StringIO
except ImportError:
from io import StringIO
# make a big csv data file, following earlier approach by @firelynx
csvdata = """1,Alice
2,Bob
3,Caesar
"""
# we have to replicate the "integer column" user_id many many times to get
# pd.read_csv to actually chunk read. otherwise it just reads
# the whole thing in one chunk, because it's faster, and we don't get any
# "mixed dtype" issue. the 100000 below was chosen by experimentation.
csvdatafull = ""
for i in range(100000):
csvdatafull = csvdatafull + csvdata
csvdatafull = csvdatafull + "foobar,Cthlulu\n"
csvdatafull = "user_id,username\n" + csvdatafull
sio = StringIO(csvdatafull)
# the following line gives me the warning:
# C:\Users\rdisa\anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3072: DtypeWarning: Columns (0) have mixed types.Specify dtype option on import or set low_memory=False.
# interactivity=interactivity, compiler=compiler, result=result)
# but it does not always give me the warning, so i guess the internal workings of read_csv depend on background factors
x = pd.read_csv(sio, low_memory=True) #, dtype={"user_id": int, "username": "string"})
x.dtypes
# this gives:
# Out[69]:
# user_id object
# username object
# dtype: object
type(x['user_id'].iloc[0]) # int
type(x['user_id'].iloc[1]) # int
type(x['user_id'].iloc[2]) # int
type(x['user_id'].iloc[10000]) # int
type(x['user_id'].iloc[299999]) # str !!!! (even though it's a number! so this chunk must have been read in as strings)
type(x['user_id'].iloc[300000]) # str !!!!!
import warnings
# Force mixed datatype warning to be a python error so we can catch it and reattempt the
# load using the slower python engine
warnings.simplefilter('error', pandas.errors.DtypeWarning)
try:
df = pandas.read_csv(path, sep=sep, encoding=encoding)
except pandas.errors.DtypeWarning:
df = pandas.read_csv(path, sep=sep, encoding=encoding, engine="python")