Q: Which one is better?

  • Appending each small dataframe to a list, vs.
  • Concatenating small dataframes first, then appending the result to a list
import sys
import time
import pandas as pd
import numpy as np

def get_size(obj, seen=None):
    """Return the approximate deep size of ``obj`` in bytes.

    The result is ``sys.getsizeof(obj)`` plus the sizes of the objects it
    refers to: the values and keys of a dict, the instance ``__dict__`` of
    an object that has one, or the items of any other iterable (``str``,
    ``bytes`` and ``bytearray`` are not descended into).

    ``seen`` holds the ``id`` of every object already counted; callers
    normally leave it unset. An object whose id is already in ``seen``
    contributes 0, so shared objects are only counted once.
    """
    own_size = sys.getsizeof(obj)
    visited = set() if seen is None else seen

    marker = id(obj)
    if marker in visited:
        return 0
    # Register the object *before* descending so that self-referential
    # structures terminate instead of recursing forever.
    visited.add(marker)

    if isinstance(obj, dict):
        children = list(obj.values()) + list(obj.keys())
    elif hasattr(obj, '__dict__'):
        children = [obj.__dict__]
    elif hasattr(obj, '__iter__') and not isinstance(obj, (str, bytes, bytearray)):
        children = obj
    else:
        children = ()

    return own_size + sum(get_size(child, visited) for child in children)
# Benchmark input: 1,000,000 rows x 20 columns filled by np.random.randn.
df = pd.DataFrame(data=np.random.randn(1_000_000, 20))

def f1():
    """Append the module-level ``df`` to a list ten times, then concat once.

    Returns the single DataFrame produced by ``pd.concat`` with
    ``ignore_index=True``.
    """
    frames = []
    for _ in range(10):
        # Each append stores a reference to the same ``df`` object.
        frames.append(df)
    return pd.concat(frames, ignore_index=True)

def f2():
    """Pre-concatenate ``df`` in two groups of five, then concat the groups.

    Each group is ``pd.concat`` of five references to the module-level
    ``df`` along axis 0; the two intermediate frames are appended to a list
    and combined with ``ignore_index=True``.
    """
    chunks = []
    for _ in range(2):
        group_of_five = [df] * 5
        chunks.append(pd.concat(group_of_five, axis=0))
    return pd.concat(chunks, ignore_index=True)

# Sanity check: compare the DataFrames returned by the two strategies.
f1().equals(f2())
True
# Time f1 (append each small DataFrame to the list, concat once).
# time.perf_counter() is used instead of time.time() because it is a
# monotonic, high-resolution clock meant for measuring elapsed intervals;
# time.time() is wall-clock time and can jump if the system clock changes.
st1 = time.perf_counter()
df1 = f1()
et1 = time.perf_counter()
# get_size is called after the timed region so it does not skew the result.
print("Time to append small df to list: ", (et1-st1), "Final Size:", get_size(df1)/1e9, "GB")
Time to append small df to list:  0.9832870960235596 Final Size: 3.480015827 GB
# Time f2 (pre-concatenate groups, append them to the list, concat again).
# time.perf_counter() is used instead of time.time() because it is a
# monotonic, high-resolution clock meant for measuring elapsed intervals.
st2 = time.perf_counter()
df2 = f2()
et2 = time.perf_counter()
# Report the size of df2, the frame this run produced (previously this
# line measured df1 by mistake).
print("Time to append concated df to list: ", (et2-st2), "Final Size:", get_size(df2)/1e9,"GB")
Time to append concated df to list:  2.1953396797180176 Final Size: 3.480015827 GB
# Ratio of f1's elapsed time to f2's: a value below 1 means f2 took longer.
elapsed_f1 = et1 - st1
elapsed_f2 = et2 - st2
print("f2 is ", elapsed_f1 / elapsed_f2, "times fastter than f1")
f2 is  0.44789747350162173 times fastter than f1