Module redvox.tests.pyarrow_demo
Expand source code
import os
import timeit
import pickle
from pathlib import Path
from redvox.common import data_window as dw
def demo(
    input_dir="/Users/tyler/Documents/skyfall2full/",
    save_dir="/Users/tyler/Documents/pyarrowreadertest/large_test",
):
    """
    Time building, saving and reloading DataWindows with different out_type values.

    Builds three DataWindows from one shared config (out_type "NONE", "LZ4"
    and "PARQUET"), saves the LZ4 and PARQUET ones, reloads both from
    ``save_dir``, and prints the elapsed seconds of each timed step followed
    by the size of each pickled window and the size of the saved files.

    :param input_dir: directory handed to DataWindowConfig; defaults to the
                      path that used to be hard-coded in this demo
    :param save_dir: directory used as output_dir and read back from; defaults
                     to the path that used to be hard-coded in this demo
    """
    # Local import: the module header does not import sys.
    import sys

    def _timed(label, action):
        """Call action(), print label with the elapsed seconds, return action's result."""
        start = timeit.default_timer()
        result = action()
        print(label, timeit.default_timer() - start)
        return result

    # settings.set_parallelism_enabled(True)
    dw_config = dw.DataWindowConfig(input_dir, structured_layout=True)
    # Read twice below: once to reload the window, once to measure the file.
    lz4_file = os.path.join(save_dir, "large_test_lz4.pkl.lz4")

    no_save_window = _timed(
        "nosave",
        lambda: dw.DataWindow("small_test_no_save", config=dw_config, out_type="NONE"),
    )

    lz4_window = _timed(
        "lz4",
        lambda: dw.DataWindow(
            "large_test_lz4", config=dw_config, output_dir=save_dir, out_type="LZ4"
        ),
    )
    # save() stays outside the timed region; only construction is measured.
    lz4_window.save()

    parquet_window = _timed(
        "parquet",
        lambda: dw.DataWindow(
            "large_test", config=dw_config, output_dir=save_dir, out_type="PARQUET"
        ),
    )
    parquet_window.save()

    # From here on the names refer to the windows read back from disk.
    parquet_window = _timed(
        "load parquet",
        lambda: dw.DataWindow.load(os.path.join(save_dir, "large_test.json")),
    )
    lz4_window = _timed("load lz4", lambda: dw.DataWindow.deserialize(lz4_file))

    # These figures are the size of the bytes object returned by pickle.dumps.
    # sys.getsizeof reports the same number as bytes.__sizeof__() because
    # bytes objects carry no garbage-collector header.
    print("memory used by lz4: ", sys.getsizeof(pickle.dumps(lz4_window)))
    print("memory used by nosave: ", sys.getsizeof(pickle.dumps(no_save_window)))
    print("memory used by parquet: ", sys.getsizeof(pickle.dumps(parquet_window)))

    # Sum every .parquet file found anywhere below save_dir.
    total_size = sum(os.path.getsize(f) for f in Path(save_dir).rglob('*.parquet'))
    print(f"size of parquet on disk: {total_size} B")
    print(f"size of lz4 on disk: {os.path.getsize(lz4_file)} B")
# Script entry point: run the benchmark only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    demo()
Functions
def demo()
-
Expand source code
def demo(
    input_dir="/Users/tyler/Documents/skyfall2full/",
    save_dir="/Users/tyler/Documents/pyarrowreadertest/large_test",
):
    """
    Time building, saving and reloading DataWindows with different out_type values.

    Builds three DataWindows from one shared config (out_type "NONE", "LZ4"
    and "PARQUET"), saves the LZ4 and PARQUET ones, reloads both from
    ``save_dir``, and prints the elapsed seconds of each timed step followed
    by the size of each pickled window and the size of the saved files.

    :param input_dir: directory handed to DataWindowConfig; defaults to the
                      path that used to be hard-coded in this demo
    :param save_dir: directory used as output_dir and read back from; defaults
                     to the path that used to be hard-coded in this demo
    """
    # Local import: the module header does not import sys.
    import sys

    def _timed(label, action):
        """Call action(), print label with the elapsed seconds, return action's result."""
        start = timeit.default_timer()
        result = action()
        print(label, timeit.default_timer() - start)
        return result

    # settings.set_parallelism_enabled(True)
    dw_config = dw.DataWindowConfig(input_dir, structured_layout=True)
    # Read twice below: once to reload the window, once to measure the file.
    lz4_file = os.path.join(save_dir, "large_test_lz4.pkl.lz4")

    no_save_window = _timed(
        "nosave",
        lambda: dw.DataWindow("small_test_no_save", config=dw_config, out_type="NONE"),
    )

    lz4_window = _timed(
        "lz4",
        lambda: dw.DataWindow(
            "large_test_lz4", config=dw_config, output_dir=save_dir, out_type="LZ4"
        ),
    )
    # save() stays outside the timed region; only construction is measured.
    lz4_window.save()

    parquet_window = _timed(
        "parquet",
        lambda: dw.DataWindow(
            "large_test", config=dw_config, output_dir=save_dir, out_type="PARQUET"
        ),
    )
    parquet_window.save()

    # From here on the names refer to the windows read back from disk.
    parquet_window = _timed(
        "load parquet",
        lambda: dw.DataWindow.load(os.path.join(save_dir, "large_test.json")),
    )
    lz4_window = _timed("load lz4", lambda: dw.DataWindow.deserialize(lz4_file))

    # These figures are the size of the bytes object returned by pickle.dumps.
    # sys.getsizeof reports the same number as bytes.__sizeof__() because
    # bytes objects carry no garbage-collector header.
    print("memory used by lz4: ", sys.getsizeof(pickle.dumps(lz4_window)))
    print("memory used by nosave: ", sys.getsizeof(pickle.dumps(no_save_window)))
    print("memory used by parquet: ", sys.getsizeof(pickle.dumps(parquet_window)))

    # Sum every .parquet file found anywhere below save_dir.
    total_size = sum(os.path.getsize(f) for f in Path(save_dir).rglob('*.parquet'))
    print(f"size of parquet on disk: {total_size} B")
    print(f"size of lz4 on disk: {os.path.getsize(lz4_file)} B")