# Source code for ielearn.batch_io

"""
A collection of utility functions for file i/o.
"""
import os
from abc import ABCMeta

import pandas as pd


def extension(fn):
    """Return the file extension of ``fn``, including the leading dot.

    :param fn: Path or filename to inspect.
    :return: The extension (e.g. ``".csv"``), or ``""`` if there is none.
    """
    _, ext = os.path.splitext(fn)
    return ext
class Factory(object):
    """
    Abstract factory class to dynamically return a specific internal instance.

    Subclasses should populate the ``_classes`` mapping (name -> class); the
    factory then dispatches ``Factory(name, *args, **kwargs)`` to the mapped
    class's constructor.
    """
    # NOTE(review): ``__metaclass__`` is the Python 2 spelling and is ignored
    # under Python 3.  Kept for source compatibility; there are no abstract
    # methods here, so behavior is unchanged either way.
    __metaclass__ = ABCMeta

    # defined by subclasses: maps a short name to the class it instantiates
    _classes = {}

    @staticmethod
    def __new__(cls, *args, **kwargs):
        """
        Return an instantiated instance of the correct type.

        :param args: ``args[0]`` is the name of the class to create; the
            remaining positional arguments are forwarded to its constructor.
        :param kwargs: Keyword arguments to pass to the class constructor.
        :return: Initialized instance of the requested class.
        :raises ValueError: If ``args[0]`` is not a key in ``_classes``.
        """
        # check the mapping from name to class pointer.
        class_name = args[0]
        algo = cls._classes.get(class_name)

        # if a bad key is given, raise an error.
        if algo is None:
            # BUGFIX: the two adjacent string literals used to concatenate
            # without a separating space ("...does not exist.Supported...").
            raise ValueError("The requested class {} does not exist. "
                             "Supported classes are {}".format(
                                 class_name, cls._classes.keys()))

        # if all is well, initialize the instance and return it.
        return algo(*args[1:], **kwargs)
class BatchFileHandler(object):
    """
    Base class for batched DataFrame file I/O.

    Validates the file extension, resolves the function used to join
    per-batch frames, and acts as a (no-op) context manager.
    """

    # default number of rows per batch when the caller passes a falsy size
    DEFAULT_BATCH_SIZE = 5000

    # maps a supported extension to the function that joins batch frames
    JOIN_FUNCS = {
        ".pkl": pd.concat,
        ".csv": pd.concat
    }
    SUPPORTED_EXTENSIONS = (".pkl", ".csv")

    def __init__(self, file_ext, batch_size):
        """
        :param file_ext: File extension including the dot, e.g. ``".csv"``.
        :param batch_size: Rows per batch; any falsy value falls back to
            ``DEFAULT_BATCH_SIZE``.
        :raises ValueError: If ``file_ext`` is not in ``SUPPORTED_EXTENSIONS``.
        """
        self.batch_size = batch_size or self.DEFAULT_BATCH_SIZE
        if file_ext not in self.SUPPORTED_EXTENSIONS:
            raise ValueError("Invalid file type specified: {}".format(file_ext))
        self.file_ext = file_ext
        self.join_func = BatchFileHandler.JOIN_FUNCS[self.file_ext]

    def verify_path(self, fn):
        """
        Ensure ``fn`` names an existing regular file.

        :param fn: Path to check.
        :raises IOError: If the path does not exist or is not a regular file.
        """
        # FIX: os.path.isfile() already implies existence, so the original
        # extra os.path.exists() check was redundant and has been dropped.
        if not os.path.isfile(fn):
            raise IOError("Invalid path given for read: {}".format(fn))

    def __enter__(self):
        """Enter the context manager; no resources are acquired."""
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        """
        Exit the context manager.

        FIX: returning a falsy value lets any in-flight exception propagate
        naturally with its original traceback; the previous explicit
        ``raise exc_value`` was redundant.
        """
        return False
class BatchFileReader(BatchFileHandler):
    """
    Reads files into a DataFrame in a batched fashion.
    """

    # maps a supported extension to the pandas loader for a single file
    READ_FUNCS = {
        ".pkl": pd.read_pickle,
        ".csv": pd.read_csv
    }

    def __init__(self, fns, batch_size=None):
        """
        :param fns: Non-empty sequence of file paths to read.  The extension
            of the first path selects the reader for all of them.
        :param batch_size: Rows per batch; ``None`` uses the default.
        :raises ValueError: If ``fns`` is empty or the extension is
            unsupported.
        :raises IOError: If any path is not an existing regular file.
        """
        # BUGFIX: an empty ``fns`` used to surface as a bare IndexError from
        # ``fns[0]``; fail with an explicit, actionable message instead.
        if not fns:
            raise ValueError("No input files given to BatchFileReader.")
        super(BatchFileReader, self).__init__(extension(fns[0]), batch_size)
        for fn in fns:
            self.verify_path(fn)
        self.fns = fns
        self.read_func = self.READ_FUNCS[self.file_ext]

    def read(self):
        """
        Read every file and join the per-file frames into one DataFrame.

        :return: The concatenated DataFrame.
        """
        return self.join_func([
            self.read_func(fn)
            for fn in self.fns
        ])
class BatchFileWriter(BatchFileHandler):
    """
    Writes a DataFrame to files in a batched fashion.
    """

    # maps a supported extension to the unbound DataFrame writer method
    WRITE_FUNCS = {
        ".pkl": pd.DataFrame.to_pickle,
        ".csv": pd.DataFrame.to_csv
    }

    def __init__(self, fn, batch_size=None):
        """
        :param fn: Output file path; its extension selects the writer.
        :param batch_size: Rows per batch; ``None`` uses the default.
        :raises ValueError: If the extension of ``fn`` is unsupported.
        """
        # FIX: the previous docstring documented a nonexistent ``df`` param.
        super(BatchFileWriter, self).__init__(extension(fn), batch_size)
        self.fn = fn
        self.write_func = self.WRITE_FUNCS[self.file_ext]

    def write(self, df):
        """
        Write ``df`` to ``self.fn`` with the extension-appropriate writer.

        :param df: DataFrame to serialize.
        """
        self.write_func(df, self.fn)
class BatchFileIO(Factory):
    """
    Factory returning a batched file reader (``"r"``) or writer (``"w"``).
    """

    # mode flag -> concrete handler class, consumed by Factory.__new__
    _classes = {
        "r": BatchFileReader,
        "w": BatchFileWriter,
    }
def demo():
    """Round-trip demo: read all local ``.pkl`` files, write them back out."""
    from glob import glob

    # batch read: every pickle in the working directory into one frame
    frame = BatchFileIO('r', glob("*.pkl")).read()
    # batch write: the combined frame to a single new pickle
    BatchFileIO('w', 'new_file.pkl').write(frame)
# Script entry point: run the read/write round-trip demo.
if __name__ == '__main__':
    demo()