/ Published in: Python
Expand |
Embed | Plain Text
"""Benchmark pickle against numpy ``tofile``/``fromfile`` for a list of dicts.

Ten records, each holding a 1-D and a 2-D float array, are written and read
back twice: once as a single pickle file and once as one raw binary file per
array under ``fromfile/<record index>/<array name>``.  The elapsed time of
each of the four steps is printed.
"""
import os
from datetime import datetime

import numpy

try:
    import cPickle as pickle  # Python 2: C-accelerated implementation
except ImportError:
    import pickle  # Python 3: the accelerator is used automatically


def dt2sec(dt):
    """Return the duration of the ``timedelta`` *dt* in seconds (float).

    The previous formula, ``microseconds / 1000.0 + seconds``, added
    milliseconds to seconds and ignored ``days``; ``total_seconds()``
    accounts for every component in a single unit.
    """
    return dt.total_seconds()


# Benchmark payload: ten records with a 100-element vector "a" and a
# 10x10 matrix "b", both float.
data = [
    {
        "a": numpy.arange(100, dtype=float),
        "b": numpy.arange(100, dtype=float).reshape(10, 10),
    }
    for _ in range(10)
]

pickle_name = "data.pickle"
ff_dir = "fromfile"


def _ensure_dir(path):
    """Create directory *path* unless it already exists (safe on re-runs)."""
    if not os.path.isdir(path):
        os.mkdir(path)


def _timed(label, func, *args):
    """Call ``func(*args)``, print *label* with the elapsed seconds, return the result."""
    t0 = datetime.now()
    result = func(*args)
    t1 = datetime.now()
    print("%s %s" % (label, dt2sec(t1 - t0)))
    return result


def write_pickle(records, path):
    """Pickle *records* to *path*.

    The file is opened in binary mode and closed deterministically.  The
    highest protocol is used because the default (text) protocol is far
    slower and larger for numeric arrays, which would skew the comparison.
    """
    with open(path, "wb") as handle:
        pickle.dump(records, handle, pickle.HIGHEST_PROTOCOL)


def read_pickle(path):
    """Load and return the object pickled at *path*.

    Only use this on files written by this script: unpickling untrusted
    data can execute arbitrary code.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


def write_tofile(records, root):
    """Write every array of every record to ``root/<index>/<name>``."""
    for index, record in enumerate(records):
        record_dir = os.path.join(root, str(index))
        _ensure_dir(record_dir)
        for name, array in record.items():
            array.tofile(os.path.join(record_dir, name))


def read_fromfile(root):
    """Read back the records written by :func:`write_tofile`.

    Record directories are visited in numeric order so the result lines up
    with the list that was written (``os.listdir`` order is arbitrary).

    NOTE: ``tofile`` stores raw values only, so ``numpy.fromfile`` returns
    flat float arrays; the 10x10 shape of "b" is not restored.
    """
    records = []
    for entry in sorted(os.listdir(root), key=int):
        record_dir = os.path.join(root, entry)
        record = {}
        for name in os.listdir(record_dir):
            record[name] = numpy.fromfile(os.path.join(record_dir, name))
        records.append(record)
    return records


def main():
    """Run the write and read benchmarks; return ``(pickle_data, ff_data)``."""
    # Created before timing starts, as in the original measurement.
    _ensure_dir(ff_dir)

    # ----- write data
    print("write data")
    _timed("pickle.dump :", write_pickle, data, pickle_name)
    _timed("tofile :", write_tofile, data, ff_dir)

    # ----- read data
    print("read data")
    pickle_data = _timed("pickle.load :", read_pickle, pickle_name)
    ff_data = _timed("fromfile :", read_fromfile, ff_dir)
    return pickle_data, ff_data


if __name__ == "__main__":
    main()

# Sample output recorded with the original script, i.e. with the old timing
# formula (microseconds / 1000 + seconds, so sub-second runs read as
# milliseconds) and the default pickle protocol:
## write data
## pickle.dump : 9.328
## tofile : 2.946
## read data
## pickle.load : 15.423
## fromfile : 1.858
Comments
Subscribe to comments
You need to login to post a comment.

When the data set grows to thousands or millions of records, tofile and fromfile will probably suffer from the I/O overhead. Have you tested on a bigger data set, say 100,000 records? Also, creating a directory for every entry seems like too much overhead, and I think it will run into OS resource limits.
I think you really want to use pickle.HIGHEST_PROTOCOL. It makes a factor of ten or twenty difference (on top of the cPickle/pickle difference) for both writing and reading.