Convert a pandas DataFrame to a NumPy array, store the data in an HDF5 file, and read it back as a NumPy array or a DataFrame.

In [108]:

import pandas as pd
import numpy as np
import h5py

In [109]:

# Fix the global NumPy seed so the sample values shown below are reproducible.
np.random.seed(1234)
# 6 rows x 4 columns of standard-normal draws, with the columns labelled A to D.
df = pd.DataFrame(np.random.randn(6, 4), columns=['A', 'B', 'C', 'D'])
df

Out[109]:

	A	B	C	D
0	0.471435	-1.190976	1.432707	-0.312652
1	-0.720589	0.887163	0.859588	-0.636524
2	0.015696	-2.242685	1.150036	0.991946
3	0.953324	-2.021255	-0.334077	0.002118
4	0.405453	0.289092	1.321158	-1.546906
5	-0.202646	-0.655969	0.193421	0.553439

6 rows × 4 columns

In [110]:

# DataFrame.as_matrix() was deprecated and then removed in pandas 1.0;
# DataFrame.to_numpy() is the supported replacement (on pandas < 0.24 use df.values).
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_numpy.html
# Returns the frame's values as a plain 2-D NumPy array (column labels are dropped).
df.to_numpy()

Out[110]:

array([[  4.71435164e-01,  -1.19097569e+00,   1.43270697e+00,
         -3.12651896e-01],
       [ -7.20588733e-01,   8.87162940e-01,   8.59588414e-01,
         -6.36523504e-01],
       [  1.56963721e-02,  -2.24268495e+00,   1.15003572e+00,
          9.91946022e-01],
       [  9.53324128e-01,  -2.02125482e+00,  -3.34077366e-01,
          2.11836468e-03],
       [  4.05453412e-01,   2.89091941e-01,   1.32115819e+00,
         -1.54690555e+00],
       [ -2.02646325e-01,  -6.55969344e-01,   1.93421376e-01,
          5.53438911e-01]])

In [111]:

# Convert the DataFrame to a NumPy record array: one record per row, one named
# field per column (see the dtype in the output below).
# index=False leaves the DataFrame index out of the records.
# http://stackoverflow.com/questions/13187778/pandas-dataframe-to-numpy-array-include-index
# http://pandas.pydata.org/pandas-docs/dev/generated/pandas.DataFrame.to_records.html?highlight=to_record#pandas.DataFrame.to_records
df_to_nparray = df.to_records(index=False)
df_to_nparray

Out[111]:

rec.array([ (0.47143516373249306, -1.1909756947064645, 1.4327069684260973, -0.3126518960917129),
       (-0.7205887333650116, 0.8871629403077386, 0.8595884137174165, -0.6365235044173491),
       (0.015696372114428918, -2.2426849541854055, 1.150035724719818, 0.9919460223426778),
       (0.9533241281124304, -2.0212548201949705, -0.334077365808097, 0.002118364683486495),
       (0.405453411570191, 0.2890919409800353, 1.3211581921293856, -1.5469055532292402),
       (-0.2026463246291819, -0.6559693441389339, 0.19342137647035826, 0.5534389109567419)], 
      dtype=[('A', '<f8'), ('B', '<f8'), ('C', '<f8'), ('D', '<f8')])

In [112]:

# http://docs.h5py.org/en/latest/high/file.html
# http://blog.tremily.us/posts/HDF5/
# http://www.sam.math.ethz.ch/~raoulb/teaching/PythonTutorial/data_storage.html

# Open the file in a context manager so it is closed even if the write fails.
# 'w' -> create the file, truncating it if it already exists. With 'a'
# (read/write if exists, create otherwise) re-running this cell fails, because
# the name 'dset' is already taken in the existing file.
with h5py.File('tuto_myfile.hdf5', 'w') as f:
    # Create dataset 'dset' from the record array; each field (A-D) is stored
    # as a member of an HDF5 compound type (see the h5dump output below).
    f['dset'] = df_to_nparray

To inspect the contents of the HDF5 file from the command line, install 'hdf5-tools'.

On Ubuntu:

$ sudo apt-get install hdf5-tools

And try:

$ h5dump tuto_myfile.hdf5

You should see output like this:

$ h5dump tuto_myfile.hdf5 
HDF5 "tuto_myfile.hdf5" {
GROUP "/" {
   DATASET "dset" {
      DATATYPE  H5T_COMPOUND {
         H5T_IEEE_F64LE "A";
         H5T_IEEE_F64LE "B";
         H5T_IEEE_F64LE "C";
         H5T_IEEE_F64LE "D";
      }
      DATASPACE  SIMPLE { ( 6 ) / ( 6 ) }
      DATA {
      (0): {
            0.471435,
            -1.19098,
            1.43271,
            -0.312652
         },
      (1): {
            -0.720589,
            0.887163,
            0.859588,
            -0.636524
         },
      (2): {
            0.0156964,
            -2.24268,
            1.15004,
            0.991946
         },
      (3): {
            0.953324,
            -2.02125,
            -0.334077,
            0.00211836
         },
      (4): {
            0.405453,
            0.289092,
            1.32116,
            -1.54691
         },
      (5): {
            -0.202646,
            -0.655969,
            0.193421,
            0.553439
         }
      }
   }
}
}

In [113]:

# Read the data back from the HDF5 file with h5py.

# Open the file.
# 'r' -> read-only, file must exist
# NOTE: the file is intentionally left open here; it is closed in the next
# cell, after the dataset contents have been read.
f = h5py.File('tuto_myfile.hdf5', 'r')

# Look up the dataset named 'dset'. As the output below shows, this is an
# h5py dataset handle, not yet a NumPy array.
dset = f['dset']
dset

Out[113]:

<HDF5 dataset "dset": shape (6,), type "|V32">

In [114]:

# Index with Ellipsis to read the whole dataset into `a` (displayed below as a
# NumPy structured array). Do this before closing the file that backs `dset`.
a = dset[...]
# Close the file opened in the previous cell.
f.close()

In [115]:

a

Out[115]:

array([ (0.47143516373249306, -1.1909756947064645, 1.4327069684260973, -0.3126518960917129),
       (-0.7205887333650116, 0.8871629403077386, 0.8595884137174165, -0.6365235044173491),
       (0.015696372114428918, -2.2426849541854055, 1.150035724719818, 0.9919460223426778),
       (0.9533241281124304, -2.0212548201949705, -0.334077365808097, 0.002118364683486495),
       (0.405453411570191, 0.2890919409800353, 1.3211581921293856, -1.5469055532292402),
       (-0.2026463246291819, -0.6559693441389339, 0.19342137647035826, 0.5534389109567419)], 
      dtype=[('A', '<f8'), ('B', '<f8'), ('C', '<f8'), ('D', '<f8')])

In [116]:

# Load the same 'dset' dataset again, this time through pandas rather than h5py.
# pandas HDF5 I/O docs: http://pandas.pydata.org/pandas-docs/dev/io.html#hdf5-pytables
df2 = pd.read_hdf('tuto_myfile.hdf5', key='dset')

In [117]:

# Display the DataFrame read back from the HDF5 file; compare with the original `df` above.
df2

Out[117]:

	A	B	C	D
0	0.471435	-1.190976	1.432707	-0.312652
1	-0.720589	0.887163	0.859588	-0.636524
2	0.015696	-2.242685	1.150036	0.991946
3	0.953324	-2.021255	-0.334077	0.002118
4	0.405453	0.289092	1.321158	-1.546906
5	-0.202646	-0.655969	0.193421	0.553439

6 rows × 4 columns

In [118]:

# Clean up: delete the HDF5 file created by this notebook.
# Comment this line out if you want to keep the file for inspection.
# NOTE: `rm` is a Unix shell command; -f suppresses the error if the file is missing.
! rm -f tuto_myfile.hdf5

P.S.: I know, I know... pandas can store DataFrames directly in HDF5: http://pandas.pydata.org/pandas-docs/dev/io.html#io-hdf5

;)