MOAB: Mesh Oriented datABase  (version 5.4.1)
ReadHDF5VarLen.cpp
00001 /** \file   ReadHDF5VarLen.cpp
00002  *  \author Jason Kraftcheck
00003  *  \date   2010-09-04
00004  */
00005 
00006 #include "ReadHDF5VarLen.hpp"
00007 #include "ReadHDF5Dataset.hpp"
00008 #include "H5Tpublic.h"
00009 #include <cassert>
00010 
00011 namespace moab
00012 {
00013 
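// Note: is_ranged() assumes callers test file IDs in sorted order and that
// ranged_file_ids is a sorted subset of those IDs, so the current ranged entry can
// never be smaller than the ID being tested (hence the assert below).  A return
// value of true indicates that the data for this ID was written with range-encoded
// ("ranged") storage, and the iterator is advanced past the matched entry.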
00014 bool ReadHDF5VarLen::is_ranged( EntityHandle file_id,
00015                                 Range::const_iterator& ranged_iter,
00016                                 Range::const_iterator range_end )
00017 {
00018     if( ranged_iter == range_end ) return false;
00019 
00020     assert( file_id <= *ranged_iter );
00021     if( *ranged_iter != file_id ) return false;
00022 
00023     ++ranged_iter;
00024     return true;
00025 }
00026 
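// read_data() streams the variable-length values in chunks of at most bufferSize
// bytes through dataBuffer.  A chunk boundary may fall in the middle of one
// entity's value list; the trailing, incomplete list is copied into 'partial' and
// completed on the next pass, while each fully-contained list is passed directly
// to store_data() along with its 'ranged' flag.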
00027 ErrorCode ReadHDF5VarLen::read_data( ReadHDF5Dataset& data_set,
00028                                      const Range& offsets,
00029                                      EntityHandle start_offset,
00030                                      hid_t data_type,
00031                                      const Range& file_ids,
00032                                      const std::vector< unsigned >& vals_per_ent,
00033                                      const Range& ranged_file_ids )
00034 {
00035     ErrorCode rval;
00036     const size_t value_size          = H5Tget_size( data_type );
00037     const size_t buffer_size         = bufferSize / value_size;
00038     unsigned char* const data_buffer = reinterpret_cast< unsigned char* >( dataBuffer );
00039     std::vector< unsigned char > partial;  // for when we read only part of the contents of a set/entity
00040     Range::const_iterator fileid_iter                  = file_ids.begin();
00041     Range::const_iterator ranged_iter                  = ranged_file_ids.begin();
00042     std::vector< unsigned >::const_iterator count_iter = vals_per_ent.begin();
00043     size_t count, offset;
00044     bool ranged;
00045     int nn = 0;
00046 
00047     assert( file_ids.size() == vals_per_ent.size() );
00048 
00049     try
00050     {
00051         data_set.set_file_ids( offsets, start_offset, buffer_size, data_type );
00052     }
00053     catch( ReadHDF5Dataset::Exception& )
00054     {
00055         return MB_FAILURE;
00056     }
00057 
00058     dbgOut.printf( 3, "Reading %s in %lu chunks\n", data_set.get_debug_desc(), data_set.get_read_count() );
00059 
00060     while( !data_set.done() )
00061     {
00062         dbgOut.printf( 3, "Reading chunk %d of %s\n", ++nn, data_set.get_debug_desc() );
00063         try
00064         {
00065             data_set.read( data_buffer, count );
00066         }
00067         catch( ReadHDF5Dataset::Exception& )
00068         {
00069             return MB_FAILURE;
00070         }
00071 
00072         assert( 0 == count || fileid_iter != file_ids.end() );
00073 
00074         // Handle 'special' case where we read some, but not all
00075         // of the data for an entity during the last iteration.
00076         offset = 0;
00077         if( !partial.empty() )
00078         {  // didn't read all of previous entity
00079             assert( fileid_iter != file_ids.end() );
00080             assert( 0 == ( partial.size() % value_size ) );
00081             size_t num_prev = partial.size() / value_size;
00082             offset          = *count_iter - num_prev;
00083             if( offset > count )
00084             {  // still don't have all
00085                 partial.insert( partial.end(), data_buffer, data_buffer + count * value_size );
00086                 continue;
00087             }
00088 
00089             partial.insert( partial.end(), data_buffer, data_buffer + offset * value_size );
00090 
00091             ranged = is_ranged( *fileid_iter, ranged_iter, ranged_file_ids.end() );
00092             assert( partial.size() == *count_iter * value_size );
00093             rval = store_data( *fileid_iter, &partial[0], *count_iter, ranged );
00094             if( MB_SUCCESS != rval ) return rval;
00095 
00096             ++count_iter;
00097             ++fileid_iter;
00098             partial.clear();
00099         }
00100 
00101         // Process contents for all entities for which we
00102         // have read the complete list
00103         while( count_iter != vals_per_ent.end() && offset + *count_iter <= count )
00104         {
00105             assert( fileid_iter != file_ids.end() );
00106             ranged = is_ranged( *fileid_iter, ranged_iter, ranged_file_ids.end() );
00107             rval   = store_data( *fileid_iter, data_buffer + offset * value_size, *count_iter, ranged );
00108             if( MB_SUCCESS != rval ) return rval;
00109 
00110             offset += *count_iter;
00111             ++count_iter;
00112             ++fileid_iter;
00113         }
00114 
00115         // If we did not read all of the final entity,
00116         // store what we did read to be processed in the
00117         // next iteration
00118         if( offset < count )
00119         {
00120             assert( partial.empty() );
00121             partial.insert( partial.end(), data_buffer + offset * value_size, data_buffer + count * value_size );
00122         }
00123     }
00124     // NOTE: If the last set is empty, we will not process it here
00125     // assert(fileid_iter == file_ids.end());
00126 #ifndef NDEBUG
00127     for( ; fileid_iter != file_ids.end(); ++fileid_iter )
00128     {
00129         assert( 0 == *count_iter );
00130         ++count_iter;
00131     }
00132 #endif
00133     return MB_SUCCESS;
00134 }
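// The routine below is a multi-column variant of read_offsets() (multiple index
// columns with per-column offset and count outputs).  It is kept commented out for
// reference; the single-column implementation that follows it is the one in use.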
00135 /*
00136 ErrorCode ReadHDF5VarLen::read_offsets( ReadHDF5Dataset& data_set,
00137                                         const Range& file_ids,
00138                                         EntityHandle start_file_id,
00139                                         unsigned num_columns,
00140                                         const unsigned indices[],
00141                                         EntityHandle nudge,
00142                                         Range offsets_out[],
00143                                         std::vector<unsigned> counts_out[],
00144                                         Range* ranged_file_ids = 0 )
00145 {
00146   const int local_index = 1;
00147 
00148     // sanity check
00149   const unsigned max_cols = ranged_file_ids ? data_set.columns() - 1 : data_set.columns();
00150   for (unsigned i = 0; i < num_columns; ++i) {
00151     assert(indices[i] < max_cols);
00152     if (indices[i] >= max_cols)
00153       return MB_FAILURE;
00154   }
00155 
00156     // Use hints to make sure insertion into ranges is O(1)
00157   std::vector<Range::iterator> hints;
00158   if (ranged_file_ids) {
00159     hints.resize( num_columns + 1 );
00160     hints.back() = ranged_file_ids->begin();
00161   }
00162   else {
00163     hints.resize( num_columns );
00164   }
00165   for (unsigned i = 0; i < num_columns; ++i) {
00166     offsets_out[i].clear();
00167     counts_out[i].clear();
00168     counts_out[i].reserve( file_ids.size() );
00169     hints[i] = offsets_out[i].begin();
00170   }
00171 
00172     // If we only need one column from a multi-column data set,
00173     // then read only that column.
00174   if (num_columns == 1 && data_set.columns() > 1 && !ranged_file_ids) {
00175     data_set.set_column( indices[0] );
00176     indices = &local_index;
00177   }
00178   else if (ranged_file_ids && data_set.columns() > 1 && 0 == num_columns) {
00179     data_set.set_column( data_set.columns() - 1 );
00180   }
00181     // NOTE: do not move this above the previous block.
00182     //       The previous block changes the results of data_set.columns()!
00183   const size_t table_columns = data_set.columns();
00184 
00185     // Calculate which rows we need to read from the offsets table
00186   Range rows;
00187   Range::iterator hint = rows.begin();
00188   Range::const_pair_iterator pair = file_ids.const_pair_begin();
00189     // special case if reading first entity in dataset, because
00190     // there is no previous end value.
00191   if (pair != file_ids.const_pair_end() && pair->first == start_file_id)
00192     { hint = rows.insert( nudge, pair->second - start_file_id + nudge ); ++pair; }
00193   while (pair != file_ids.const_pair_end()) {
00194     hint = rows.insert( hint,
00195                         pair->first + nudge - 1 - start_file_id,
00196                         pair->second + nudge - start_file_id );
00197     ++pair;
00198   }
00199 
00200     // set up read of offsets dataset
00201   hsize_t buffer_size = bufferSize / (sizeof(hssize_t) * data_set.columns());
00202   hssize_t* buffer = reinterpret_cast<hssize_t*>(dataBuffer);
00203   data_set.set_file_ids( rows, nudge, buffer_size, H5T_NATIVE_HSSIZE );
00204   std::vector<hssize_t> prev_end;
00205     // If we're reading the first row of the table, then the
00206     // previous end is implicitly -1.
00207   if (!file_ids.empty() && file_ids.front() == start_file_id)
00208     prev_end.resize(num_columns,-1);
00209 
00210     // read offset table
00211   size_t count, offset;
00212   Range::const_iterator fiter = file_ids.begin();
00213   while (!data_set.done()) {
00214     try {
00215       data_set.read( buffer, count );
00216     }
00217     catch (ReadHDF5Dataset::Exception&) {
00218       return MB_FAILURE;
00219     }
00220     if (!count) // might have been NULL read for collective IO
00221       continue;
00222 
00223       // If the previous end values were read in the previous iteration,
00224       // then they're stored in prev_end.
00225     offset = 0;
00226     if (!prev_end.empty()) {
00227        for (unsigned i = 0; i < num_columns; ++i) {
00228         counts_out[i].push_back( buffer[indices[i]] - prev_end[i] );
00229         hints[i] = offsets_out[i].insert( hints[i],
00230                                           prev_end[i] + 1 + nudge,
00231                                           buffer[indices[i]] + nudge );
00232       }
00233       if (ranged_file_ids && (buffer[table_columns-1] & mhdf_SET_RANGE_BIT))
00234         hints.back() = ranged_file_ids->insert( hints.back(), *fiter );
00235       ++fiter;
00236       offset = 1;
00237       prev_end.clear();
00238     }
00239 
00240     while (offset < count) {
00241       assert(fiter != file_ids.end());
00242         // whenever we get to a gap between blocks we need to
00243         // advance one step because we read an extra end id
00244         // preceding each block
00245       if (fiter == fiter.start_of_block()) {
00246         if (offset == count-1)
00247           break;
00248         ++offset;
00249       }
00250 
00251       for (unsigned i = 0; i < num_columns; ++i) {
00252         size_t s = buffer[(offset-1)*table_columns+indices[i]] + 1;
00253         size_t e = buffer[ offset   *table_columns+indices[i]];
00254         counts_out[i].push_back( e - s + 1 );
00255         hints[i] = offsets_out[i].insert( hints[i], s + nudge, e + nudge );
00256       }
00257       if (ranged_file_ids && (buffer[offset*table_columns+table_columns-1] & mhdf_SET_RANGE_BIT))
00258         hints.back() = ranged_file_ids->insert( hints.back(), *fiter );
00259 
00260       ++fiter;
00261       ++offset;
00262     }
00263 
00264       // If we did not end on the boundary between two blocks,
00265       // then we need to save the end indices for the final entry
00266       // for use in the next iteration.  Similarly, if we ended
00267       // with extra values that were read with the express intention
00268       // of getting the previous end values for a block, we need to
00269       // save them.  This case only arises if we hit the break in
00270       // the above loop.
00271     if (fiter != fiter.start_of_block() || offset < count) {
00272       assert(prev_end.empty());
00273       if (offset == count) {
00274         --offset;
00275         assert(fiter != fiter.start_of_block());
00276       }
00277       else {
00278         assert(offset+1 == count);
00279         assert(fiter == fiter.start_of_block());
00280       }
00281       for (unsigned i = 0; i < num_columns; ++i)
00282         prev_end.push_back(buffer[offset*table_columns+indices[i]]);
00283     }
00284   }
00285   assert(prev_end.empty());
00286   assert(fiter == file_ids.end());
00287 
00288   return MB_SUCCESS;
00289 }
00290 */
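// read_offsets() reads the end-index ("offsets") table rows for the requested file
// IDs.  Row i of that table holds the index of the last value belonging to entity
// i, so entity i owns values [end[i-1]+1, end[i]] and its count is
// end[i] - end[i-1]; for the first row the previous end is implicitly -1.  For
// example, end values {2, 5, 9} yield counts {3, 3, 4}.  For each contiguous block
// of file IDs one extra preceding row is read so that the previous end value is
// available; the start_of_block() checks in the loop below skip over that extra
// value.  The 'nudge' is added to every value inserted into 'rows' and
// 'offsets_out' so that an offset of 0 (or the implicit -1) never ends up as
// handle 0, which is not a valid entry in a Range.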
00291 ErrorCode ReadHDF5VarLen::read_offsets( ReadHDF5Dataset& data_set,
00292                                         const Range& file_ids,
00293                                         EntityHandle start_file_id,
00294                                         EntityHandle nudge,
00295                                         Range& offsets_out,
00296                                         std::vector< unsigned >& counts_out )
00297 {
00298 
00299     // Use hints to make sure insertion into ranges is O(1)
00300     offsets_out.clear();
00301     counts_out.clear();
00302     counts_out.reserve( file_ids.size() );
00303     Range::iterator hint;
00304 
00305     // Calculate which rows we need to read from the offsets table
00306     Range rows;
00307     hint                            = rows.begin();
00308     Range::const_pair_iterator pair = file_ids.const_pair_begin();
00309     // special case if reading first entity in dataset, because
00310     // there is no previous end value.
00311     if( pair != file_ids.const_pair_end() && pair->first == start_file_id )
00312     {
00313         hint = rows.insert( nudge, pair->second - start_file_id + nudge );
00314         ++pair;
00315     }
00316     while( pair != file_ids.const_pair_end() )
00317     {
00318         hint = rows.insert( hint, pair->first - start_file_id + nudge - 1, pair->second - start_file_id + nudge );
00319         ++pair;
00320     }
00321 
00322     // set up read of offsets dataset
00323     hsize_t buffer_size = bufferSize / sizeof( hssize_t );
00324     hssize_t* buffer    = reinterpret_cast< hssize_t* >( dataBuffer );
00325     data_set.set_file_ids( rows, nudge, buffer_size, H5T_NATIVE_HSSIZE );
00326     hssize_t prev_end;
00327     bool have_prev_end = false;
00328     // If we're reading the first row of the table, then the
00329     // previous end is implicitly -1.
00330     if( !file_ids.empty() && file_ids.front() == start_file_id )
00331     {
00332         prev_end      = -1;
00333         have_prev_end = true;
00334     }
00335 
00336     dbgOut.printf( 3, "Reading %s in %lu chunks\n", data_set.get_debug_desc(), data_set.get_read_count() );
00337 
00338     // read offset table
00339     size_t count, offset;
00340     Range::const_iterator fiter = file_ids.begin();
00341     hint                        = offsets_out.begin();
00342     int nn                      = 0;
00343     while( !data_set.done() )
00344     {
00345         dbgOut.printf( 3, "Reading chunk %d of %s\n", ++nn, data_set.get_debug_desc() );
00346         try
00347         {
00348             data_set.read( buffer, count );
00349         }
00350         catch( ReadHDF5Dataset::Exception& )
00351         {
00352             return MB_FAILURE;
00353         }
00354         if( !count )  // might have been NULL read for collective IO
00355             continue;
00356 
00357         // If the previous end values were read in the previous iteration,
00358         // then they're stored in prev_end.
00359         offset = 0;
00360         if( have_prev_end )
00361         {
00362             counts_out.push_back( buffer[0] - prev_end );
00363             hint = offsets_out.insert( hint, prev_end + 1 + nudge, buffer[0] + nudge );
00364             ++fiter;
00365             offset        = 1;
00366             have_prev_end = false;
00367         }
00368 
00369         while( offset < count )
00370         {
00371             assert( fiter != file_ids.end() );
00372             // whenever we get to a gap between blocks we need to
00373             // advance one step because we read an extra end id
00374             // preceding each block
00375             if( fiter == fiter.start_of_block() )
00376             {
00377                 if( offset == count - 1 ) break;
00378                 ++offset;
00379             }
00380 
00381             size_t s = buffer[offset - 1] + 1;
00382             size_t e = buffer[offset];
00383             counts_out.push_back( e - s + 1 );
00384             hint = offsets_out.insert( hint, s + nudge, e + nudge );
00385 
00386             ++fiter;
00387             ++offset;
00388         }
00389 
00390         // If we did not end on the boundary between two blocks,
00391         // then we need to save the end indices for the final entry
00392         // for use in the next iteration.  Similarly, if we ended
00393         // with extra values that were read with the express intention
00394         // of getting the previous end values for a block, we need to
00395         // save them.  This case only arises if we hit the break in
00396         // the above loop.
00397         if( fiter != fiter.start_of_block() || offset < count )
00398         {
00399             assert( !have_prev_end );
00400             if( offset == count )
00401             {
00402                 --offset;
00403                 assert( fiter != fiter.start_of_block() );
00404             }
00405             else
00406             {
00407                 assert( offset + 1 == count );
00408                 assert( fiter == fiter.start_of_block() );
00409             }
00410             have_prev_end = true;
00411             prev_end      = buffer[offset];
00412         }
00413     }
00414     assert( !have_prev_end );
00415     assert( fiter == file_ids.end() );
00416 
00417     return MB_SUCCESS;
00418 }
00419 
00420 }  // namespace moab