• Main Page
  • Related Pages
  • Modules
  • Data Structures
  • Files
  • File List
  • Globals

sst/elements/genericProc/programs/MTGL/mtgl/mtgl_string.h

Go to the documentation of this file.
00001 /*  _________________________________________________________________________
00002  *
00003  *  MTGL: The MultiThreaded Graph Library
00004  *  Copyright (c) 2008 Sandia Corporation.
00005  *  This software is distributed under the BSD License.
00006  *  Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
00007  *  the U.S. Government retains certain rights in this software.
00008  *  For more information, see the README file in the top MTGL directory.
00009  *  _________________________________________________________________________
00010  */
00011 
00012 /****************************************************************************/
00013 /*! \file mtgl_string.h
00014 
00015     \author Eric Goodman (elgoodm@sandia.gov)
00016     \author Brad Mancke (bmancke@bbn.com)
00017 
00018     \brief A file with utility functions operating on character arrays.
00019 
00020     \date April 7, 2010
00021 */
00022 /****************************************************************************/
00023 
00024 #ifndef MTGL_MTGL_STRING_H
00025 #define MTGL_MTGL_STRING_H
00026 
00027 #define NUM_STREAMS_TOKENIZE 60
00028 
00029 #include <cstdlib>
00030 #include <cstdio>
00031 #include <cstring>
00032 #include <cmath>
00033 
00034 #include <mtgl/util.hpp>
00035 
00036 namespace mtgl {
00037 
/*! \brief Tests whether short_string occurs at the very start of
           long_string.

    \param long_string The text being examined.
    \param long_len The number of readable characters in long_string.
    \param short_string The pattern expected at the start of long_string.
    \param short_len The number of characters in short_string.

    \returns true when every one of the short_len characters of
             short_string equals the corresponding leading character of
             long_string.  An empty pattern (short_len <= 0) matches
             trivially.  A pattern that is longer than the text can never
             match, so false is returned in that case.

    Neither buffer has to be null terminated; only the lengths passed in
    are used.
*/
inline bool matches(const char* long_string, int long_len,
                    const char* short_string, int short_len)
{
  if (short_len <= 0) return true;

  // A text that ends before the pattern does cannot contain the pattern.
  // Comparing only the overlapping prefix would report a truncated
  // delimiter at the end of a buffer as a full match.
  if (long_len < short_len) return false;

  for (int i = 0; i < short_len; i++)
  {
    if (long_string[i] != short_string[i]) return false;
  }

  return true;
}
00053 
/*! \brief Normalizes a delimiter string in place.

    Two kinds of characters are removed:
      - a single or double quote found in the first position, and
      - any backslash that is immediately followed by '<' or '>'
        (the bracket itself is kept).

    All other characters are kept in their original order.

    \param delimiter A writable, null-terminated string.  It is modified in
                     place, so it must not point at a string literal.

    \returns The same pointer that was passed in.  A null pointer is
             returned unchanged.
*/
inline char* remove_escapes(char* delimiter)
{
  if (!delimiter) return delimiter;

  size_t len = strlen(delimiter);
  size_t new_len = 0;

  for (size_t i = 0; i < len; i++)
  {
    char c = delimiter[i];

    // Strip one leading quote.  The test must be "is a quote"; testing
    // "is not a single quote" would discard almost any first character.
    if (i == 0 && (c == '\"' || c == '\'')) continue;

    // Drop the backslash of "\<" and "\>".  The bracket is copied on the
    // next iteration.
    if (c == '\\' && i + 1 < len &&
        (delimiter[i + 1] == '<' || delimiter[i + 1] == '>'))
    {
      continue;
    }

    // new_len never exceeds i, so compacting in place cannot overwrite
    // characters that have not been read yet.
    delimiter[new_len] = c;
    new_len++;
  }

  delimiter[new_len] = '\0';

#ifdef DEBUG
  printf("delimiter %s\n", delimiter);
#endif

  return delimiter;
}
00097 
00098 /*! \brief The function finds all sequences of alphanumeric characters
00099             and tokenizes them.
00100 
00101     \param array The character array to be tokenized.
00102     \param num_chars The number of characters in the array.
00103     \param num_words This paramter is set to the number of tokenized words.
00104     \param est_num_words The word array is allocated before it is known how
00105                           many words exist.  This parameter allows the user to
00106                           provide the estimate.  Otherwise, the word array
00107                           is allocated to be the same size as the character
00108                           array.
00109 
00110     \returns Returns a 2-dimensional character array that contains the
00111             tokenized words.
00112 
00113     CAUTION: The function canabalizes the input array, inserting null
00114              terminators in place of the start of an instance of a delimiter.
00115 */
00116 char** mtgl_strtok(char* array, int num_chars, int& num_words,
00117                    int est_num_words = -1)
00118 {
00119   num_words = 0;
00120   if (est_num_words < 0) est_num_words = num_chars;
00121 
00122   // In allocating the word buffer, we over-estimate the number of words.
00123   char** words = (char**) malloc((est_num_words) * sizeof(char*));
00124 
00125   int i = 0;
00126   int num_streams = 1;
00127 
00128   // Tokenize the string.
00129   #pragma mta trace "Before tokenization for loop"
00130   #pragma mta use NUM_STREAMS_TOKENIZE streams
00131   #pragma mta for all streams i of num_streams
00132   {
00133     int beg, end;
00134     determine_beg_end(num_chars, num_streams, i, beg, end);
00135     int num_local_words = 0;
00136 
00137     // Special case processing for beginning.
00138     if (beg == 0)
00139     {
00140       if (isalnum(array[beg])) num_local_words++;
00141     }
00142 
00143     // First go through and see how many words there are.
00144     if (end != num_chars)
00145     {
00146       for (int j = beg; j < end; j++)
00147       {
00148         if (!isalnum(array[j]))
00149         {
00150           array[j] = '\0';
00151 
00152           if (isalnum(array[j + 1])) num_local_words++;
00153         }
00154       }
00155     }
00156     else
00157     {
00158       for (int j = beg; j < end - 1; j++)
00159       {
00160         if (!isalnum(array[j]))
00161         {
00162           array[j] = '\0';
00163 
00164           if (isalnum(array[j + 1])) num_local_words++;
00165         }
00166       }
00167     }
00168 
00169     // Claim this thread's portion of the word array.
00170     int word_index = mt_incr(num_words, num_local_words);
00171 
00172     // Go through again and add the words.
00173     if (end != num_chars)
00174     {
00175       for (int j = beg; j < end; j++)
00176       {
00177         if (array[j] == '\0')
00178         {
00179           if (isalnum(array[j + 1]))
00180           {
00181             words[word_index] = &array[j + 1];
00182             word_index++;
00183           }
00184         }
00185       }
00186     }
00187     else
00188     {
00189       for (int j = beg; j < end - 1; j++)
00190       {
00191         if (array[j] == '\0')
00192         {
00193           if (isalnum(array[j + 1]))
00194           {
00195             words[word_index] = &array[j + 1];
00196             word_index++;
00197           }
00198         }
00199       }
00200     }
00201 
00202     // Special case processing for beginning.
00203     if (beg == 0)
00204     {
00205       if (isalnum(array[beg]))
00206       {
00207         words[word_index] = &array[beg];
00208         word_index++;
00209       }
00210     }
00211 
00212   }
00213 
00214   array[num_chars - 1] = '\0';        // Might mess up the last word.
00215 
00216   return words;
00217 }
00218 
00219 /*! \brief Similar to the other mtgl_strtok function, but this determines
00220            the token boundaries according to a specified delimiter.
00221 
00222     \param array The character array to be tokenized.
00223     \param num_chars The number of characters in the array.
00224     \param num_words This paramter is set to the number of tokenized words.
00225     \param char* delimiter The string used to found boudaries between tokens.
00226     \param est_num_words The word array is allocated before it is known how
00227                           many words exist.  This parameter allows the user to
00228                           provide the estimate.  Otherwise, the word array
00229                           is allocated to be the same size as the character
00230                           array.
00231 
00232     \returns Returns a 2-dimensional character array that contains the
00233             tokenized words.
00234 
00235     Currently, regular expressions in the delimiter is not supported.
00236 
00237     CAUTION: The function canabalizes the input array, inserting null
00238              terminators in place of the start of an instance of a delimiter.
00239 */
00240 char** mtgl_strtok(char* array, int num_chars, int& num_words,
00241                    char* delimiter, int est_num_words = -1)
00242 {
00243 
00244   delimiter = remove_escapes(delimiter);
00245 
00246   num_words = 0;
00247   if (est_num_words < 0) est_num_words = num_chars;
00248 
00249   int delimiter_length = strlen(delimiter);
00250 
00251   int max_num_procs = 512;
00252   int max_num_streams = NUM_STREAMS_TOKENIZE * max_num_procs;
00253   int size_replicated = max_num_streams * delimiter_length;
00254 
00255   char* delimiter_replicated = (char*) malloc(sizeof(char) * size_replicated);
00256   for (int i = 0; i < size_replicated; i++)
00257   {
00258     delimiter_replicated[i] = delimiter[i % delimiter_length];
00259   }
00260 
00261   // In allocating the word buffer, we over-estimate the number of words.
00262   char** words = (char**) malloc((est_num_words) * sizeof(char*));
00263 
00264   int i = 0;
00265   int num_streams = 1;
00266 
00267   // Tokenize the string.
00268   #pragma mta trace "Before tokenization for loop"
00269   #pragma mta use NUM_STREAMS_TOKENIZE streams
00270   #pragma mta for all streams i of num_streams
00271   {
00272     int beg, end;
00273     determine_beg_end(num_chars, num_streams, i, beg, end);
00274 
00275     int num_local_words = 0;
00276 
00277     // Special case processing for beginning.
00278     if (beg == 0)
00279     {
00280       if (matches(&array[beg], num_chars,
00281                   &delimiter_replicated[i * delimiter_length],
00282                   delimiter_length))
00283       {
00284         num_local_words++;
00285       }
00286     }
00287 
00288     // First go through and see how many words there are.
00289     if (end != num_chars)
00290     {
00291       for (int j = beg; j < end; j++)
00292       {
00293         if (matches(&array[j], num_chars - j,
00294                     &delimiter_replicated[i * delimiter_length],
00295                     delimiter_length))
00296         {
00297           array[j] = '\0';
00298 
00299           if (isalnum(array[j + 1])) num_local_words++;
00300         }
00301       }
00302     }
00303     else
00304     {
00305       for (int j = beg; j < end - delimiter_length + 1; j++)
00306       {
00307         if (matches(&array[j], num_chars - j,
00308                     &delimiter_replicated[i * delimiter_length],
00309                     delimiter_length))
00310         {
00311           array[j] = '\0';
00312 
00313           if (isalnum(array[j + 1])) num_local_words++;
00314         }
00315       }
00316     }
00317 
00318     // Claim this thread's portion of the word array.
00319     int word_index = mt_incr(num_words, num_local_words);
00320 
00321     // Go through again and add the words.
00322     if (end != num_chars)
00323     {
00324       for (int j = beg; j < end; j++)
00325       {
00326         if (array[j] == '\0' &&
00327             matches(&array[j + 1], num_chars - (j + 1),
00328                     &delimiter_replicated[i * delimiter_length + 1],
00329                     delimiter_length - 1))
00330 
00331         {
00332           words[word_index] = &array[j + delimiter_length];
00333           word_index++;
00334         }
00335       }
00336     }
00337     else
00338     {
00339       for (int j = beg; j < end - delimiter_length; j++)
00340       {
00341         if (array[j] == '\0' &&
00342             matches(&array[j + 1], num_chars - (j + 1),
00343                     &delimiter_replicated[i * delimiter_length + 1],
00344                     delimiter_length - 1))
00345 
00346         {
00347           words[word_index] = &array[j + delimiter_length];
00348           word_index++;
00349         }
00350       }
00351     }
00352 
00353     // Special case processing for beginning.
00354     if (beg == 0)
00355     {
00356       if (matches(&array[beg], num_chars,
00357                   &delimiter_replicated[i * delimiter_length],
00358                   delimiter_length))
00359       {
00360         words[word_index] = &array[beg + delimiter_length];
00361         word_index++;
00362       }
00363     }
00364 
00365   }
00366 
00367   array[num_chars - 1] = '\0';        // Might mess up the last word.
00368 
00369   return words;
00370 }
00371 
00372 }
00373 
00374 #endif

Generated on Fri Oct 22 2010 11:02:23 for SST by  doxygen 1.7.1