/*
 * FOSSology 3.2.0rc1 — string_operations.c
 * Open Source License Compliance by Open Source Software
 */
1 /*
2 Author: Daniele Fognini, Andreas Wuerl
3 Copyright (C) 2013-2015, Siemens AG
4 
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License
7 version 2 as published by the Free Software Foundation.
8 
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License along
15 with this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 */
18 
19 #define _GNU_SOURCE
20 #include <stdio.h>
21 #include <string.h>
22 #include <stdlib.h>
23 #include <ctype.h>
24 #include <stdarg.h>
25 
26 #include "string_operations.h"
27 #include "hash.h"
28 #include "monk.h"
29 
30 #define MAX_TOKENS_ARRAY_SIZE 4194304
31 
/*
 * Decide whether character `a` splits two tokens.
 *
 * Returns 1 when `a` is the NUL terminator or is one of the characters
 * in the `delimiters` set, 0 otherwise.
 */
unsigned splittingDelim(char a, const char* delimiters) {
  if (a == '\0')
    return 1;

  for (const char* d = delimiters; *d != '\0'; d++) {
    if (*d == a)
      return 1;
  }

  return 0;
}
44 
/*
 * Length of a "special" (multi-character) delimiter starting at `z`.
 *
 * Recognizes comment markers and the C++ scope operator:
 *   "//", "/" followed by "*", "*" followed by "/", "::"  -> 2
 *   a lone '*'                                            -> 1
 *   anything else                                         -> 0
 * Requires at least two readable bytes at `z`.
 */
unsigned specialDelim(const char* z) {
  const char first = z[0];
  const char second = z[1];

  switch (first) {
    case '/':
      return (second == '/' || second == '*') ? 2 : 0;
    case '*':
      return (second == '/') ? 2 : 1;
    case ':':
      return (second == ':') ? 2 : 0;
    default:
      return 0;
  }
}
63 
64 static inline void initStateToken(Token* stateToken) {
65  stateToken->hashedContent = hash_init();
66  stateToken->length = 0;
67  stateToken->removedBefore = 0;
68 }
69 
70 static int isIgnoredToken(Token* token) {
71  Token remToken;
72 
73 #ifndef MONK_CASE_INSENSITIVE
74  remToken.hashedContent = hash("REM");
75 #else
76  remToken.hashedContent = hash("rem");
77 #endif
78  remToken.length = 3;
79  remToken.removedBefore = 0;
80 
81  return tokenEquals(token, &remToken);
82 }
83 
84 int streamTokenize(const char* inputChunk, size_t inputSize, const char* delimiters, GArray** output, Token** remainder) {
85  GArray* tokens = *output;
86  Token* stateToken;
87 
88  unsigned int initialTokenCount = tokens->len;
89 
90  if (!inputChunk) {
91  if ((stateToken = *remainder)) {
92  if ((stateToken->length > 0) && !isIgnoredToken(stateToken)) {
93  g_array_append_val(tokens, *stateToken);
94  }
95  free(stateToken);
96  }
97  *remainder = NULL;
98  return 0;
99  }
100 
101  if (!*remainder) {
102  //initialize state
103  stateToken = malloc(sizeof (Token));
104  *remainder = stateToken;
105  initStateToken(stateToken);
106  } else {
107  stateToken = *remainder;
108  }
109 
110  if (tokens->len >= MAX_TOKENS_ARRAY_SIZE) {
111  printf("WARNING: stream has more tokens than maximum allowed\n");
112  return -1;
113  }
114 
115  const char* ptr = inputChunk;
116 
117  size_t readBytes = 0;
118  while (readBytes < inputSize) {
119  unsigned delimLen = 0;
120  if (inputSize - readBytes >= 2) {
121  delimLen = specialDelim(ptr);
122  }
123  if (!delimLen) {
124  delimLen = splittingDelim(*ptr, delimiters);
125  }
126 
127  if (delimLen > 0) {
128  if (stateToken->length > 0) {
129  if (isIgnoredToken(stateToken)) {
130  stateToken->removedBefore += stateToken->length;
131  stateToken->length = 0;
132  stateToken->hashedContent = hash_init();
133  } else {
134  g_array_append_val(tokens, *stateToken);
135  initStateToken(stateToken);
136  }
137  }
138 
139  stateToken->removedBefore += delimLen;
140 
141  ptr += delimLen;
142  readBytes += delimLen;
143  } else {
144 #ifndef MONK_CASE_INSENSITIVE
145  const char* newCharPtr = ptr;
146 #else
147  char newChar = g_ascii_tolower(*ptr);
148  const char* newCharPtr = &newChar;
149 #endif
150  hash_add(newCharPtr, &(stateToken->hashedContent));
151 
152  stateToken->length++;
153 
154  ptr += 1;
155  readBytes += 1;
156  }
157  }
158 
159  return tokens->len - initialTokenCount;
160 }
161 
162 GArray* tokenize(const char* inputString, const char* delimiters) {
163  GArray* tokenArray = tokens_new();
164 
165  Token* remainder = NULL;
166 
167  size_t inputLength = strlen(inputString);
168 
169 #define CHUNKS 4096
170  size_t chunksCount = inputLength / CHUNKS;
171  for (size_t i = 0; i < chunksCount; i++) {
172  int addedTokens = streamTokenize(inputString + i * CHUNKS, CHUNKS, delimiters, &tokenArray, &remainder);
173  if (addedTokens < 0) {
174  printf("WARNING: can not complete tokenizing of '%.30s...'\n", inputString);
175  break;
176  }
177  }
178  streamTokenize(inputString + chunksCount * CHUNKS, MIN(CHUNKS, inputLength - chunksCount * CHUNKS),
179  delimiters, &tokenArray, &remainder);
180  streamTokenize(NULL, 0, NULL, &tokenArray, &remainder);
181 #undef CHUNKS
182 
183  return tokenArray;
184 }
185 
186 int tokensEquals(const GArray* a, const GArray* b) {
187  if (b->len != a->len)
188  return 0;
189 
190  for (size_t i = 0; i < a->len; i++) {
191  Token* aToken = tokens_index(a, i);
192  Token* bToken = tokens_index(b, i);
193 
194  if (!tokenEquals(aToken, bToken))
195  return 0;
196  }
197 
198  return 1;
199 }
200 
201 size_t token_position_of(size_t index, const GArray* tokens) {
202  size_t result = 0;
203  size_t previousLength = 0;
204 
205  size_t limit = MIN(index + 1, tokens->len);
206 
207  for (size_t i = 0; i < limit; i++) {
208  Token* token = tokens_index(tokens, i);
209  result += token->removedBefore + previousLength;
210  previousLength = token_length(*token);
211  }
212 
213  if (index == tokens->len) {
214  result += previousLength;
215  }
216 
217  if (index > tokens->len) {
218  result += previousLength;
219  printf("WARNING: requested calculation of token index after the END token\n");
220  }
221 
222  return result;
223 }
/* NOTE: MIN(a, b) — minimum of two values; macro provided elsewhere in the
 * project (see licenses.c / GLib), not defined in this file. */