26 #include "string_operations.h" 30 #define MAX_TOKENS_ARRAY_SIZE 4194304 32 unsigned splittingDelim(
char a,
const char* delimiters) {
35 const char * ptr = delimiters;
/* returns the length (1 or 2) of a comment-style delimiter starting at z, or 0 */
unsigned specialDelim(const char* z) {
  char a = z[0];
  char b = z[1];

  if (a == '/')
    return (b == '/' || b == '*') ? 2 : 1;
  else if (a == '*')
    return (b == '/') ? 2 : 1;
  else if (a == ':' && b == ':')
    return 2;

  return 0;
}
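// Worked examples (added for clarity, assuming the reconstruction above):
//   specialDelim("//a") == 2   specialDelim("/*a") == 2   specialDelim("/a") == 1
//   specialDelim("*/a") == 2   specialDelim("*a")  == 1
//   specialDelim("::a") == 2   specialDelim("ab")  == 0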
static inline void initStateToken(Token* stateToken) {
  stateToken->hashedContent = hash_init();
  stateToken->length = 0;
  stateToken->removedBefore = 0;
}
/* a token equal to "REM" (e.g. from BASIC/batch comment lines) is ignored */
static int isIgnoredToken(Token* token) {
  Token remToken;

#ifndef MONK_CASE_INSENSITIVE
  remToken.hashedContent = hash("REM");
#else
  remToken.hashedContent = hash("rem");
#endif
  remToken.length = 3;
  remToken.removedBefore = 0;

  return tokenEquals(token, &remToken);
}
int streamTokenize(const char* inputChunk, size_t inputSize, const char* delimiters, GArray** output, Token** remainder) {
  GArray* tokens = *output;
  Token* stateToken;

  unsigned int initialTokenCount = tokens->len;

  // a NULL chunk flushes the stream: emit the pending remainder token, if any, and free it
  if (!inputChunk) {
    if ((stateToken = *remainder)) {
      if ((stateToken->length > 0) && !isIgnoredToken(stateToken)) {
        g_array_append_val(tokens, *stateToken);
      }
      free(stateToken);
      *remainder = NULL;
    }
    return tokens->len - initialTokenCount;
  }

  if (!*remainder) {
    // first chunk: allocate and initialize the carry-over state
    stateToken = malloc(sizeof(Token));
    *remainder = stateToken;
    initStateToken(stateToken);
  } else {
    stateToken = *remainder;
  }
  if (tokens->len >= MAX_TOKENS_ARRAY_SIZE) {
    printf("WARNING: stream has more tokens than maximum allowed\n");
    return -1;
  }
  const char* ptr = inputChunk;

  size_t readBytes = 0;
  while (readBytes < inputSize) {
    unsigned delimLen = 0;
    // two-character delimiters are only possible if at least two bytes remain
    if (inputSize - readBytes >= 2) {
      delimLen = specialDelim(ptr);
    }
    if (delimLen == 0) {
      delimLen = splittingDelim(*ptr, delimiters);
    }
    if (delimLen > 0) {
      // at a delimiter: flush the token accumulated so far, if any
      if (stateToken->length > 0) {
        if (isIgnoredToken(stateToken)) {
          // drop the ignored token, counting its bytes as removed
          stateToken->removedBefore += stateToken->length;
          stateToken->length = 0;
          stateToken->hashedContent = hash_init();
        } else {
          g_array_append_val(tokens, *stateToken);
          initStateToken(stateToken);
        }
      }
      stateToken->removedBefore += delimLen;

      ptr += delimLen;
      readBytes += delimLen;
    } else {
#ifndef MONK_CASE_INSENSITIVE
      const char* newCharPtr = ptr;
#else
      // case-insensitive build: hash the lowercased character instead
      char newChar = g_ascii_tolower(*ptr);
      const char* newCharPtr = &newChar;
#endif
      hash_add(newCharPtr, &(stateToken->hashedContent));
      stateToken->length++;

      ptr += 1;
      readBytes += 1;
    }
  }

  return tokens->len - initialTokenCount;
}
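/*
 * Illustrative sketch (not part of the original code): driving streamTokenize
 * on two chunks and flushing with a NULL chunk, so a token split across the
 * chunk boundary ("bar") is still emitted as one token. The delimiter set
 * " \n" is an arbitrary example.
 */
static void exampleStreamUsage(void) {
  GArray* tokens = tokens_new();
  Token* remainder = NULL;

  streamTokenize("foo b", 5, " \n", &tokens, &remainder);
  streamTokenize("ar baz", 6, " \n", &tokens, &remainder);
  streamTokenize(NULL, 0, NULL, &tokens, &remainder); // emits the pending "baz"

  // tokens now holds "foo", "bar", "baz"
  g_array_free(tokens, TRUE);
}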
GArray* tokenize(const char* inputString, const char* delimiters) {
  GArray* tokenArray = tokens_new();

  Token* remainder = NULL;

  size_t inputLength = strlen(inputString);

#define CHUNKS 4096
  size_t chunksCount = inputLength / CHUNKS;
  for (size_t i = 0; i < chunksCount; i++) {
    int addedTokens = streamTokenize(inputString + i * CHUNKS, CHUNKS, delimiters, &tokenArray, &remainder);
    if (addedTokens < 0) {
      printf("WARNING: can not complete tokenizing of '%.30s...'\n", inputString);
      break;
    }
  }
  // tokenize the final partial chunk, then flush the remainder with a NULL chunk
  streamTokenize(inputString + chunksCount * CHUNKS,
                 MIN(CHUNKS, inputLength - chunksCount * CHUNKS),
                 delimiters, &tokenArray, &remainder);
  streamTokenize(NULL, 0, NULL, &tokenArray, &remainder);
#undef CHUNKS

  return tokenArray;
}
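/*
 * Illustrative sketch (not part of the original code): tokenizing a whole
 * string in one call. With " \t\n" as delimiters, "REM" is dropped by
 * isIgnoredToken and "::" is consumed by specialDelim, leaving the two
 * tokens "hello" and "world".
 */
static void exampleTokenizeUsage(void) {
  GArray* tokens = tokenize("REM hello::world", " \t\n");
  printf("token count: %u\n", tokens->len); // prints 2
  g_array_free(tokens, TRUE);
}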
int tokensEquals(const GArray* a, const GArray* b) {
  if (b->len != a->len)
    return 0;

  for (size_t i = 0; i < a->len; i++) {
    Token* aToken = tokens_index(a, i);
    Token* bToken = tokens_index(b, i);

    if (!tokenEquals(aToken, bToken))
      return 0;
  }

  return 1;
}
size_t token_position_of(size_t index, const GArray* tokens) {
  size_t result = 0;
  size_t previousLength = 0;

  size_t limit = MIN(index + 1, tokens->len);

  for (size_t i = 0; i < limit; i++) {
    Token* token = tokens_index(tokens, i);
    result += token->removedBefore + previousLength;
    previousLength = token_length(*token);
  }

  if (index == tokens->len) {
    // position of the END token: one past the last token
    result += previousLength;
  }

  if (index > tokens->len) {
    result += previousLength;
    printf("WARNING: requested calculation of token index after the END token\n");
  }

  return result;
}