FOSSology  3.2.0rc1
Open Source License Compliance by Open Source Software
file_operations.c
1 /*
2 Author: Daniele Fognini, Andreas Wuerl
3 Copyright (C) 2013-2014, Siemens AG
4 
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License
7 version 2 as published by the Free Software Foundation.
8 
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License along
15 with this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 */
18 
19 #include "file_operations.h"
20 #include <sys/stat.h>
21 #include <unistd.h>
22 #include <stdio.h>
23 #include <stdlib.h>
24 
25 #include <fcntl.h>
26 
27 #include "hash.h"
28 #include "string_operations.h"
29 #include "encoding.h"
30 
31 #define BUFFSIZE 4096
32 
33 int readTokensFromFile(const char* fileName, GArray** tokens, const char* delimiters)
34 {
35  int fd = open(fileName, O_RDONLY);
36  if (fd < 0)
37  {
38  printf("FATAL: can not open %s\n", fileName);
39  return 0;
40  }
41 
42  *tokens = tokens_new();
43 
44  int needConverter = 1;
45  iconv_t converter = NULL;
46 
47  Token* remainder = NULL;
48 
49  char buffer[BUFFSIZE];
50  char convertedBuffer[BUFFSIZE];
51 
52  ssize_t n;
53  size_t leftFromLast = 0;
54  while ((n = read(fd, buffer + leftFromLast, sizeof(buffer) - leftFromLast)) > 0)
55  {
56  size_t len = (size_t) n + leftFromLast;
57  char* chunk = buffer;
58  leftFromLast = 0;
59 
60  if (needConverter)
61  {
62  needConverter = 0;
63  converter = guessConverter(buffer, len);
64  }
65 
66  if (converter)
67  {
68  char* input = buffer;
69  size_t inputLeft = len;
70 
71  char* output = convertedBuffer;
72  size_t outputLength = sizeof(convertedBuffer);
73  iconv(converter, &input, &inputLeft, &output, &outputLength);
74 
75  if (outputLength != sizeof(convertedBuffer)) {
76  chunk = convertedBuffer;
77  len = sizeof(convertedBuffer) - outputLength;
78 
79  leftFromLast = inputLeft;
80  for (size_t i = 0; i < leftFromLast; i++)
81  {
82  buffer[i] = *input++;
83  }
84  } else {
85  // the raw buffer is full and we could not write to the converted buffer
86  printf("WARNING: cannot re-encode '%s', going binary from now on\n", fileName);
87  iconv_close(converter);
88  converter = NULL;
89  }
90  }
91 
92  /* N.B. this tokenizes inside the re-encoded buffer:
93  * the offsets found are byte positions in the UTF-8 stream, not file positions
94  **/
95  int addedTokens = streamTokenize(chunk, len, delimiters, tokens, &remainder);
96  if (addedTokens < 0)
97  {
98  printf("WARNING: can not complete tokenizing of '%s'\n", fileName);
99  break;
100  }
101  }
102 
103  streamTokenize(buffer, leftFromLast, delimiters, tokens, &remainder);
104  streamTokenize(NULL, 0, NULL, tokens, &remainder);
105 
106  close(fd);
107 
108  if (converter)
109  {
110  iconv_close(converter);
111  }
112 
113  return 1;
114 }
char buffer[2048]
The last thing received from the scheduler.