FOSSology  3.2.0rc1
Open Source License Compliance by Open Source Software
match.c
1 /*
2 Authors: Daniele Fognini, Andreas Wuerl, Marion Deveaud
3 Copyright (C) 2013-2015, Siemens AG
4 
5 This program is free software; you can redistribute it and/or
6 modify it under the terms of the GNU General Public License
7 version 2 as published by the Free Software Foundation.
8 
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13 
14 You should have received a copy of the GNU General Public License along
15 with this program; if not, write to the Free Software Foundation, Inc.,
16 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 */
18 
19 #include "match.h"
20 
21 #include "license.h"
22 #include "file_operations.h"
23 
24 static inline void doFindAllMatches(const File* file, const GArray* licenseArray,
25  guint tPos, guint sPos,
26  unsigned maxAllowedDiff, unsigned minAdjacentMatches,
27  GArray* matches) {
28  if (!licenseArray) {
29  /* we hope to get here very often */
30  return;
31  }
32 
33  for (guint i = 0; i < licenseArray->len; i++) {
34  License* license = license_index(licenseArray, i);
35  findDiffMatches(file, license, tPos, sPos, matches, maxAllowedDiff, minAdjacentMatches);
36  }
37 }
38 
39 GArray* findAllMatchesBetween(const File* file, const Licenses* licenses,
40  unsigned maxAllowedDiff, unsigned minAdjacentMatches, unsigned maxLeadingDiff) {
41  GArray* matches = g_array_new(FALSE, FALSE, sizeof(Match*));
42 
43  const GArray* textTokens = file->tokens;
44  const guint textLength = textTokens->len;
45 
46  for (guint tPos = 0; tPos < textLength; tPos++) {
47  for (guint sPos = 0; sPos <= maxLeadingDiff; sPos++) {
48  const GArray* availableLicenses = getLicenseArrayFor(licenses, sPos, textTokens, tPos);
49  doFindAllMatches(file, availableLicenses, tPos, sPos, maxAllowedDiff, minAdjacentMatches, matches);
50  }
51 
52  /* now search short licenses only fully (i.e. maxAllowedDiff = 0, minAdjacentMatches = 1) */
53  const GArray* shortLicenses = getShortLicenseArray(licenses);
54  doFindAllMatches(file, shortLicenses, tPos, 0, 0, 1, matches);
55  }
56 
57  return filterNonOverlappingMatches(matches);
58 }
59 
60 void match_array_free(GArray* matches) {
61 #if GLIB_CHECK_VERSION(2, 32, 0)
62  g_array_set_clear_func(matches, match_destroyNotify);
63 #else
64  for (unsigned int i=0; i< matches->len; ++i) {
65  Match* tmp = g_array_index(matches, Match*, i);
66  match_free(tmp);
67  }
68 #endif
69  g_array_free(matches, TRUE);
70 }
71 
72 int matchFileWithLicenses(MonkState* state, const File* file, const Licenses* licenses, const MatchCallbacks* callbacks) {
73  GArray* matches = findAllMatchesBetween(file, licenses,
74  MAX_ALLOWED_DIFF_LENGTH, MIN_ADJACENT_MATCHES, MAX_LEADING_DIFF);
75  int result = processMatches(state, file, matches, callbacks);
76 
77  // we are done: free memory
78  match_array_free(matches);
79 
80  return result;
81 }
82 
83 static char* getFileName(MonkState* state, long pFileId) {
84  char* pFile = queryPFileForFileId(state->dbManager, pFileId);
85 
86  if (!pFile) {
87  printf("file not found for pFileId=%ld\n", pFileId);
88  return NULL;
89  }
90  char* pFileName;
91 
92 #ifdef MONK_MULTI_THREAD
93 #pragma omp critical(getFileName)
94 #endif
95  {
96  pFileName = fo_RepMkPath("files", pFile);
97  }
98 
99  if (!pFileName) {
100  printf("file '%s' not found\n", pFile);
101  }
102 
103  free(pFile);
104 
105  return pFileName;
106 }
107 
108 int matchPFileWithLicenses(MonkState* state, long pFileId, const Licenses* licenses, const MatchCallbacks* callbacks) {
109  File file;
110  file.id = pFileId;
111  int result = 0;
112 
113  file.fileName = getFileName(state, pFileId);
114 
115  if (file.fileName != NULL) {
116  result = readTokensFromFile(file.fileName, &(file.tokens), DELIMITERS);
117 
118  if (result) {
119  result = matchFileWithLicenses(state, &file, licenses, callbacks);
120 
121  tokens_free(file.tokens);
122  }
123 
124  free(file.fileName);
125  }
126 
127  return result;
128 }
129 
130 char* formatMatchArray(GArray* matchInfo) {
131  GString* stringBuilder = g_string_new("");
132 
133  size_t len = matchInfo->len;
134  for (size_t i = 0; i < len; i++) {
135  DiffMatchInfo* current = &g_array_index(matchInfo, DiffMatchInfo, i);
136 
137  if (current->text.length > 0) {
138  g_string_append_printf(stringBuilder,
139  "t[%zu+%zu] %s ",
140  current->text.start, current->text.length, current->diffType);
141  }
142  else {
143  g_string_append_printf(stringBuilder,
144  "t[%zu] %s ",
145  current->text.start, current->diffType);
146  }
147 
148  if (current->search.length > 0) {
149  g_string_append_printf(stringBuilder,
150  "s[%zu+%zu]",
151  current->search.start, current->search.length);
152  }
153  else {
154  g_string_append_printf(stringBuilder,
155  "s[%zu]",
156  current->search.start);
157  }
158 
159  if (i < len - 1) {
160  g_string_append_printf(stringBuilder, ", ");
161  }
162  }
163 
164  return g_string_free(stringBuilder, FALSE);
165 }
166 
167 static unsigned short match_rank(Match* match) {
168  if (match->type == MATCH_TYPE_FULL) {
169  return 100;
170  }
171  else {
172  DiffResult* diffResult = match->ptr.diff;
173  const License* license = match->license;
174  unsigned int licenseLength = license->tokens->len;
175  size_t numberOfMatches = diffResult->matched;
176  size_t numberOfAdditions = diffResult->added;
177 
178  // calculate result percentage as jaccard index
179  double rank = (100.0 * numberOfMatches) / (licenseLength + numberOfAdditions);
180  int result = (int) rank;
181 
182  result = MIN(result, 99);
183  result = MAX(result, 1);
184 
185  diffResult->rank = rank;
186  diffResult->percentual = (unsigned short) result;
187 
188  return (unsigned short) result;
189  }
190 }
191 
192 static int match_isFull(const Match* match) {
193  return match->type == MATCH_TYPE_FULL;
194 }
195 
196 size_t match_getStart(const Match* match) {
197  if (match_isFull(match)) {
198  return match->ptr.full->start;
199  }
200  else {
201  GArray* matchedInfo = match->ptr.diff->matchedInfo;
202 
203  DiffPoint firstDiff = g_array_index(matchedInfo, DiffMatchInfo, 0).text;
204 
205  return firstDiff.start;
206  }
207 }
208 
209 size_t match_getEnd(const Match* match) {
210  if (match_isFull(match)) {
211  return match->ptr.full->length + match->ptr.full->start;
212  }
213  else {
214  GArray* matchedInfo = match->ptr.diff->matchedInfo;
215 
216  DiffPoint lastDiff = g_array_index(matchedInfo, DiffMatchInfo, matchedInfo->len - 1).text;
217 
218  return lastDiff.start + lastDiff.length;
219  }
220 }
221 
222 static int match_includes(const Match* big, const Match* small) {
223  return (match_getStart(big) <= match_getStart(small)) && (match_getEnd(big) >= match_getEnd(small));
224 }
225 
226 // make sure to call it after a match_rank or the result will be junk
227 double match_getRank(const Match* match) {
228  if (match_isFull(match)) {
229  return 100.0;
230  }
231  else {
232  return match->ptr.diff->rank;
233  }
234 }
235 
236 static int compareMatchByRank(const Match* matchA, const Match* matchB) {
237  double matchARank = match_getRank(matchA);
238  double matchBRank = match_getRank(matchB);
239 
240  if (matchARank > matchBRank) {
241  return 1;
242  }
243  if (matchARank < matchBRank) {
244  return -1;
245  }
246 
247  return 0;
248 }
249 
250 /* profiling says there is no need to cache the comparison */
251 static int licenseIncludes(const License* big, const License* small) {
252  const GArray* tokensBig = big->tokens;
253  const GArray* tokensSmall = small->tokens;
254 
255  const guint bigLen = tokensBig->len;
256  const guint smallLen = tokensSmall->len;
257 
258  if (smallLen == 0) {
259  return 1;
260  }
261 
262  if (smallLen > bigLen) {
263  return 0;
264  }
265 
266  for (guint i = 0; i < bigLen; i++) {
267  unsigned n = smallLen;
268  if (matchNTokens(tokensBig, i, bigLen, tokensSmall, 0, smallLen, n)) {
269  return 1;
270  }
271  }
272 
273  return 0;
274 }
275 
276 int licensesDiffer(const License *thisLicense, const License *otherLicense) {
277  return (thisLicense->refId != otherLicense->refId);
278 }
279 
280 /* N.B. this is only a partial order of matches
281  *
282  * =0 not comparable
283  * >0 thisMatch >= otherMatch
284  * <0 thisMatch < otherMatch
285  *
286  **/
287 int match_partialComparator(const Match* thisMatch, const Match* otherMatch) {
288  const int thisIncludesOther = match_includes(thisMatch, otherMatch);
289  const int otherIncludesThis = match_includes(otherMatch, thisMatch);
290  const License *thisLicense = thisMatch->license;
291  const License *otherLicense = otherMatch->license;
292 
293  //Verify if matches overlap
294  if (thisIncludesOther || otherIncludesThis) {
295  if (match_isFull(thisMatch) && thisIncludesOther) {
296  return 1;
297  }
298  if (match_isFull(otherMatch) && otherIncludesThis) {
299  return -1;
300  }
301 
302  //Verify if licenses overlap
303  if (licensesDiffer(thisLicense, otherLicense)) {
304  if (licenseIncludes(thisLicense, otherLicense)) {
305  return 1;
306  }
307  if (licenseIncludes(otherLicense, thisLicense)) {
308  return -1;
309  }
310  if (match_isFull(otherMatch) && thisIncludesOther) {
311  //a complete different license is included in this match
312  return 0;
313  }
314  }
315 
316  return (compareMatchByRank(thisMatch, otherMatch) >= 0) ? 1 : -1;
317  }
318  return 0;
319 }
320 
321 /*
322  * finds the maximal matches according to match_partialComparator
323  * destructively filter matches array: input array and discarded matches are automatically freed
324  **/
325 GArray* filterNonOverlappingMatches(GArray* matches) {
326  const guint len = matches->len;
327 
328  /* profiling says this is not time critical and worst case is O(n^2) with any algorithm */
329  /* instead of removing elements from the array set them to NULL and create a new array at the end */
330  for (guint i = 0; i < len; i++) {
331  Match* thisMatch = match_array_index(matches, i);
332  if (thisMatch == NULL) {
333  continue;
334  }
335 
336  for (guint j = i + 1; j < len; j++) {
337  Match* otherMatch = match_array_index(matches, j);
338  if (otherMatch == NULL) {
339  continue;
340  }
341 
342  gint comparison = match_partialComparator(thisMatch, otherMatch);
343 
344  if (comparison > 0) {
345  match_free(otherMatch);
346  match_array_index(matches, j) = NULL;
347  }
348  else if (comparison < 0) {
349  match_free(thisMatch);
350  match_array_index(matches, i) = NULL;
351  break;
352  }
353  }
354  }
355 
356  GArray* result = g_array_new(FALSE, FALSE, sizeof(Match*));
357  for (guint i = 0; i < len; i++) {
358  Match* thisMatch = match_array_index(matches, i);
359  if (thisMatch) {
360  g_array_append_val(result, thisMatch);
361  }
362  }
363 
364  g_array_free(matches, TRUE);
365 
366  return result;
367 }
368 
369 int processMatch(MonkState* state, const File* file, const Match* match, const MatchCallbacks* callbacks) {
370  const License* license = match->license;
371  if (match->type == MATCH_TYPE_DIFF) {
372  DiffResult* diffResult = match->ptr.diff;
373 
374  convertToAbsolutePositions(diffResult->matchedInfo, file->tokens, license->tokens);
375  return callbacks->onDiff(state, file, license, diffResult);
376  }
377  else {
378  DiffMatchInfo matchInfo;
379  matchInfo.text = getFullHighlightFor(file->tokens, match->ptr.full->start, match->ptr.full->length);
380  matchInfo.search = getFullHighlightFor(license->tokens, 0, license->tokens->len);
381  matchInfo.diffType = FULL_MATCH;
382 
383  return callbacks->onFull(state, file, license, &matchInfo);
384  }
385 }
386 
387 int processMatches(MonkState* state, const File* file, const GArray* matches, const MatchCallbacks* callbacks) {
388  if (callbacks->ignore && callbacks->ignore(state, file)) {
389  return 1;
390  }
391 
392  if (callbacks->onAll) {
393  return callbacks->onAll(state, file, matches);
394  }
395 
396  callbacks->onBeginOutput(state);
397 
398  const guint matchCount = matches->len;
399 
400  int result = 1;
401  if (matchCount == 0) {
402  result = callbacks->onNo(state, file);
403  }
404 
405  for (guint matchIndex = 0; result && (matchIndex < matchCount); matchIndex++) {
406  const Match* match = match_array_index(matches, matchIndex);
407  result &= processMatch(state, file, match, callbacks);
408  if (matchIndex != matchCount - 1) {
409  callbacks->onBetweenIndividualOutputs(state);
410  }
411  }
412 
413  callbacks->onEndOutput(state);
414 
415  return result;
416 }
417 
418 Match* diffResult2Match(DiffResult* diffResult, const License* license) {
419  Match* newMatch = malloc(sizeof(Match));
420  newMatch->license = license;
421 
422  /* it's full only if we have no diffs and the license was not truncated */
423  if (diffResult->matchedInfo->len == 1 && (diffResult->matched == license->tokens->len)) {
424  newMatch->type = MATCH_TYPE_FULL;
425  newMatch->ptr.full = malloc(sizeof(DiffPoint));
426  *(newMatch->ptr.full) = g_array_index(diffResult->matchedInfo, DiffMatchInfo, 0).text;
427  diffResult_free(diffResult);
428  }
429  else {
430  newMatch->type = MATCH_TYPE_DIFF;
431  newMatch->ptr.diff = diffResult;
432 
433  }
434  return newMatch;
435 }
436 
437 void findDiffMatches(const File* file, const License* license,
438  size_t textStartPosition, size_t searchStartPosition,
439  GArray* matches,
440  unsigned int maxAllowedDiff, unsigned int minAdjacentMatches) {
441 
442  if (!matchNTokens(file->tokens, textStartPosition, file->tokens->len,
443  license->tokens, searchStartPosition, license->tokens->len,
444  minAdjacentMatches)) {
445  return;
446  }
447 
448  DiffResult* diffResult = findMatchAsDiffs(file->tokens, license->tokens,
449  textStartPosition, searchStartPosition,
450  maxAllowedDiff, minAdjacentMatches);
451 
452  if (diffResult) {
453  Match* newMatch = diffResult2Match(diffResult, license);
454 
455  if (match_rank(newMatch) > MIN_ALLOWED_RANK)
456  g_array_append_val(matches, newMatch);
457  else {
458  match_free(newMatch);
459  }
460  }
461 }
462 
463 #if GLIB_CHECK_VERSION(2, 32, 0)
464 
465 void match_destroyNotify(gpointer matchP) {
466  match_free(*((Match**) matchP));
467 }
468 
469 #endif
470 
471 void match_free(Match* match) {
472  if (match->type == MATCH_TYPE_DIFF) {
473  diffResult_free(match->ptr.diff);
474  }
475  else {
476  free(match->ptr.full);
477  }
478  free(match);
479 }
Definition: monk.h:78
Definition: match.h:31
char * queryPFileForFileId(fo_dbManager *dbManager, long fileId)
Get the pfile name for a given file ID.
Definition: libfossagent.c:136
Definition: monk.h:55
Store the results of a regex match.
Definition: scanners.hpp:39
Definition: monk.h:72
Definition: nomos.h:439
Definition: monk.h:66
Definition: diff.h:25
void matchFileWithLicenses(const string &sContent, unsigned long pFileId, CopyrightState const &state, int agentId, CopyrightDatabaseHandler &databaseHandler)
Scan a given file with all available scanners and save findings to database.
#define MIN(a, b)
Min of two.
Definition: licenses.c:76
#define MAX(a, b)
Max of two.
Definition: licenses.c:75
char * fo_RepMkPath(const char *Type, char *Filename)
Given a filename, construct the full path to the file.
Definition: libfossrepo.c:364
void matchPFileWithLicenses(CopyrightState const &state, int agentId, unsigned long pFileId, CopyrightDatabaseHandler &databaseHandler)
Get the file contents, scan for statements and save findings to database.