/*
 * doFindAllMatches: walks licenseArray by index and calls findDiffMatches()
 * once per iteration with the same text position (tPos), search position
 * (sPos) and diff limits.
 *
 * NOTE(review): this listing is lossy. The declarations of `license` and
 * `matches` used in the call below are not visible, nor are the closing
 * braces. Presumably `license` is the i-th entry of licenseArray and
 * `matches` is a trailing parameter (findAllMatchesBetween passes a seventh
 * argument to this function) -- confirm against the full source.
 */
22 #include "file_operations.h" 24 static inline void doFindAllMatches(
const File* file,
const GArray* licenseArray,
25 guint tPos, guint sPos,
26 unsigned maxAllowedDiff,
unsigned minAdjacentMatches,
33 for (guint i = 0; i < licenseArray->len; i++) {
35 findDiffMatches(file, license, tPos, sPos, matches, maxAllowedDiff, minAdjacentMatches);
/*
 * findAllMatchesBetween: collects candidate matches of the given licenses
 * against the tokens of `file` and returns the result of
 * filterNonOverlappingMatches() on them.
 *
 * file               - file whose `tokens` array is scanned
 * licenses           - license collection queried through
 *                      getLicenseArrayFor() and getShortLicenseArray()
 * maxAllowedDiff     - forwarded unchanged to doFindAllMatches()
 * minAdjacentMatches - forwarded unchanged to doFindAllMatches()
 * maxLeadingDiff     - inclusive upper bound for the search start position
 *
 * Returns the GArray produced by filterNonOverlappingMatches(); its elements
 * are Match pointers.
 *
 * NOTE(review): closing braces are missing from this listing, so whether the
 * short-license pass runs inside the tPos loop cannot be read from here
 * (it uses tPos, so presumably it does) -- confirm.
 */
39 GArray* findAllMatchesBetween(
const File* file,
const Licenses* licenses,
40 unsigned maxAllowedDiff,
unsigned minAdjacentMatches,
unsigned maxLeadingDiff) {
/* Array elements are Match pointers, not Match structs. */
41 GArray* matches = g_array_new(FALSE, FALSE,
sizeof(
Match*));
43 const GArray* textTokens = file->tokens;
44 const guint textLength = textTokens->len;
/* Try every text position combined with every search offset 0..maxLeadingDiff. */
46 for (guint tPos = 0; tPos < textLength; tPos++) {
47 for (guint sPos = 0; sPos <= maxLeadingDiff; sPos++) {
48 const GArray* availableLicenses = getLicenseArrayFor(licenses, sPos, textTokens, tPos);
49 doFindAllMatches(file, availableLicenses, tPos, sPos, maxAllowedDiff, minAdjacentMatches, matches);
/* Short licenses: search offset 0, maxAllowedDiff 0, minAdjacentMatches 1. */
53 const GArray* shortLicenses = getShortLicenseArray(licenses);
54 doFindAllMatches(file, shortLicenses, tPos, 0, 0, 1, matches);
57 return filterNonOverlappingMatches(matches);
/*
 * match_array_free: releases a GArray of Match pointers.
 *
 * When GLib is at least 2.32, match_destroyNotify is registered as the
 * array's clear function before the array is freed. The index loop below
 * fetches each element into `tmp`.
 *
 * NOTE(review): the #else/#endif lines and the statement that uses `tmp`
 * are not visible in this listing; presumably the loop is the pre-2.32
 * fallback that frees each element by hand -- confirm.
 */
60 void match_array_free(GArray* matches) {
61 #if GLIB_CHECK_VERSION(2, 32, 0) 62 g_array_set_clear_func(matches, match_destroyNotify);
64 for (
unsigned int i=0; i< matches->len; ++i) {
65 Match* tmp = g_array_index(matches,
Match*, i);
/* TRUE: the element storage is freed together with the GArray wrapper. */
69 g_array_free(matches, TRUE);
73 GArray* matches = findAllMatchesBetween(file, licenses,
74 MAX_ALLOWED_DIFF_LENGTH, MIN_ADJACENT_MATCHES, MAX_LEADING_DIFF);
75 int result = processMatches(state, file, matches, callbacks);
78 match_array_free(matches);
/*
 * getFileName: produces a file name (char pointer) for the given pFileId
 * using `state`.
 *
 * Two failure messages are written to stdout: one naming the pFileId and
 * one naming a `pFile` string. Under MONK_MULTI_THREAD an OpenMP critical
 * section named getFileName is declared before the second stage.
 *
 * NOTE(review): the lookup calls, the conditions guarding the messages and
 * the return statements are not visible in this listing. Ownership of the
 * returned string cannot be determined from here -- confirm.
 */
83 static char* getFileName(
MonkState* state,
long pFileId) {
87 printf(
"file not found for pFileId=%ld\n", pFileId);
92 #ifdef MONK_MULTI_THREAD 93 #pragma omp critical(getFileName) 100 printf(
"file '%s' not found\n", pFile);
113 file.fileName = getFileName(state, pFileId);
115 if (file.fileName != NULL) {
116 result = readTokensFromFile(file.fileName, &(file.tokens), DELIMITERS);
121 tokens_free(file.tokens);
/*
 * formatMatchArray: renders the entries of matchInfo into one string.
 *
 * For each entry (`current`) the text side is appended first, then the
 * search side. Each side has two forms: one that includes the length when
 * the length is greater than zero, and one with only the start otherwise.
 * The text side also prints current->diffType. A ", " separator is appended
 * as well.
 *
 * Returns the character buffer of the GString. g_string_free() is called
 * with FALSE, so the buffer survives and the caller is expected to release
 * it (g_free per GLib documentation).
 *
 * NOTE(review): the format strings, the definition of `current`, the `else`
 * lines and the condition guarding the separator are not visible in this
 * listing.
 */
130 char* formatMatchArray(GArray* matchInfo) {
131 GString* stringBuilder = g_string_new(
"");
133 size_t len = matchInfo->len;
134 for (
size_t i = 0; i < len; i++) {
/* Text side: range form when length > 0, position-only form otherwise. */
137 if (current->text.length > 0) {
138 g_string_append_printf(stringBuilder,
140 current->text.start, current->text.length, current->diffType);
143 g_string_append_printf(stringBuilder,
145 current->text.start, current->diffType);
/* Search side: same two forms, without the diff type. */
148 if (current->search.length > 0) {
149 g_string_append_printf(stringBuilder,
151 current->search.start, current->search.length);
154 g_string_append_printf(stringBuilder,
156 current->search.start);
160 g_string_append_printf(stringBuilder,
", ");
164 return g_string_free(stringBuilder, FALSE);
/*
 * match_rank: computes the rank of a match and, for diff matches, stores it
 * in the diff result.
 *
 * For a non-full match the rank is
 *     100 * matched / (license token count + added)
 * as a double. The unrounded value is written to diffResult->rank. The
 * integer part, clamped to the range 1..99, is written to
 * diffResult->percentual and returned.
 *
 * NOTE(review): the value returned for MATCH_TYPE_FULL and the definitions
 * of `diffResult` and `license` are not visible in this listing.
 * NOTE(review): if licenseLength + numberOfAdditions is 0 the division is
 * not finite and the (int) cast is undefined -- confirm that licenses are
 * never empty here.
 */
167 static unsigned short match_rank(
Match*
match) {
168 if (match->type == MATCH_TYPE_FULL) {
174 unsigned int licenseLength = license->tokens->len;
175 size_t numberOfMatches = diffResult->matched;
176 size_t numberOfAdditions = diffResult->added;
179 double rank = (100.0 * numberOfMatches) / (licenseLength + numberOfAdditions);
/* Truncates toward zero; the clamp below keeps the percentage in 1..99. */
180 int result = (int) rank;
182 result =
MIN(result, 99);
183 result =
MAX(result, 1);
185 diffResult->rank = rank;
186 diffResult->percentual = (
unsigned short) result;
188 return (
unsigned short) result;
/*
 * match_isFull: returns non-zero when match->type equals MATCH_TYPE_FULL,
 * zero otherwise.
 */
192 static int match_isFull(
const Match* match) {
193 return match->type == MATCH_TYPE_FULL;
/*
 * match_getStart: returns the start position of a match.
 *
 * Full match: the `start` field of match->ptr.full.
 * Otherwise:  the `start` field of `firstDiff`, which is derived from the
 *             diff's matchedInfo array.
 *
 * NOTE(review): the lines that define `firstDiff` are not visible in this
 * listing; presumably it is the text part of the first matchedInfo entry --
 * confirm.
 */
196 size_t match_getStart(
const Match* match) {
197 if (match_isFull(match)) {
198 return match->ptr.full->start;
201 GArray* matchedInfo = match->ptr.diff->matchedInfo;
205 return firstDiff.start;
/*
 * match_getEnd: returns the end position of a match, computed as
 * start + length.
 *
 * Full match: start + length of match->ptr.full.
 * Otherwise:  start + length of `lastDiff`, which is derived from the
 *             diff's matchedInfo array.
 *
 * NOTE(review): the lines that define `lastDiff` are not visible in this
 * listing; presumably it is the text part of the last matchedInfo entry --
 * confirm.
 */
209 size_t match_getEnd(
const Match* match) {
210 if (match_isFull(match)) {
211 return match->ptr.full->length + match->ptr.full->start;
214 GArray* matchedInfo = match->ptr.diff->matchedInfo;
218 return lastDiff.start + lastDiff.length;
/*
 * match_includes: returns non-zero when the range of `big` covers the range
 * of `small`, i.e. big starts at or before small and ends at or after it
 * (bounds taken from match_getStart / match_getEnd). Equal ranges count as
 * included.
 */
222 static int match_includes(
const Match* big,
const Match* small) {
223 return (match_getStart(big) <= match_getStart(small)) && (match_getEnd(big) >= match_getEnd(small));
/*
 * match_getRank: returns the rank of a match as a double.
 *
 * For a non-full match this is the `rank` field stored in match->ptr.diff.
 *
 * NOTE(review): the value returned for a full match is not visible in this
 * listing.
 */
227 double match_getRank(
const Match* match) {
228 if (match_isFull(match)) {
232 return match->ptr.diff->rank;
/*
 * compareMatchByRank: three-way comparison of two matches by the value of
 * match_getRank().
 *
 * NOTE(review): the return statements are not visible in this listing. The
 * caller tests the result with `>= 0`, so presumably a positive value means
 * matchA ranks higher, negative means lower, and 0 means equal -- confirm.
 */
236 static int compareMatchByRank(
const Match* matchA,
const Match* matchB) {
237 double matchARank = match_getRank(matchA);
238 double matchBRank = match_getRank(matchB);
240 if (matchARank > matchBRank) {
243 if (matchARank < matchBRank) {
/*
 * licenseIncludes: tests whether the token sequence of `small` occurs inside
 * the token sequence of `big`.
 *
 * A `small` longer than `big` is handled by an early check. Otherwise every
 * start index in `big` is tried, and matchNTokens() is asked to match
 * smallLen tokens of `small` (from its index 0) at that position.
 *
 * NOTE(review): the return statements are not visible in this listing;
 * presumably non-zero on the first successful matchNTokens() and zero
 * otherwise -- confirm.
 */
251 static int licenseIncludes(
const License* big,
const License* small) {
252 const GArray* tokensBig = big->tokens;
253 const GArray* tokensSmall = small->tokens;
255 const guint bigLen = tokensBig->len;
256 const guint smallLen = tokensSmall->len;
262 if (smallLen > bigLen) {
266 for (guint i = 0; i < bigLen; i++) {
/* Require all smallLen tokens to match, not just a prefix. */
267 unsigned n = smallLen;
268 if (matchNTokens(tokensBig, i, bigLen, tokensSmall, 0, smallLen, n)) {
/*
 * licensesDiffer: returns non-zero when the two licenses have different
 * refId values, zero when the refId values are equal. Only refId is
 * compared.
 */
276 int licensesDiffer(
const License *thisLicense,
const License *otherLicense) {
277 return (thisLicense->refId != otherLicense->refId);
/*
 * match_partialComparator: decides which of two matches should be preferred
 * when one range includes the other.
 *
 * Checks visible here, in order:
 *   1. whether either match range includes the other (match_includes);
 *   2. whether a full match includes the other match;
 *   3. for licenses with different refId, whether one license's tokens
 *      contain the other's (licenseIncludes);
 *   4. a rank comparison that yields 1 when thisMatch ranks at least as
 *      high as otherMatch and -1 otherwise.
 *
 * filterNonOverlappingMatches() drops otherMatch on a positive result and
 * thisMatch on a negative one.
 *
 * NOTE(review): the return statements of the individual branches and the
 * closing braces are not visible in this listing, so the value returned by
 * each branch (including the non-overlapping case) cannot be confirmed from
 * here.
 */
287 int match_partialComparator(
const Match* thisMatch,
const Match* otherMatch) {
288 const int thisIncludesOther = match_includes(thisMatch, otherMatch);
289 const int otherIncludesThis = match_includes(otherMatch, thisMatch);
290 const License *thisLicense = thisMatch->license;
291 const License *otherLicense = otherMatch->license;
294 if (thisIncludesOther || otherIncludesThis) {
295 if (match_isFull(thisMatch) && thisIncludesOther) {
298 if (match_isFull(otherMatch) && otherIncludesThis) {
303 if (licensesDiffer(thisLicense, otherLicense)) {
304 if (licenseIncludes(thisLicense, otherLicense)) {
307 if (licenseIncludes(otherLicense, thisLicense)) {
310 if (match_isFull(otherMatch) && thisIncludesOther) {
/* Ties (comparison result 0) favour thisMatch. */
316 return (compareMatchByRank(thisMatch, otherMatch) >= 0) ? 1 : -1;
/*
 * filterNonOverlappingMatches: compares every pair of matches in `matches`
 * and discards the losing one of each decided pair.
 *
 * A loser is released with match_free() and its slot set to NULL; NULL
 * slots are skipped by later comparisons. Survivors are copied into a newly
 * allocated GArray of Match pointers. The input array is then freed with
 * g_array_free(matches, TRUE), so the caller must not use `matches`
 * afterwards.
 *
 * NOTE(review): several lines are missing from this listing: the bodies of
 * the NULL checks, the filter in the copy loop and the final return
 * (presumably `result`).
 * NOTE(review): after thisMatch is freed in the `comparison < 0` branch the
 * inner loop must stop, or thisMatch would be used after free. Confirm that
 * a `break` follows in the full source.
 */
325 GArray* filterNonOverlappingMatches(GArray* matches) {
326 const guint len = matches->len;
330 for (guint i = 0; i < len; i++) {
331 Match* thisMatch = match_array_index(matches, i);
332 if (thisMatch == NULL) {
/* Only pairs with j > i are compared, so each pair is visited once. */
336 for (guint j = i + 1; j < len; j++) {
337 Match* otherMatch = match_array_index(matches, j);
338 if (otherMatch == NULL) {
342 gint comparison = match_partialComparator(thisMatch, otherMatch);
344 if (comparison > 0) {
345 match_free(otherMatch);
346 match_array_index(matches, j) = NULL;
348 else if (comparison < 0) {
349 match_free(thisMatch);
350 match_array_index(matches, i) = NULL;
356 GArray* result = g_array_new(FALSE, FALSE,
sizeof(
Match*));
357 for (guint i = 0; i < len; i++) {
358 Match* thisMatch = match_array_index(matches, i);
360 g_array_append_val(result, thisMatch);
/* Frees only the old container and its pointer storage, not the surviving matches. */
364 g_array_free(matches, TRUE);
370 const License* license = match->license;
371 if (match->type == MATCH_TYPE_DIFF) {
374 convertToAbsolutePositions(diffResult->matchedInfo, file->tokens, license->tokens);
375 return callbacks->onDiff(state, file, license, diffResult);
379 matchInfo.text = getFullHighlightFor(file->tokens, match->ptr.full->start, match->ptr.full->length);
380 matchInfo.search = getFullHighlightFor(license->tokens, 0, license->tokens->len);
381 matchInfo.diffType = FULL_MATCH;
383 return callbacks->onFull(state, file, license, &matchInfo);
388 if (callbacks->ignore && callbacks->ignore(state, file)) {
392 if (callbacks->onAll) {
393 return callbacks->onAll(state, file, matches);
396 callbacks->onBeginOutput(state);
398 const guint matchCount = matches->len;
401 if (matchCount == 0) {
402 result = callbacks->onNo(state, file);
405 for (guint matchIndex = 0; result && (matchIndex < matchCount); matchIndex++) {
406 const Match* match = match_array_index(matches, matchIndex);
407 result &= processMatch(state, file, match, callbacks);
408 if (matchIndex != matchCount - 1) {
409 callbacks->onBetweenIndividualOutputs(state);
413 callbacks->onEndOutput(state);
420 newMatch->license = license;
423 if (diffResult->matchedInfo->len == 1 && (diffResult->matched == license->tokens->len)) {
424 newMatch->type = MATCH_TYPE_FULL;
425 newMatch->ptr.full = malloc(
sizeof(
DiffPoint));
426 *(newMatch->ptr.full) = g_array_index(diffResult->matchedInfo,
DiffMatchInfo, 0).text;
427 diffResult_free(diffResult);
430 newMatch->type = MATCH_TYPE_DIFF;
431 newMatch->ptr.diff = diffResult;
/*
 * findDiffMatches: tries to match one license against the file at the given
 * positions and appends an accepted match to `matches`.
 *
 * Steps visible here:
 *   1. A pre-check with matchNTokens() requires minAdjacentMatches tokens
 *      to match at (textStartPosition, searchStartPosition).
 *   2. findMatchAsDiffs() computes a DiffResult for the two token arrays.
 *   3. diffResult2Match() wraps the DiffResult in a Match.
 *   4. The match is appended to `matches` when match_rank() exceeds
 *      MIN_ALLOWED_RANK; match_free() is also called on it.
 *
 * NOTE(review): this listing omits the line declaring the `matches`
 * parameter, the body of the failed pre-check, any NULL check on
 * diffResult, and the `else` that presumably guards match_free(). Without
 * that `else` an appended match would be freed immediately -- confirm
 * against the full source.
 */
437 void findDiffMatches(
const File* file,
const License* license,
438 size_t textStartPosition,
size_t searchStartPosition,
440 unsigned int maxAllowedDiff,
unsigned int minAdjacentMatches) {
442 if (!matchNTokens(file->tokens, textStartPosition, file->tokens->len,
443 license->tokens, searchStartPosition, license->tokens->len,
444 minAdjacentMatches)) {
448 DiffResult* diffResult = findMatchAsDiffs(file->tokens, license->tokens,
449 textStartPosition, searchStartPosition,
450 maxAllowedDiff, minAdjacentMatches);
453 Match* newMatch = diffResult2Match(diffResult, license);
/* match_rank() also records rank and percentage on diff matches. */
455 if (match_rank(newMatch) > MIN_ALLOWED_RANK)
456 g_array_append_val(matches, newMatch);
458 match_free(newMatch);
/*
 * match_destroyNotify: clear-function adapter, compiled only when GLib is
 * at least 2.32. `matchP` is treated as a pointer to a Match pointer; the
 * Match it refers to is released with match_free(). match_array_free()
 * registers this function through g_array_set_clear_func().
 */
463 #if GLIB_CHECK_VERSION(2, 32, 0) 465 void match_destroyNotify(gpointer matchP) {
466 match_free(*((
Match**) matchP));
/*
 * match_free: releases the payload owned by a Match.
 *
 * A MATCH_TYPE_DIFF match hands its DiffResult to diffResult_free(); the
 * other visible path releases match->ptr.full with free().
 *
 * NOTE(review): the `else` and the release of the Match structure itself
 * are not visible in this listing. `match` is dereferenced unconditionally,
 * so callers must not pass NULL.
 */
471 void match_free(
Match* match) {
472 if (match->type == MATCH_TYPE_DIFF) {
473 diffResult_free(match->ptr.diff);
476 free(match->ptr.full);
char * queryPFileForFileId(fo_dbManager *dbManager, long fileId)
Get the pfile name for a given file ID.
Store the results of a regex match.
void matchFileWithLicenses(const string &sContent, unsigned long pFileId, CopyrightState const &state, int agentId, CopyrightDatabaseHandler &databaseHandler)
Scan a given file with all available scanners and save findings to database.
#define MIN(a, b)
Min of two.
#define MAX(a, b)
Max of two.
char * fo_RepMkPath(const char *Type, char *Filename)
Given a filename, construct the full path to the file.
void matchPFileWithLicenses(CopyrightState const &state, int agentId, unsigned long pFileId, CopyrightDatabaseHandler &databaseHandler)
Get the file contents, scan for statements and save findings to database.