FOSSology  3.2.0rc1
Open Source License Compliance by Open Source Software
regexscan.c
Go to the documentation of this file.
1 /***************************************************************
2  regexscan: Scan file(s) for regular expression(s)
3 
4  Copyright (C) 2013 Hewlett-Packard Development Company, L.P.
5 
6  This program is free software; you can redistribute it and/or
7  modify it under the terms of the GNU General Public License
8  version 2 as published by the Free Software Foundation.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License along
16  with this program; if not, write to the Free Software Foundation, Inc.,
17  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18  ***************************************************************/
54 #include <stdlib.h>
55 #include <stdio.h>
56 #include <unistd.h>
57 #include <string.h>
58 #include <ctype.h>
59 #include <signal.h>
60 #include <libgen.h>
61 
62 #include <regex.h>
63 #include <stdbool.h>
64 
65 #include "libfossology.h"
66 
67 #define MAXCMD 4096
68 char SQL[256];
69 
70 #define myBUFSIZ 2048
71 
72 /*
73 #ifdef COMMIT_HASH
74 char BuildVersion[]="Build version: " COMMIT_HASH ".\n";
75 #endif
76 */
77 
78 PGconn *pgConn = NULL;
79 
/**
 * \brief Scan an open file for a compiled regular expression.
 *
 * The stream is read in chunks of at most sizeof(textBuff)-1 characters (one
 * chunk per text line unless the line is longer than the buffer). The first
 * match in every chunk is reported on stdout, followed by an empty line. When
 * nothing matched at all, a single "not found" message is printed instead.
 *
 * \param regex       Compiled regex. The caller compiles it and keeps
 *                    ownership; it is not freed here.
 * \param regexStr    Source text of the regex; used only in the "not found"
 *                    message.
 * \param scanFilePtr Stream to scan. When it is not NULL this function always
 *                    closes it before returning, so the caller must not use
 *                    or close it afterwards.
 * \param fileName    Name used to label the output lines.
 *
 * \return 0 when the scan ran to end of input (with or without matches),
 *         2 when scanFilePtr is NULL,
 *         3 when regexec() reported an error other than REG_NOMATCH.
 */
int regexScan(regex_t *regex, char *regexStr, FILE *scanFilePtr, char *fileName)
{
  char textBuff[2000];  /* line buffer for regex match processing */
  char errBuff[256];    /* regerror() text for the failure path */
  regmatch_t rm[1];
  bool match = false;   /* regex match found indicator */
  int lineCount = 0;
  int status;

  /* A caller whose open failed must not make us read from a NULL stream. */
  if (scanFilePtr == NULL)
  {
    fprintf(stderr, "%s: no open file to scan\n", fileName);
    return 2;
  }

  /* Size the read from the buffer itself instead of a separate constant. */
  while (fgets(textBuff, sizeof(textBuff), scanFilePtr) != NULL)
  {
    lineCount++;
    status = regexec(regex, textBuff, 1, rm, 0);  /* nmatch = 1, matchptr = rm */

    if (status == 0)
    {
      /* Print directly rather than through a fixed-size message buffer, so
       * neither a long file name nor a long match can overflow anything.
       * regoff_t is not int on every platform, hence the explicit casts for
       * "%d" and for the "%.*s" precision. The second newline stands in for
       * the one puts() used to append. */
      printf("%s: regex found at line %d at position %d. -> %.*s \n\n",
             fileName, lineCount, (int)rm[0].rm_so + 1,
             (int)(rm[0].rm_eo - rm[0].rm_so), textBuff + rm[0].rm_so);
      match = true;
    }
    else if (status != REG_NOMATCH)
    {
      regerror(status, regex, errBuff, sizeof(errBuff));
      fprintf(stderr, "Out of memory? - regex match failure: %s\n", errBuff);
      fclose(scanFilePtr);
      return 3;
    }
  }

  /* Report if no matches found */
  if (!match)
  {
    printf("%s: %s not found\n\n", fileName, regexStr);
  }

  fclose(scanFilePtr);
  return 0;
}
143 
152 int pfileNumToNames(char *pfileNum, char *pfileRepoName, char *pfileRealName)
153 {
154  char sqlSelect[256];
155  PGresult *result;
156 
157  /* Attempt to locate the appropriate pFile_pk record */
158  sprintf(sqlSelect, "SELECT pfile_sha1, pfile_md5, pfile_size, ufile_name FROM"
159  " pfile, uploadtree WHERE pfile_fk = pfile_pk and pfile_pk = '%s'", pfileNum);
160  result = PQexec(pgConn, sqlSelect);
161 
162  if (fo_checkPQresult(pgConn, result, sqlSelect, __FILE__, __LINE__)) return 0;
163 
164  /* confirm a sane result set */
165  if (PQntuples(result) == 0)
166  {
167  PQclear(result);
168 
169  /* Not found */
170  fprintf(stderr, "Database does not contain pfile_pk: %s\n", pfileNum);
171  return 1;
172  }
173  else if (PQntuples(result) != 1)
174  {
175  PQclear(result);
176 
177  /* Not found */
178  fprintf(stderr, "Database contains multiple pfile_pk: %s\n", pfileNum);
179  return 2;
180  }
181  /* We've managed to locate the one and only pfile_pk record. Build the filePath string */
182  /* Concatenate first row fields 0, 1 and 2 */
183  sprintf(pfileRepoName, "%s.%s.%s", PQgetvalue(result, 0, 0), PQgetvalue(result, 0, 1), PQgetvalue(result, 0, 2));
184  /* and extract the actual filename from field 4 - uploadtree.ufile_name */
185  sprintf(pfileRealName, "%s", PQgetvalue(result, 0, 3));
186 
187 // fprintf(stderr, "fileName is:%s\n", pFileName);
188  PQclear(result);
189  return 0;
190 }
191 
192 
201 int regexScanUpload(char *uploadNum, char *regexStr)
202 {
203  char sqlSelect[256];
204  PGresult *result, *pfileResult;
205 
206  int fileCount, i, retCode;
207 
208  char fileRealName[1000];
209  char fileRepoName[1000];
210 
211  FILE *scanFilePtr;
212 
213  regex_t regex;
214 
215  /* Ensure uploadNum is "valid" then obtain a list of pfile entries and scan them */
216  sprintf(sqlSelect, "SELECT upload_pk, upload_mode, upload_filename from upload where upload_pk = '%s'", uploadNum);
217  result = PQexec(pgConn, sqlSelect);
218 
219  if (fo_checkPQresult(pgConn, result, sqlSelect, __FILE__, __LINE__)) return 0;
220 
221  /* confirm a sane result set */
222  if (PQntuples(result) == 0)
223  {
224  fprintf(stderr, "No uploads appear to be available here!\n");
225  PQclear(result);
226  return 0; /* nothing found to scan */
227  }
228 
229  /* Next ensure that uploadNum was successfully uploaded */
230  /* We'll only look at upload_pk entries that have successfully run ununpack (64) and adj2nest (32) */
231  if ((atoi(PQgetvalue(result, 0, 1)) & 96) != 96)
232  {
233  fprintf(stderr, "Upload %s was not successfully processed after upload!\n", uploadNum);
234  PQclear(result);
235  return 0; /* nothing found to scan */
236  }
237 
238  /* Now get our list of required pfile entries for this upload */
239  sprintf(sqlSelect, "SELECT uploadtree.pfile_fk, ufile_name from uploadtree, upload"
240  " where upload_fk = upload_pk and uploadtree.pfile_fk <> 0 and ufile_mode = 32768 and upload_pk = '%s'", uploadNum);
241  result = PQexec(pgConn, sqlSelect);
242 
243  if (fo_checkPQresult(pgConn, result, sqlSelect, __FILE__, __LINE__)) return 0;
244 
245  fileCount = PQntuples(result);
246 // fprintf(stderr, "Located %d files to process.\n", fileCount);
247 
248  /* Compile the regex for improved performance */
249  retCode = regcomp(&regex, regexStr, REG_ICASE+REG_EXTENDED);
250  if (retCode)
251  {
252  fprintf(stderr, "regex %s failed to compile\n", regexStr);
253  return 1;
254  }
255 
256  /* Scan the files we've found for this upload */
257  for (i=0; i<fileCount; i++)
258  {
259  /* Attempt to locate the appropriate pFile_pk record */
260  sprintf(sqlSelect, "SELECT pfile_sha1, pfile_md5, pfile_size, ufile_name"
261  " FROM pfile, uploadtree WHERE pfile_fk = pfile_pk and pfile_pk = '%s'", PQgetvalue(result, i, 0));
262  pfileResult = PQexec(pgConn, sqlSelect);
263 
264  if (fo_checkPQresult(pgConn, pfileResult, sqlSelect, __FILE__, __LINE__)) return 0;
265 
266  /* confirm a sane result set */
267  if (PQntuples(pfileResult) == 1)
268  {
269  /* For each pfile value grind through the regex scan process */
270 
271  /* Locate and construct the appropriate full name from pfile table based upon pfile_pk value */
272  if (pfileNumToNames(PQgetvalue(result, i, 0), fileRepoName, fileRealName) != 0)
273  {
274  fprintf(stderr, "ERROR: Unable to locate pfile_pk '%s'\n", PQgetvalue(result, i, 0));
275  return 0;
276  }
277 
278  /* Use fo_RepFread() for access. It uses fo_RepMkPath() to map name to full path. */
279  scanFilePtr = fo_RepFread("files", fileRepoName);
280  if (!scanFilePtr)
281  {
282  fprintf(stderr, "ERROR: Unable to open '%s/%s'\n", "files", fileRepoName);
283  return 0;
284  }
285 
286  /* Call scan function. Note that we'll need to "Humanize" the fileName at some point. */
287  regexScan(&regex, regexStr, scanFilePtr, fileRealName);
288  }
289  else
290  {
291  fprintf(stderr, "WARNING: File: %s - Located %d instances of pfile_pk %s ! Size = %s bytes!\n",
292  PQgetvalue(result, i, 1), PQntuples(pfileResult), PQgetvalue(result, i, 0), PQgetvalue(pfileResult, i, 2));
293  }
294  }
295  /* return the number of scanned files */
296  return i;
297 }
298 
299 
/**
 * \brief Usage description for this regexscan agent.
 *
 * Prints the synopsis and the list of options to stdout.
 *
 * \param Name Program name shown in the synopsis line.
 */
void Usage (char *Name)
{
  /* One entry per option line, printed in this order. */
  static const char *const optionLines[] = {
    " -i :: initialize the database, then exit.\n",
    " -c SYSCONFDIR :: FOSSology configuration directory.\n",
    " -h :: show available command line options.\n",
    " -v :: increase agent logging verbosity.\n",
    " -r :: regex expression to load from command line.\n",
    " filename :: filename to process with regex.\n"
  };
  const size_t lineTotal = sizeof(optionLines) / sizeof(optionLines[0]);
  size_t idx;

  printf("Usage: %s [options] [id [id ...]]\n", Name);
  for (idx = 0; idx < lineTotal; idx++)
  {
    fputs(optionLines[idx], stdout);
  }
} /* Usage() */
315 
316 /*********************************************************/
317 int main (int argc, char *argv[])
318 {
319  int nonoptargs;
320  int c, retCode;
321 
322  regex_t regex;
323 
324  char regexStr[1024]; /* string storage for the regex expression */
325  bool regexSet = false;
326 
327  char fileName[1000];
328  FILE *scanFilePtr;
329 
330  char uploadNum[10];
331 
332  int scannedCount = 0;
333 
334  int user_pk;
335  long UploadPK=-1;
336 
337  char *COMMIT_HASH;
338  char *VERSION;
339  char agent_rev[myBUFSIZ];
340 
341  /* connect to scheduler. Noop if not run from scheduler. */
342  fo_scheduler_connect(&argc, argv, &pgConn);
343 
344 /*
345  Version reporting.
346 */
347  COMMIT_HASH = fo_sysconfig("regexscan", "COMMIT_HASH");
348  VERSION = fo_sysconfig("regexscan", "VERSION");
349  sprintf(agent_rev, "%s.%s", VERSION, COMMIT_HASH);
350 #ifdef REGEX_DEBUG
351  fprintf(stdout, "regexscan reports version info as '%s.%s'.\n", VERSION, COMMIT_HASH);
352 #endif
353 
354  /* Process command-line */
355  while((c = getopt(argc,argv,"chir:v")) != -1)
356  {
357  switch(c)
358  {
359  case 'c':
360  break; /* handled by fo_scheduler_connect() */
361  case 'i':
362  PQfinish(pgConn);
363  return(0);
364  case 'r':
365  sprintf(regexStr, "%s", optarg);
366  regexSet = true;
367  break;
368  case 'v':
369  agent_verbose++;
370  break;
371  case 'h':
372  default:
373  Usage(argv[0]);
374  fflush(stdout);
375  PQfinish(pgConn);
376  exit(-1);
377  }
378  }
379 
380  /* Sanity check for regex value required here. */
381  if (!regexSet)
382  {
383  fprintf (stderr, "No regex value has been requested!\n");
384  PQfinish(pgConn);
386  return 1;
387  }
388 
389  /* process filename after switches. How many non-option arguments are there ? */
390  nonoptargs = argc - optind; /* total argument count minus the option count */
391 
392  if (nonoptargs == 0)
393  {
394  /* Assume it was a scheduler call */
395  user_pk = fo_scheduler_userID();
396 
397  while(fo_scheduler_next())
398  {
399  UploadPK = atol(fo_scheduler_current());
400 
401  printf("UploadPK is: %ld\n", UploadPK);
402  sprintf(uploadNum, "%ld", UploadPK);
403  scannedCount = regexScanUpload(uploadNum, regexStr);
404  if (scannedCount == 0)
405  {
406  fprintf(stderr, "Failed to successfully scan: upload - %s!\n", uploadNum);
407  }
408  }
409  }
410  else
411  {
412  /* File access initialization - For Stage 3 use first arg as fileName */
413  sprintf(fileName, "%s", argv[optind]); /* Grab first non-switch argument as filename */
414 
415  scanFilePtr = fopen(fileName, "r");
416  if (!scanFilePtr)
417  {
418  fprintf(stderr, "ERROR: Unable to open '%s'\n", fileName);
419  PQfinish(pgConn);
421  }
422 
423  /* Compile the regex for improved performance */
424  retCode = regcomp(&regex, regexStr, REG_ICASE+REG_EXTENDED);
425  if (retCode)
426  {
427  fprintf(stderr, "regex %s failed to compile\n", regexStr);
428  PQfinish(pgConn);
430  }
431 
432  /* Now call the function that scans a file for a regex */
433  retCode = regexScan(&regex, (char *)regexStr, scanFilePtr, (char *)fileName);
434 // retCode = regexScan(uploadNum, regexStr);
435  if (retCode != 0)
436  {
437  fprintf(stderr, "Failed to successfully scan: %s!\n", fileName);
438  }
439 
440  }
441 
442  PQfinish(pgConn);
444 
445  return 0;
446 } /* main() */
447 
int fo_checkPQresult(PGconn *pgConn, PGresult *result, char *sql, char *FileID, int LineNumb)
Check the result status of a postgres SELECT.
Definition: libfossdb.c:181
Store the results of a regex match.
Definition: scanners.hpp:39
int regexScanUpload(char *uploadNum, char *regexStr)
Scan an upload for a regular expression: gets the list of files in the upload and calls regexScan() on each of them.
Definition: regexscan.c:201
char * fo_scheduler_current()
Get the last read string from the scheduler.
void fo_scheduler_disconnect(int retcode)
Disconnect the scheduler connection.
void fo_scheduler_connect(int *argc, char **argv, PGconn **db_conn)
Establish a connection between an agent and the scheduler.
char SQL[256]
For DB.
Definition: regexscan.c:68
int regexScan(regex_t *regex, char *regexStr, FILE *scanFilePtr, char *fileName)
Scan a file for a regular expression. The regex is compiled once by the caller, for performance, and passed in already compiled.
Definition: regexscan.c:90
FILE * fo_RepFread(char *Type, char *Filename)
Perform an fopen for reading only.
Definition: libfossrepo.c:625
int agent_verbose
Common verbose flags for the agents, this is used so that the scheduler can change the verbose level ...
int pfileNumToNames(char *pfileNum, char *pfileRepoName, char *pfileRealName)
Creates filenames from pfile_pk value.
Definition: regexscan.c:152
PGconn * pgConn
Database connection.
Definition: regexscan.c:78
int fo_scheduler_userID()
Gets the id of the user that created the job that the agent is running.
The main FOSSology C library.
char * fo_scheduler_next()
Get the next data to process from the scheduler.
void Usage(char *Name)
Usage description for this regexscan agent.
Definition: regexscan.c:305
char * fo_sysconfig(const char *sectionname, const char *variablename)
gets a system configuration variable from the configuration data.