FOSSology  3.2.0rc1
Open Source License Compliance by Open Source Software
regexscan-Stage3.c
Go to the documentation of this file.
1 /***************************************************************
2  regexscan: Scan file(s) for regular expression(s)
3 
4  Copyright (C) 2013 Hewlett-Packard Development Company, L.P.
5 
6  This program is free software; you can redistribute it and/or
7  modify it under the terms of the GNU General Public License
8  version 2 as published by the Free Software Foundation.
9 
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13  GNU General Public License for more details.
14 
15  You should have received a copy of the GNU General Public License along
16  with this program; if not, write to the Free Software Foundation, Inc.,
17  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
18  ***************************************************************/
19 
35 #include <stdlib.h>
36 #include <stdio.h>
37 #include <unistd.h>
38 #include <string.h>
39 #include <ctype.h>
40 #include <signal.h>
41 #include <libgen.h>
42 
43 #include <regex.h>
44 #include <stdbool.h>
45 
46 #include "libfossology.h"
47 
48 #define MAXCMD 4096 /* not referenced by any code visible in this listing */
49 char SQL[256]; /* not used by the functions visible here, which build their queries in local buffers */
50 
51 #define myBUFSIZ 2048 /* size of the agent_rev version-string buffer in main() */
52 
53 /*
54 #ifdef COMMIT_HASH
55 char BuildVersion[]="Build version: " COMMIT_HASH ".\n";
56 #endif
57 */
58 
59 PGconn *pgConn = NULL; /* database connection: handed to fo_scheduler_connect() in main() and used by every PQexec() call */
60 
/**
 * \brief Scan an open file, line by line, for a pre-compiled regular expression.
 *
 * Every matching line is reported on stdout with its line number, the
 * 1-based position of the match and the matched text. When no line matches,
 * a single "not found" message is printed instead. Each message is followed
 * by an empty line.
 *
 * The stream is always closed before this function returns.
 *
 * \param regex        Regex already compiled by the caller with regcomp()
 * \param regexStr     Source text of the regex; only used in the "not found" message
 * \param scanFilePtr  Stream opened for reading; closed by this function
 * \param fileName     Name printed in front of every message
 *
 * \return 0 when the whole file was scanned (with or without matches),
 *         1 when regex or scanFilePtr is NULL,
 *         3 when regexec() fails with anything other than REG_NOMATCH
 */
int regexScan(regex_t *regex, char *regexStr, FILE *scanFilePtr, char *fileName)
{
  bool match = false;    /* set once at least one line has matched */
  char errBuff[250];     /* receives the regerror() text */
  char textBuff[2000];   /* line buffer for regex match processing */
  regmatch_t rm[1];
  int lineCount = 0;
  int retCode;

  /* Refuse to run without a regex or a stream instead of crashing in fgets()/regexec() */
  if (regex == NULL || scanFilePtr == NULL)
  {
    fprintf(stderr, "regexScan: missing regex or input stream\n");
    if (scanFilePtr != NULL)
    {
      fclose(scanFilePtr);
    }
    return 1;
  }

  /* Lines longer than the buffer are processed in several pieces, each counted as a line */
  while (fgets(textBuff, sizeof(textBuff), scanFilePtr) != NULL)
  {
    lineCount++;
    retCode = regexec(regex, textBuff, 1, rm, 0); /* nmatch = 1, matchptr = rm */
    if (retCode == 0)
    {
      /* Printed directly: the message holds the file name plus the matched
         text and does not fit a small fixed-size buffer. The second newline
         keeps the empty line that separates the messages. */
      printf("%s: regex found at line %d at position %d. -> %.*s \n\n",
             fileName, lineCount, (int)rm[0].rm_so + 1,
             (int)(rm[0].rm_eo - rm[0].rm_so), textBuff + rm[0].rm_so);
      match = true;
    }
    else if (retCode != REG_NOMATCH)
    {
      /* Anything but "no match" is a genuine matching failure */
      regerror(retCode, regex, errBuff, sizeof(errBuff));
      fprintf(stderr, "Out of memory? - regex match failure: %s\n", errBuff);
      fclose(scanFilePtr);
      return 3;
    }
  }

  /* Report if no matches found */
  if (!match)
  {
    printf("%s: %s not found\n\n", fileName, regexStr);
  }

  /* The compiled regex belongs to the caller; only the stream is released here */
  fclose(scanFilePtr);
  return 0;
}
124 
133 int pfileNumToNames(char *pfileNum, char *pfileRepoName, char *pfileRealName)
134 {
135  char sqlSelect[256];
136  PGresult *result;
137 
138  /* Attempt to locate the appropriate pFile_pk record */
139  sprintf(sqlSelect, "SELECT pfile_sha1, pfile_md5, pfile_size, ufile_name FROM pfile, uploadtree WHERE pfile_fk = pfile_pk and pfile_pk = '%s'", pfileNum);
140  result = PQexec(pgConn, sqlSelect);
141 
142  if (fo_checkPQresult(pgConn, result, sqlSelect, __FILE__, __LINE__)) return 0;
143 
144  /* confirm a sane result set */
145  if (PQntuples(result) == 0)
146  {
147  PQclear(result);
148 
149  /* Not found */
150  fprintf(stderr, "Database does not contain pfile_pk: %s\n", pfileNum);
151  return 1;
152  }
153  else if (PQntuples(result) != 1)
154  {
155  PQclear(result);
156 
157  /* Not found */
158  fprintf(stderr, "Database contains multiple pfile_pk: %s\n", pfileNum);
159  return 2;
160  }
161  /* We've managed to locate the one and only pfile_pk record. Build the filePath string */
162  /* Concatenate first row fields 0, 1 and 2 */
163  sprintf(pfileRepoName, "%s.%s.%s", PQgetvalue(result, 0, 0), PQgetvalue(result, 0, 1), PQgetvalue(result, 0, 2));
164  /* and extract the actual filename from field 4 - uploadtree.ufile_name */
165  sprintf(pfileRealName, "%s", PQgetvalue(result, 0, 3));
166 
167 // fprintf(stderr, "fileName is:%s\n", pFileName);
168  PQclear(result);
169  return 0;
170 }
171 
172 
181 int regexScanUpload(char *uploadNum, char *regexStr)
182 {
183  char sqlSelect[256];
184  PGresult *result, *pfileResult;
185 
186  int fileCount, i, retCode;
187 
188  char fileRealName[1000];
189  char fileRepoName[1000];
190 
191  FILE *scanFilePtr;
192 
193  regex_t regex;
194 
195  /* Ensure uploadNum is "valid" then obtain a list of pfile entries and scan them */
196  sprintf(sqlSelect, "SELECT upload_pk, upload_mode, upload_filename from upload where upload_pk = '%s'", uploadNum);
197  result = PQexec(pgConn, sqlSelect);
198 
199  if (fo_checkPQresult(pgConn, result, sqlSelect, __FILE__, __LINE__)) return 0;
200 
201  /* confirm a sane result set */
202  if (PQntuples(result) == 0)
203  {
204  fprintf(stderr, "No uploads appear to be available here!\n");
205  PQclear(result);
206  return 0; /* nothing found to scan */
207  }
208 
209  /* Next ensure that uploadNum was successfully uploaded */
210  /* We'll only look at upload_pk entries that have successfully run ununpack (64) and adj2nest (32) */
211  if ((atoi(PQgetvalue(result, 0, 1)) & 96) != 96)
212  {
213  fprintf(stderr, "Upload %s was not successfully processed after upload!\n", uploadNum);
214  PQclear(result);
215  return 0; /* nothing found to scan */
216  }
217 
218  /* Now get our list of required pfile entries for this upload */
219  sprintf(sqlSelect, "SELECT uploadtree.pfile_fk, ufile_name from uploadtree, upload"
220  " where upload_fk = upload_pk and uploadtree.pfile_fk <> 0 and ufile_mode = 32768 and upload_pk = '%s'", uploadNum);
221  result = PQexec(pgConn, sqlSelect);
222 
223  if (fo_checkPQresult(pgConn, result, sqlSelect, __FILE__, __LINE__)) return 0;
224 
225  fileCount = PQntuples(result);
226 // fprintf(stderr, "Located %d files to process.\n", fileCount);
227 
228  /* Compile the regex for improved performance */
229  retCode = regcomp(&regex, regexStr, REG_ICASE+REG_EXTENDED);
230  if (retCode)
231  {
232  fprintf(stderr, "regex %s failed to compile\n", regexStr);
233  return 1;
234  }
235 
236  /* Scan the files we've found for this upload */
237  for (i=0; i<fileCount; i++)
238  {
239  /* Attempt to locate the appropriate pFile_pk record */
240  sprintf(sqlSelect, "SELECT pfile_sha1, pfile_md5, pfile_size, ufile_name"
241  " FROM pfile, uploadtree WHERE pfile_fk = pfile_pk and pfile_pk = '%s'", PQgetvalue(result, i, 0));
242  pfileResult = PQexec(pgConn, sqlSelect);
243 
244  if (fo_checkPQresult(pgConn, pfileResult, sqlSelect, __FILE__, __LINE__)) return 0;
245 
246  /* confirm a sane result set */
247  if (PQntuples(pfileResult) == 1)
248  {
249  /* For each pfile value grind through the regex scan process */
250 
251  /* Locate and construct the appropriate full name from pfile table based upon pfile_pk value */
252  if (pfileNumToNames(PQgetvalue(result, i, 0), fileRepoName, fileRealName) != 0)
253  {
254  fprintf(stderr, "ERROR: Unable to locate pfile_pk '%s'\n", PQgetvalue(result, i, 0));
255  return 0;
256  }
257 
258  /* Use fo_RepFread() for access. It uses fo_RepMkPath() to map name to full path. */
259  scanFilePtr = fo_RepFread("files", fileRepoName);
260  if (!scanFilePtr)
261  {
262  fprintf(stderr, "ERROR: Unable to open '%s/%s'\n", "files", fileRepoName);
263  return 0;
264  }
265 
266  /* Call scan function. Note that we'll need to "Humanize" the fileName at some point. */
267  regexScan(&regex, regexStr, scanFilePtr, fileRealName);
268  }
269  else
270  {
271  fprintf(stderr, "WARNING: File: %s - Located %d instances of pfile_pk %s ! Size = %s bytes!\n",
272  PQgetvalue(result, i, 1), PQntuples(pfileResult), PQgetvalue(result, i, 0), PQgetvalue(pfileResult, i, 2));
273  }
274  }
275  /* return the number of scanned files */
276  return i;
277 }
278 
279 
/**
 * \brief Print the command line help of the regexscan agent on stdout.
 *
 * \param Name  Program name shown in the first line (normally argv[0])
 */
void Usage (char *Name)
{
  /* One entry per help line, printed in this order after the usage header */
  static const char *const optionHelp[] = {
    " -i :: initialize the database, then exit.\n",
    " -c SYSCONFDIR :: FOSSology configuration directory.\n",
    " -h :: show available command line options.\n",
    " -v :: increase agent logging verbosity.\n",
    " -r :: regex expression to load from command line.\n",
    " filename :: filename to process with regex.\n"
  };
  size_t idx;

  printf("Usage: %s [options] [id [id ...]]\n", Name);
  for (idx = 0; idx < sizeof(optionHelp) / sizeof(optionHelp[0]); idx++)
  {
    fputs(optionHelp[idx], stdout);
  }
} /* Usage() */
294 
295 /*********************************************************/
296 int main (int argc, char *argv[])
297 {
298  int nonoptargs;
299  int c, retCode;
300 
301  regex_t regex;
302 
303  char regexStr[1024]; /* string storage for the regex expression */
304  bool regexSet = false;
305 
306  char fileName[1000];
307  FILE *scanFilePtr;
308 
309  char uploadNum[10];
310 
311  int scannedCount = 0;
312 
313  int user_pk;
314  long UploadPK=-1;
315 
316  char *COMMIT_HASH;
317  char *VERSION;
318  char agent_rev[myBUFSIZ];
319 
320  /* connect to scheduler. Noop if not run from scheduler. */
321  fo_scheduler_connect(&argc, argv, &pgConn);
322 
323 /*
324  Version reporting.
325 */
326  COMMIT_HASH = fo_sysconfig("regexscan", "COMMIT_HASH");
327  VERSION = fo_sysconfig("regexscan", "VERSION");
328  sprintf(agent_rev, "%s.%s", VERSION, COMMIT_HASH);
329 #ifdef REGEX_DEBUG
330  fprintf(stdout, "regexscan reports version info as '%s.%s'.\n", VERSION, COMMIT_HASH);
331 #endif
332 
333  /* Process command-line */
334  while((c = getopt(argc,argv,"chir:v")) != -1)
335  {
336  switch(c)
337  {
338  case 'c':
339  break; /* handled by fo_scheduler_connect() */
340  case 'i':
341  PQfinish(pgConn);
342  return(0);
343  case 'r':
344  sprintf(regexStr, "%s", optarg);
345  regexSet = true;
346  break;
347  case 'v':
348  agent_verbose++;
349  break;
350  case 'h':
351  default:
352  Usage(argv[0]);
353  fflush(stdout);
354  PQfinish(pgConn);
355  exit(-1);
356  }
357  }
358 
359  /* Sanity check for regex value required here. */
360  if (!regexSet)
361  {
362  fprintf (stderr, "No regex value has been requested!\n");
363  PQfinish(pgConn);
365  return 1;
366  }
367 
368  /* process filename after switches. How many non-option arguments are there ? */
369  nonoptargs = argc - optind; /* total argument count minus the option count */
370 
371  if (nonoptargs == 0)
372  {
373  /* Assume it was a scheduler call */
374  user_pk = fo_scheduler_userID();
375 
376  while(fo_scheduler_next())
377  {
378  UploadPK = atol(fo_scheduler_current());
379 
380  printf("UploadPK is: %ld\n", UploadPK);
381  sprintf(uploadNum, "%ld", UploadPK);
382  scannedCount = regexScanUpload(uploadNum, regexStr);
383  if (scannedCount == 0)
384  {
385  fprintf(stderr, "Failed to successfully scan: upload - %s!\n", uploadNum);
386  }
387  }
388  }
389  else
390  {
391  /* File access initialization - For Stage 3 use first arg as fileName */
392  sprintf(fileName, "%s", argv[optind]); /* Grab first non-switch argument as filename */
393 
394  scanFilePtr = fopen(fileName, "r");
395  if (!scanFilePtr)
396  {
397  fprintf(stderr, "ERROR: Unable to open '%s'\n", fileName);
398  PQfinish(pgConn);
400  }
401 
402  /* Compile the regex for improved performance */
403  retCode = regcomp(&regex, regexStr, REG_ICASE+REG_EXTENDED);
404  if (retCode)
405  {
406  fprintf(stderr, "regex %s failed to compile\n", regexStr);
407  PQfinish(pgConn);
409  }
410 
411  /* Now call the function that scans a file for a regex */
412  retCode = regexScan(&regex, (char *)regexStr, scanFilePtr, (char *)fileName);
413 // retCode = regexScan(uploadNum, regexStr);
414  if (retCode != 0)
415  {
416  fprintf(stderr, "Failed to successfully scan: %s!\n", fileName);
417  }
418 
419  }
420 
421  PQfinish(pgConn);
423 
424  return 0;
425 } /* main() */
426 
int fo_checkPQresult(PGconn *pgConn, PGresult *result, char *sql, char *FileID, int LineNumb)
Check the result status of a postgres SELECT.
Definition: libfossdb.c:181
void Usage(char *Name)
Usage description for this regexscan agent.
Store the results of a regex match.
Definition: scanners.hpp:39
char * fo_scheduler_current()
Get the last read string from the scheduler.
PGconn * pgConn
Database connection.
void fo_scheduler_disconnect(int retcode)
Disconnect the scheduler connection.
void fo_scheduler_connect(int *argc, char **argv, PGconn **db_conn)
Establish a connection between an agent and the scheduler.
FILE * fo_RepFread(char *Type, char *Filename)
Perform an fopen for reading only.
Definition: libfossrepo.c:625
int agent_verbose
Common verbose flags for the agents, this is used so that the scheduler can change the verbose level ...
int fo_scheduler_userID()
Gets the id of the user that created the job that the agent is running.
The main FOSSology C library.
char * fo_scheduler_next()
Get the next data to process from the scheduler.
char SQL[256]
For DB.
int pfileNumToNames(char *pfileNum, char *pfileRepoName, char *pfileRealName)
Creates filenames from pfile_pk value.
int regexScan(regex_t *regex, char *regexStr, FILE *scanFilePtr, char *fileName)
Scan a file for a regex - regular expression. The regex is compiled by the caller and passed in, so it is not recompiled for every file.
char * fo_sysconfig(const char *sectionname, const char *variablename)
gets a system configuration variable from the configuration data.
int regexScanUpload(char *uploadNum, char *regexStr)
Scan an Upload for a regex - regular expression. gets a list of files in an upload and calls regexS...