FOSSology  3.2.0rc1
Open Source License Compliance by Open Source Software
copyrightUtils.cc
Go to the documentation of this file.
1 /*
2  * Copyright (C) 2014-2018, Siemens AG
3  * Author: Daniele Fognini, Johannes Najjar
4  *
5  * This program is free software; you can redistribute it and/or modify
6  * it under the terms of the GNU General Public License version 2
7  * as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
12  * See the GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write to the Free Software Foundation,
16  * Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17  */
23 #include "copyrightUtils.hpp"
24 #include <boost/program_options.hpp>
25 
26 #include <iostream>
27 #include <sstream>
28 
29 using namespace std;
30 
36 int queryAgentId(PGconn* dbConn)
37 {
38  char* COMMIT_HASH = fo_sysconfig(AGENT_NAME, "COMMIT_HASH");
39  char* VERSION = fo_sysconfig(AGENT_NAME, "VERSION");
40  char* agentRevision;
41  if (!asprintf(&agentRevision, "%s.%s", VERSION, COMMIT_HASH))
42  {
43  exit(-1);
44  };
45 
46  int agentId = fo_GetAgentKey(dbConn,
47  AGENT_NAME, 0, agentRevision, AGENT_DESC);
48  free(agentRevision);
49 
50  if (agentId > 0)
51  {
52  return agentId;
53  }
54  else
55  {
56  exit(1);
57  }
58 }
59 
64 int writeARS(int agentId, int arsId, int uploadId, int success, const fo::DbManager& dbManager)
65 {
66  return fo_WriteARS(dbManager.getConnection(), arsId, uploadId, agentId, AGENT_ARS, NULL, success);
67 }
68 
73 void bail(int exitval)
74 {
75  fo_scheduler_disconnect(exitval);
76  exit(exitval);
77 }
78 
89 bool parseCliOptions(int argc, char** argv, CliOptions& dest,
90  std::vector<std::string>& fileNames, std::string& directoryToScan)
91 {
92  unsigned type = 0;
93 
94  boost::program_options::options_description desc(IDENTITY ": recognized options");
95  desc.add_options()
96  ("help,h", "shows help")
97  (
98  "type,T",
99  boost::program_options::value<unsigned>(&type)
100  ->default_value(ALL_TYPES),
101  "type of regex to try"
102  ) // TODO change and add help based on IDENTITY
103  (
104  "verbose,v", "increase verbosity"
105  )
106  (
107  "regex",
108  boost::program_options::value<vector<string> >(),
109  "user defined Regex to search: [{name=cli}@@][{matchingGroup=0}@@]{regex} e.g. 'linux@@1@@(linus) torvalds'"
110  )
111  (
112  "files",
113  boost::program_options::value< vector<string> >(),
114  "files to scan"
115  )
116  (
117  "json,J", "output JSON"
118  )
119  (
120  "ignoreFilesWithMimeType,I", "ignoreFilesWithMimeType"
121  )
122  (
123  "config,c", boost::program_options::value<string>(), "path to the sysconfigdir"
124  )
125  (
126  "scheduler_start", "specifies, that the command was called by the scheduler"
127  )
128  (
129  "userID", boost::program_options::value<int>(), "the id of the user that created the job (only in combination with --scheduler_start)"
130  )
131  (
132  "groupID", boost::program_options::value<int>(), "the id of the group of the user that created the job (only in combination with --scheduler_start)"
133  )
134  (
135  "jobId", boost::program_options::value<int>(), "the id of the job (only in combination with --scheduler_start)"
136  )
137  (
138  "directory,d", boost::program_options::value<string>(), "directory to scan (recursive)"
139  )
140  ;
141 
142  boost::program_options::positional_options_description p;
143  p.add("files", -1);
144 
145  boost::program_options::variables_map vm;
146 
147  try
148  {
149  boost::program_options::store(
150  boost::program_options::command_line_parser(argc, argv).options(desc).positional(p).run(), vm);
151 
152  type = vm["type"].as<unsigned>();
153 
154  if ((vm.count("help") > 0) || (type > ALL_TYPES))
155  {
156  cout << desc << endl;
157  exit(0);
158  }
159 
160  if (vm.count("files"))
161  {
162  fileNames = vm["files"].as<std::vector<string> >();
163  }
164 
165  unsigned long verbosity = vm.count("verbose");
166  bool json = vm.count("json") > 0 ? true : false;
167  bool ignoreFilesWithMimeType = vm.count("ignoreFilesWithMimeType") > 0 ? true : false;
168 
169  dest = CliOptions(verbosity, type, json, ignoreFilesWithMimeType);
170 
171  if (vm.count("regex"))
172  {
173  const std::vector<std::string>& userRegexesFmts = vm["regex"].as<vector<std::string> >();
174  for (auto it = userRegexesFmts.begin(); it != userRegexesFmts.end(); ++it) {
175  scanner* sc = makeRegexScanner(*it, "cli");
176  if (!(sc))
177  {
178  cout << "cannot parse regex format : " << *it << endl;
179  return false;
180  }
181  else
182  {
183  dest.addScanner(sc);
184  }
185  }
186  }
187 
188  if (vm.count("directory"))
189  {
190  if (vm.count("files"))
191  {
192  cout << "cannot pass files and directory at the same time" << endl;
193  cout << desc << endl;
194  fileNames.clear();
195  return false;
196  }
197  directoryToScan = vm["directory"].as<std::string>();
198  }
199 
200  return true;
201  }
202  catch (boost::bad_any_cast&) {
203  cout << "wrong parameter type" << endl;
204  cout << desc << endl;
205  return false;
206  }
207  catch (boost::program_options::error&)
208  {
209  cout << "wrong command line arguments" << endl;
210  cout << desc << endl;
211  return false;
212  }
213 }
214 
220 {
221  unsigned types = state.getCliOptions().getOptType();
222 #ifdef IDENTITY_COPYRIGHT
223  if (types & 1<<0)
224  //state.addMatcher(RegexMatcher(regCopyright::getType(), regCopyright::getRegex()));
225  state.addScanner(new hCopyrightScanner());
226 
227  if (types & 1<<1)
228  state.addScanner(new regexScanner("url", "copyright"));
229 
230  if (types & 1<<2)
231  state.addScanner(new regexScanner("email", "copyright", 1));
232 
233  if (types & 1<<3)
234  state.addScanner(new regexScanner("author", "copyright"));
235 #endif
236 
237 #ifdef IDENTITY_ECC
238  if (types & 1<<0)
239  state.addScanner(new regexScanner("ecc", "ecc"));
240 #endif
241 #ifdef IDENTITY_KW
242  if (types & 1<<0)
243  state.addScanner(new regexScanner("keyword", "keyword"));
244 #endif
245 }
246 
253 scanner* makeRegexScanner(const std::string& regexDesc, const std::string& defaultType) {
254  #define RGX_FMT_SEPARATOR "@@"
255  auto fmtRegex = rx::regex(
256  "(?:([[:alpha:]]+)" RGX_FMT_SEPARATOR ")?(?:([[:digit:]]+)" RGX_FMT_SEPARATOR ")?(.*)",
257  rx::regex_constants::icase
258  );
259 
260  rx::match_results<std::string::const_iterator> match;
261  if (rx::regex_match(regexDesc.begin(), regexDesc.end(), match, fmtRegex))
262  {
263  std::string type(match.length(1) > 0 ? match.str(1) : defaultType.c_str());
264 
265  int regId = match.length(2) > 0 ? std::stoi(std::string(match.str(2))) : 0;
266 
267  if (match.length(3) == 0)
268  return 0; // nullptr
269 
270  std::istringstream stream;
271  stream.str(type + "=" + match.str(3));
272  return new regexScanner(type, stream, regId);
273  }
274  return 0; // nullptr
275 }
276 
285 {
286  CopyrightState state(std::move(cliOptions));
287  addDefaultScanners(state);
288 
289  return state;
290 }
291 
301 bool saveToDatabase(const string& s, const list<match>& matches, unsigned long pFileId, int agentId, const CopyrightDatabaseHandler& copyrightDatabaseHandler)
302 {
303  if (!copyrightDatabaseHandler.begin())
304  {
305  return false;
306  }
307 
308  size_t count = 0;
309  for (auto m = matches.begin(); m != matches.end(); ++m)
310  {
311 
312  DatabaseEntry entry;
313  entry.agent_fk = agentId;
314  entry.content = cleanMatch(s, *m);
315  entry.copy_endbyte = m->end;
316  entry.copy_startbyte = m->start;
317  entry.pfile_fk = pFileId;
318  entry.type = m->type;
319 
320  if (entry.content.length() != 0)
321  {
322  ++count;
323  if (!copyrightDatabaseHandler.insertInDatabase(entry))
324  {
325  copyrightDatabaseHandler.rollback();
326  return false;
327  };
328  }
329  }
330 
331  return copyrightDatabaseHandler.commit();
332 }
333 
342 void matchFileWithLicenses(const string& sContent, unsigned long pFileId, CopyrightState const& state, int agentId, CopyrightDatabaseHandler& databaseHandler)
343 {
344  list<match> l;
345  const list<unptr::shared_ptr<scanner>>& scanners = state.getScanners();
346  for (auto sc = scanners.begin(); sc != scanners.end(); ++sc)
347  {
348  (*sc)->ScanString(sContent, l);
349  }
350  saveToDatabase(sContent, l, pFileId, agentId, databaseHandler);
351 }
352 
366 void matchPFileWithLicenses(CopyrightState const& state, int agentId, unsigned long pFileId, CopyrightDatabaseHandler& databaseHandler)
367 {
368  char* pFile = databaseHandler.getPFileNameForFileId(pFileId);
369 
370  if (!pFile)
371  {
372  cout << "File not found " << pFileId << endl;
373  bail(8);
374  }
375 
376  char* fileName = NULL;
377  {
378 #pragma omp critical (repo_mk_path)
379  fileName = fo_RepMkPath("files", pFile);
380  }
381  if (fileName)
382  {
383  string s;
384  ReadFileToString(fileName, s);
385 
386  matchFileWithLicenses(s, pFileId, state, agentId, databaseHandler);
387 
388  free(fileName);
389  free(pFile);
390  }
391  else
392  {
393  cout << "PFile not found in repo " << pFileId << endl;
394  bail(7);
395  }
396 }
397 
410 bool processUploadId(const CopyrightState& state, int agentId, int uploadId, CopyrightDatabaseHandler& databaseHandler, bool ignoreFilesWithMimeType)
411 {
412  vector<unsigned long> fileIds = databaseHandler.queryFileIdsForUpload(agentId, uploadId, ignoreFilesWithMimeType);
413 
414 #pragma omp parallel
415  {
416  CopyrightDatabaseHandler threadLocalDatabaseHandler(databaseHandler.spawn());
417 
418  size_t pFileCount = fileIds.size();
419 #pragma omp for
420  for (size_t it = 0; it < pFileCount; ++it)
421  {
422  unsigned long pFileId = fileIds[it];
423 
424  if (pFileId == 0)
425  {
426  continue;
427  }
428 
429  matchPFileWithLicenses(state, agentId, pFileId, threadLocalDatabaseHandler);
430 
432  }
433  }
434 
435  return true;
436 }
437 
444 pair<string, list<match>> processSingleFile(const CopyrightState& state,
445  const string fileName)
446 {
447  const list<unptr::shared_ptr<scanner>>& scanners = state.getScanners();
448  list<match> matchList;
449 
450  // Read file into one string
451  string s;
452  if (!ReadFileToString(fileName, s))
453  {
454  // File error
455  s = "";
456  }
457  else
458  {
459  for (auto sc = scanners.begin(); sc != scanners.end(); ++sc)
460  {
461  (*sc)->ScanString(s, matchList);
462  }
463  }
464  return make_pair(s, matchList);
465 }
466 
474 void appendToJson(const std::string fileName,
475  const std::pair<string, list<match>> resultPair, bool &printComma)
476 {
477  Json::Value result;
478 #if JSONCPP_VERSION_HEXA < ((1 << 24) | (4 << 16))
479  // Use FastWriter for versions below 1.4.0
480  Json::FastWriter jsonWriter;
481 #else
482  // Since version 1.4.0, FastWriter is deprecated and replaced with
483  // StreamWriterBuilder
484  Json::StreamWriterBuilder jsonWriter;
485  jsonWriter["commentStyle"] = "None";
486  jsonWriter["indentation"] = "";
487 #endif
488 
489  if (resultPair.first.empty())
490  {
491  result["file"] = fileName;
492  result["results"] = "Unable to read file";
493  }
494  else
495  {
496  list<match> resultList = resultPair.second;
497  Json::Value results;
498  for (auto m : resultList)
499  {
500  Json::Value j;
501  j["start"] = m.start;
502  j["end"] = m.end;
503  j["type"] = m.type;
504  j["content"] = cleanMatch(resultPair.first, m);
505  results.append(j);
506  }
507  result["file"] = fileName;
508  result["results"] = results;
509  }
510  // Thread-Safety: output all matches JSON at once to STDOUT
511 #pragma omp critical (jsonPrinter)
512  {
513  if (printComma)
514  {
515  cout << "," << endl;
516  }
517  else
518  {
519  printComma = true;
520  }
521  string jsonString;
522 #if JSONCPP_VERSION_HEXA < ((1 << 24) | (4 << 16))
523  // For version below 1.4.0, every writer append `\n` at end.
524  // Find and replace it.
525  jsonString = jsonWriter.write(result);
526  jsonString.replace(jsonString.find("\n"), string("\n").length(), "");
527 #else
528  // For version >= 1.4.0, \n is not appended.
529  jsonString = Json::writeString(jsonWriter, result);
530 #endif
531  cout << " " << jsonString << flush;
532  }
533 }
534 
540 void printResultToStdout(const std::string fileName,
541  const std::pair<string, list<match>> resultPair)
542 {
543  if (resultPair.first.empty())
544  {
545  cout << fileName << " :: Unable to read file" << endl;
546  return;
547  }
548  stringstream ss;
549  ss << fileName << " ::" << endl;
550  // Output matches
551  list<match> resultList = resultPair.second;
552  for (auto m = resultList.begin(); m != resultList.end(); ++m)
553  {
554  ss << "\t[" << m->start << ':' << m->end << ':' << m->type << "] '"
555  << cleanMatch(resultPair.first, *m)
556  << "'" << endl;
557  }
558  // Thread-Safety: output all matches (collected in ss) at once to cout
559  cout << ss.str();
560 }
bool ReadFileToString(const string &fileName, string &out)
Utility: read file to string from scanners.h.
Definition: scanners.cc:32
void bail(int exitval)
Disconnect with scheduler returning an error code and exit.
void printResultToStdout(const std::string fileName, const std::pair< string, list< match >> resultPair)
DB wrapper for agents.
const std::list< unptr::shared_ptr< scanner > > & getScanners() const
Get available scanners.
Store the results of a regex match.
Definition: scanners.hpp:39
std::vector< unsigned long > queryFileIdsForUpload(int agentId, int uploadId, bool ignoreFilesWithMimeType)
Get the list of pfile ids on which the given agent has no findings for a given upload.
Definition: database.cc:317
bool commit() const
COMMIT a transaction block in DB.
int s
The socket that the CLI will use to communicate.
Definition: fo_cli.c:48
CopyrightState getState(CliOptions &&cliOptions)
Create a new state for the current agent based on CliOptions.
bool rollback() const
ROLLBACK a transaction block in DB.
void appendToJson(const std::string fileName, const std::pair< string, list< match >> resultPair, bool &printComma)
void fo_scheduler_disconnect(int retcode)
Disconnect the scheduler connection.
bool insertInDatabase(DatabaseEntry &entry) const
Insert a finding in database.
Definition: database.cc:385
char * getPFileNameForFileId(unsigned long pfileId) const
Get the file name of a given pfile id.
int copy_endbyte
Definition: database.hpp:56
Manages database related requests for agent.
Definition: database.hpp:63
PGconn * getConnection() const
void matchFileWithLicenses(const string &sContent, unsigned long pFileId, CopyrightState const &state, int agentId, CopyrightDatabaseHandler &databaseHandler)
Scan a given file with all available scanners and save findings to database.
scanner * makeRegexScanner(const std::string &regexDesc, const std::string &defaultType)
Make a boost regex scanner object based on regex desc and type.
std::string type
Type of statement found.
Definition: database.hpp:54
Provides a regex scanner using predefined regexs.
Definition: regscan.hpp:31
Abstract class to provide interface to scanners.
Definition: scanners.hpp:62
void addScanner(scanner *regexDesc)
Add scanner to CliOptions.
static void addDefaultScanners(CopyrightState &state)
Add default scanners to the agent state.
Holds information about state of one agent.
bool processUploadId(const CopyrightState &state, int agentId, int uploadId, CopyrightDatabaseHandler &databaseHandler, bool ignoreFilesWithMimeType)
Process a given upload id, scan from statements and add to database.
bool parseCliOptions(int argc, char **argv, CliOptions &dest, std::vector< std::string > &fileNames, std::string &directoryToScan)
Parse the options sent by CLI to CliOptions object.
void addScanner(scanner *scanner)
Add scanner to state.
const CliOptions & getCliOptions() const
Get the CliOptions set by user.
bool begin() const
BEGIN a transaction block in DB.
char * fo_RepMkPath(const char *Type, char *Filename)
Given a filename, construct the full path to the file.
Definition: libfossrepo.c:364
fo_dbManager * dbManager
fo_dbManager object
Definition: process.c:28
int queryAgentId(PGconn *dbConn)
Get agent id, exit if agent id is incorrect.
std::string content
Definition: database.hpp:42
int copy_startbyte
Definition: database.hpp:55
FUNCTION int fo_GetAgentKey(PGconn *pgConn, const char *agent_name, long Upload_pk, const char *rev, const char *agent_desc)
Get the latest enabled agent key (agent_pk) from the database.
Definition: libfossagent.c:172
FUNCTION int fo_WriteARS(PGconn *pgConn, int ars_pk, int upload_pk, int agent_pk, const char *tableName, const char *ars_status, int ars_success)
Write ars record.
Definition: libfossagent.c:228
void fo_scheduler_heart(int i)
This function must be called by agents to let the scheduler know they are alive and how many items th...
bool saveToDatabase(const string &s, const list< match > &matches, unsigned long pFileId, int agentId, const CopyrightDatabaseHandler &copyrightDatabaseHandler)
Save findings to the database if agent was called by scheduler.
Store the options sent through the CLI.
int writeARS(int agentId, int arsId, int uploadId, int success, const fo::DbManager &dbManager)
Call C function fo_WriteARS() and translate the arguments.
Implementation of scanner class for copyright.
Definition: copyscan.hpp:29
void matchPFileWithLicenses(CopyrightState const &state, int agentId, unsigned long pFileId, CopyrightDatabaseHandler &databaseHandler)
Get the file contents, scan for statements and save findings to database.
Maps agent data to database schema.
Definition: database.hpp:35
pair< string, list< match > > processSingleFile(const CopyrightState &state, const string fileName)
unsigned int getOptType() const
Get the opt type set by CliOptions.
string cleanMatch(const string &sText, const match &m)
Clean the text based on type.
Definition: cleanEntries.cc:83
char * fo_sysconfig(const char *sectionname, const char *variablename)
gets a system configuration variable from the configuration data.
CopyrightDatabaseHandler spawn() const
Spawn/fork a new database handler and return it.
Definition: database.cc:52