FOSSology  3.2.0rc1
Open Source License Compliance by Open Source Software
parse.c File Reference

searches for licenses More...

#include <ctype.h>
#include "nomos.h"
#include "parse.h"
#include "list.h"
#include "util.h"
#include "nomos_regex.h"
#include "nomos_utils.h"
#include "_autodefs.h"
Include dependency graph for parse.c:

Go to the source code of this file.

Macros

license definitions

Instead of keeping a potentially-growing list of variables used to recall specific flags/text, etc., manage it in an array. A little slower, sure, but it keeps the number of variables we allocate to a more-reasonable minimum.

#define _mGPL   0
 
#define _mLGPL   1
 
#define _mGFDL   2
 
#define _mQPL   3
 
#define _mPYTHON   4
 
#define _mPYTH_TEXT   5
 
#define _mAPACHE   6
 
#define _mHP   7
 
#define _mPHP   8
 
#define _mMIT   9
 
#define _mXOPEN   10
 
#define _mREDHAT   11
 
#define _mISC   12
 
#define _mCMU   13
 
#define _mOSF   14
 
#define _mSUN   15
 
#define _mALADDIN   16
 
#define _mCUPS   17
 
#define _fOPENLDAP   18
 
#define _fBSD   19
 
#define _fGPL   20
 
#define _mCDDL   21
 
#define _mLIBRE   22
 
#define _mGSOAP   23
 
#define _mMPL   24
 
#define _fATTRIB   25
 
#define _fREAL   26
 
#define _fIETF   27
 
#define _fDOC   28
 
#define _fMSCORP   29
 
#define _fW3C   30
 
#define _mAPTANA   31
 
#define _tOPENLDAP   32
 
#define _mNTP   33
 
#define _fIP   34
 
#define _fANTLR   35
 
#define _fCCBY   36
 
#define _fZPL   37
 
#define _fCLA   38
 
#define _fODBL   39
 
#define _fPDDL   40
 
#define _fRUBY   41
 
#define _fSAX   42
 
#define _fAPL   43
 
#define _fARTISTIC   44
 
#define _fCITRIX   45
 
#define _fPURDUE   46
 
#define _fUNICODE   47
 
#define _fOFL   48
 
#define _mAPACHE10   49
 
#define _mAPACHE11   50
 
#define _mWORDNET   51
 
#define _fNCSA   52
 
#define _fTCL   53
 
#define _fIJG   54
 
#define _msize   _fIJG+1
 
micro function definitions

These #define's save a LOT of typing and indentation... :)

#define PARSE_ARGS   filetext, size, isML, isPS
 Arguments to parse.
 
#define LVAL(x)   (ltsr[x] & LTSR_RMASK)
 Check LTSR_RMASK on ltsr[x].
 
#define SEEN(x)   (ltsr[x] & LTSR_SMASK)
 Check LTSR_SMASK on ltsr[x].
 
#define INFILE(x)   fileHasPatt(x, PARSE_ARGS, 0)
 Calls fileHasPatt() with qType 0.
 
#define NOT_INFILE(x)   !( fileHasPatt(x, PARSE_ARGS, 0) && clearLastElementOfLicenceBuffer() )
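 Calls fileHasPatt() with qType 0 and, on a match, clearLastElementOfLicenceBuffer(); yields the negation of the combined result.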
 
#define RM_INFILE(x)   fileHasPatt(x, PARSE_ARGS, 1)
 Calls fileHasPatt() with qType 1.
 
#define GPL_INFILE(x)   fileHasPatt(x, PARSE_ARGS, 2)
 Calls fileHasPatt() with qType 2.
 
#define PERL_INFILE(x)   fileHasPatt(x, PARSE_ARGS, 3)
 Calls fileHasPatt() with qType 3.
 
#define NY_INFILE(x)   fileHasPatt(x, PARSE_ARGS, 4)
 Calls fileHasPatt() with qType 4.
 
#define X_INFILE(x, y)   fileHasPatt(x, PARSE_ARGS, y)
 Calls fileHasPatt() with qType y.
 
#define DEBUG_INFILE(x)   printf(" Regex[%d] = \"%s\"\nINFILE(%d) = %d\n", x, _REGEX(x), x, INFILE(x));
 Debug print.
 
#define HASREGEX(x, cp)   idxGrep(x, cp, REG_ICASE|REG_EXTENDED)
 Calls idxGrep()
 
#define HASREGEX_RI(x, cp)   idxGrep_recordIndex(x, cp, REG_ICASE|REG_EXTENDED)
 Calls idxGrep_recordIndex()
 
#define HASTEXT(x, fl)   idxGrep_recordIndex(x, filetext, REG_ICASE|fl)
 Calls idxGrep_recordIndex()
 
#define URL_INFILE(x)   (INFILE(x) || fileHasPatt(x, PARSE_ARGS, -1))
 Check in file with qType 0, then with qType -1 (raw text) if not found.
 
#define CANSKIP(i, x, y, z)   ((i >= y) && (i <= z) && !(kwbm & (1 << (x - _KW_first))))
 
#define HASKW(x, y)   (x & (1 << (y - _KW_first)))
 
#define TRYGROUP(x)   x(PARSE_ARGS)
 
#define LOWINTEREST(x)   addRef(x, IL_LOW)
 
#define MEDINTEREST(x)   addRef(x, IL_MED)
 
#define INTERESTING(x)   addRef(x, IL_HIGH)
 
#define ASLVERS()   aslVersion(PARSE_ARGS)
 
#define CCVERS()   ccVersion(PARSE_ARGS)
 
#define AFLVERS()   aflVersion(PARSE_ARGS)
 
#define OSLVERS()   oslVersion(PARSE_ARGS)
 
#define CPLVERS()   cplVersion(PARSE_ARGS)
 
#define GPLVERS()   gplVersion(PARSE_ARGS)
 
#define LGPLVERS()   lgplVersion(PARSE_ARGS)
 
#define AGPLVERS()   agplVersion(PARSE_ARGS)
 
#define GFDLVERS()   gfdlVersion(PARSE_ARGS)
 
#define CDDLVERS()   cddlVersion(PARSE_ARGS)
 
#define LPPLVERS()   lpplVersion(PARSE_ARGS)
 
#define MPLVERS()   mplNplVersion(PARSE_ARGS)
 
#define PYTHVERS()   pythonVersion(PARSE_ARGS)
 
#define SISSLVERS()   sisslVersion(PARSE_ARGS)
 
#define REALVERS(x)   realVersion(PARSE_ARGS, x)
 
#define PR_REGEX(x)   printf("check %d = %s\n", x, _REGEX(x));
 
#define mCR_CMU()   (INFILE(_CR_CMU_1) || INFILE(_CR_CMU_2))
 
#define mCR_EDIN()   (INFILE(_CR_EDINBURGH_1) || INFILE(_CR_EDINBURGH_2))
 
#define mCR_FSF()   (INFILE(_CR_FSF1) || INFILE(_CR_FSF2))
 
#define mCR_HP()   (INFILE(_CR_HP_1)|| INFILE(_CR_HP_2) || INFILE(_CR_DEC) || INFILE(_CR_EDS))
 
#define mCR_IETF()   (INFILE(_CR_IETF_1) || INFILE(_CR_IETF_2))
 
#define mCR_MIT()   (INFILE(_CR_MIT1) || INFILE(_CR_MIT2))
 
#define mCR_X11()   (INFILE(_CR_X11) || INFILE(_CR_XFREE86))
 
#define mCR_IPTC()   (INFILE(_CR_IPTC1) || INFILE(_CR_IPTC2))
 
#define SPDXREF()   spdxReference(PARSE_ARGS)
 
#define EXCEPTIONS()   copyleftExceptions(PARSE_ARGS)
 

Functions

void preloadResults (char *filetext, char *ltsr)
 
static int fileHasPatt (int licTextIdx, char *filetext, int size, int isML, int isPS, int qType)
 Checks for a phrase in a file. More...
 
char * parseLicenses (char *filetext, int size, scanres_t *scp, int isML, int isPS)
 Parse a file to check all the possible licenses and add them to matches. More...
 
local static functions

Local (static) Functions

int findPhrase (int index, char *filetext, int size, int isML, int isPS, int qType)
 Check for the presence of a phrase in a file by first searching for the search key provided. More...
 
int famOPENLDAP (char *filetext, int size, int isML, int isPS)
 Utility function to search for OpenLDAP licenses. So many different footprints are used by OpenLDAP, we had to either duplicate code in several places, or funnel it all into one function.
 
int checkUnclassified (char *filetext, int size, int score, int isML, int isPS, int nw)
 This function is called when all the above license-checks don't turn up anything useful. Now we need to determine if the current file likely contains a license or not. More...
 
int checkPublicDomain (char *, int, int, int, int, int)
 
static int dbgIdxGrep (int licTextIdx, char *buf, int show)
 Debugging call for idxGrep() More...
 
void checkCornerCases (char *filetext, int size, int score, int kwbm, int isML, int isPS, int nw, int force)
 If we call this function, we still don't know anything about a license. More...
 
void checkFileReferences (char *filetext, int size, int score, int kwbm, int isML, int isPS)
 Generic license-phrases referring to other files or running commands.
 
void addRef (char *str, int interest)
 This function fills in a character-buffer for a license of a CURRENT file being evaluated, and enqueues a list of components to help make a package-level summary.
 
void locateRegex (char *text, item_t *op, int index, int size, int sso, int seo)
 Locate a regex in a given file. More...
 
void saveRegexLocation (int index, int offset, int length, int saveCache)
 Save a regex in whereList. More...
 
void saveUnclBufLocation (int)
 
void saveLicenseParagraph (char *, int, int, int)
 
char * cplVersion (char *filetext, int size, int isML, int isPS)
 Check for CPL versions. More...
 
static char * gplVersion (char *filetext, int size, int isML, int isPS)
 Check for GPL versions. More...
 
char * lgplVersion (char *filetext, int size, int isML, int isPS)
 Check for LGPL versions. More...
 
char * agplVersion (char *filetext, int size, int isML, int isPS)
 Check for AGPL versions. More...
 
char * gfdlVersion (char *filetext, int size, int isML, int isPS)
 Check for GFDL versions. More...
 
char * lpplVersion (char *filetext, int size, int isML, int isPS)
 Check for LPPL versions. More...
 
char * mplNplVersion (char *filetext, int size, int isML, int isPS)
 Check for MPL|NPL versions. More...
 
char * pythonVersion (char *filetext, int size, int isML, int isPS)
 Check for python versions. More...
 
static char * realVersion (char *filetext, int size, int isML, int isPS, int ref)
 Check for RPSL versions. More...
 
static char * sisslVersion (char *filetext, int size, int isML, int isPS)
 Check for SISSL versions. More...
 
char * aslVersion (char *filetext, int size, int isML, int isPS)
 Check for ASL Apache versions. More...
 
char * cddlVersion (char *filetext, int size, int isML, int isPS)
 Check for CDDL versions. More...
 
char * ccVersion (char *filetext, int size, int isML, int isPS)
 Check for CC_BY-X versions. More...
 
char * oslVersion (char *filetext, int size, int isML, int isPS)
 Check for OSL versions. More...
 
char * aflVersion (char *filetext, int size, int isML, int isPS)
 Check for AFL versions. More...
 
static int match3 (int, char *, int, int, int, int)
 
void spdxReference (char *, int, int, int)
 
void copyleftExceptions (char *, int, int, int)
 

Variables

struct {
   char *   base
 
   int   sso
 
   int   seo
 
   int   index
 
} kludge
 
local variables

File local variables

static char licStr [myBUFSIZ]
 
static char ltsr [NFOOTPRINTS]
 
static char name [256]
 
static char lmem [_msize]
 
static list_t searchList
 
static list_t whereList
 
static list_t whCacheList
 
static int refOffset
 
static int maxInterest
 
static int pd
 
static int crCheck
 
static int checknw
 
static int lDebug = 0
 
static int lDiags = 0
 

Detailed Description

searches for licenses

The main workhorse of nomos. This file contains most of the logic for finding licenses in nomos.

Definition in file parse.c.

Function Documentation

char * aflVersion ( char *  filetext,
int  size,
int  isML,
int  isPS 
)

Check for AFL versions.

Parameters
filetextFile content
sizeFile size
isMLFile is HTML/XML
isPSFile is PostScript
Returns
Return license shortname

Definition at line 7678 of file parse.c.

char * agplVersion ( char *  filetext,
int  size,
int  isML,
int  isPS 
)

Check for AGPL versions.

Parameters
filetextFile content
sizeFile size
isMLFile is HTML/XML
isPSFile is PostScript
Returns
Return license shortname

Definition at line 7904 of file parse.c.

char * aslVersion ( char *  filetext,
int  size,
int  isML,
int  isPS 
)

Check for ASL Apache versions.

Parameters
filetextFile content
sizeFile size
isMLFile is HTML/XML
isPSFile is PostScript
Returns
Return ASL Apache license shortname

Definition at line 7272 of file parse.c.

char * ccVersion ( char *  filetext,
int  size,
int  isML,
int  isPS 
)

Check for CC_BY-X versions.

Parameters
filetextFile content
sizeFile size
isMLFile is HTML/XML
isPSFile is PostScript
Returns
Return license shortname

Definition at line 8354 of file parse.c.

char * cddlVersion ( char *  filetext,
int  size,
int  isML,
int  isPS 
)

Check for CDDL versions.

Parameters
filetextFile content
sizeFile size
isMLFile is HTML/XML
isPSFile is PostScript
Returns
Return license shortname

Definition at line 7758 of file parse.c.

void checkCornerCases ( char *  filetext,
int  size,
int  score,
int  kwbm,
int  isML,
int  isPS,
int  nw,
int  force 
)

If we call this function, we still don't know anything about a license.

In fact, there may be NO license. Look for copyrights, references to the word "trademark", "patent", etc. (and possibly other trivial (or borderline-insignificant) legal stuff in this file).

Todo:
Remove this code block and respective phrase from STRINGS.in later

Trademark detection removed. It gave too many false positives. Code left in place because more experience is needed about the consequences.

Definition at line 10009 of file parse.c.

int checkUnclassified ( char *  filetext,
int  size,
int  score,
int  isML,
int  isPS,
int  nw 
)

This function is called when all the above license-checks don't turn up anything useful. Now we need to determine if the current file likely contains a license or not.

Basic strategy is to look for 4 classes (groups) of words all within the same paragraph-or-two. Here we estimate the size of a paragraph to be 6 lines of legal text. To be conservative, we'll look for 6 contiguous lines ABOVE AND BELOW the line that matches our first search. In other words, we're using "grep -A6 -B6 pattern textfile".

A paragraph containing legal-VERBS, legal-DOCUMENTS, legal-NOUNS, and legal-PERMISSIONS is quite likely to be a license. This doesn't have to be 100% accurate but it IS nice to know whether a file that fails the known-license-footprints really contains a license or not. Knowing so makes the legal department's job easier.

Some text-files are determined by this function to contain some sort of license, but really only deal with the notion of a public-domain claim. If we find one here, report it; this way we don't bother calling the corner-case license-check function.

Definition at line 9533 of file parse.c.

void copyleftExceptions ( char *  filetext,
int  size,
int  isML,
int  isPS 
)

Find copyleft exceptions

Definition at line 11172 of file parse.c.

char * cplVersion ( char *  filetext,
int  size,
int  isML,
int  isPS 
)

Check for CPL versions.

Parameters
filetextFile content
sizeFile size
isMLFile is HTML/XML
isPSFile is PostScript
Returns
Return license shortname

Definition at line 8323 of file parse.c.

static int dbgIdxGrep ( int  licTextIdx,
char *  buf,
int  show 
)
static

Debugging call for idxGrep()

Function calls idxGrep() and prints the regex match using printRegexMatch()

Parameters
licTextIdxlicense index
buf
show
Returns
-1 on regex-compile failure, 1 if regex search fails, and 0 if regex search is successful.

Definition at line 328 of file parse.c.

static int fileHasPatt ( int  licTextIdx,
char *  filetext,
int  size,
int  isML,
int  isPS,
int  qType 
)
static

Checks for a phrase in a file.

Parameters
licTextIdxIndex of phrase to look
filetextContent of file
sizeFile size
isMLFile content is HTML/XML
isPSFile content is PostScript
qType<0, look at raw text. >=0 look in doctored buffers
Returns
True if pattern found

Definition at line 260 of file parse.c.

int findPhrase ( int  index,
char *  filetext,
int  size,
int  isML,
int  isPS,
int  qType 
)

Check for the presence of a phrase in a file by first searching for the search key provided.

Cache the search results, as we are very likely to be looking up the same word/phrase again.

Parameters
indexindex of the phrase to be searched for
filetextthe text to search
sizethe size of file
isMLIs HTML/XML file?
isPSIs postscript file?
qType??
Returns
int ?? 0 means ??

Definition at line 8522 of file parse.c.

char * gfdlVersion ( char *  filetext,
int  size,
int  isML,
int  isPS 
)

Check for GFDL versions.

Parameters
filetextFile content
sizeFile size
isMLFile is HTML/XML
isPSFile is PostScript
Returns
Return license shortname

Definition at line 7982 of file parse.c.

char * gplVersion ( char *  filetext,
int  size,
int  isML,
int  isPS 
)
static

Check for GPL versions.

Parameters
filetextFile content
sizeFile size
isMLFile is HTML/XML
isPSFile is PostScript
Returns
Return license shortname

Definition at line 8157 of file parse.c.

char * lgplVersion ( char *  filetext,
int  size,
int  isML,
int  isPS 
)

Check for LGPL versions.

Parameters
filetextFile content
sizeFile size
isMLFile is HTML/XML
isPSFile is PostScript
Returns
Return license shortname

Definition at line 8048 of file parse.c.

void locateRegex ( char *  text,
item_t *  op,
int  index,
int  size,
int  sso,
int  seo 
)

Locate a regex in a given file.

Function first looks in raw text, then goes for doctored buffer if not found in the file.

Save location using saveRegexLocation()

Parameters
textRaw file to check
opList
indexIndex of regex
sizeSize of file
ssoMatch start
seoMatch end

Definition at line 9012 of file parse.c.

char * lpplVersion ( char *  filetext,
int  size,
int  isML,
int  isPS 
)

Check for LPPL versions.

Parameters
filetextFile content
sizeFile size
isMLFile is HTML/XML
isPSFile is PostScript
Returns
Return license shortname

Definition at line 7798 of file parse.c.

char * mplNplVersion ( char *  filetext,
int  size,
int  isML,
int  isPS 
)

Check for MPL|NPL versions.

Parameters
filetextFile content
sizeFile size
isMLFile is HTML/XML
isPSFile is PostScript
Returns
Return license shortname

Definition at line 7456 of file parse.c.

char * oslVersion ( char *  filetext,
int  size,
int  isML,
int  isPS 
)

Check for OSL versions.

Parameters
filetextFile content
sizeFile size
isMLFile is HTML/XML
isPSFile is PostScript
Returns
Return license shortname

Definition at line 7718 of file parse.c.

char* parseLicenses ( char *  filetext,
int  size,
scanres_t *  scp,
int  isML,
int  isPS 
)

Parse a file to check all the possible licenses and add them to matches.

The function calls fileHasPatt() to check whether the file contains a pattern defined in STRINGS.in. If a match is found, then it can call idxGrep_recordIndex() to check if the file has some additional text and finally adds the license using addRef(). The results found are also stored in licStr as a comma separated list.

The function first checks if a file contains an interesting string which can denote a license. If it is found then the heuristics are done in detail to find the exact license match. For more info please refer to the nomos wiki.

Parameters
filetextFile content
sizeFile size
[out]scpScan results
isMLSource is HTML/XML
isPSSource is PostScript
Returns
Next index in licStr

Definition at line 371 of file parse.c.

char * pythonVersion ( char *  filetext,
int  size,
int  isML,
int  isPS 
)

Check for python versions.

Parameters
filetextFile content
sizeFile size
isMLFile is HTML/XML
isPSFile is PostScript
Returns
Return license shortname

Definition at line 7623 of file parse.c.

char * realVersion ( char *  filetext,
int  size,
int  isML,
int  isPS,
int  ref 
)
static

Check for RPSL versions.

Parameters
filetextFile content
sizeFile size
isMLFile is HTML/XML
isPSFile is PostScript
Returns
Return license shortname

Definition at line 7573 of file parse.c.

void saveRegexLocation ( int  index,
int  offset,
int  length,
int  saveCache 
)

Save a regex in whereList.

Parameters
indexIndex of the regex
offsetRegex match start
lengthRegex match length
saveCacheSet YES to save in whCacheList

Definition at line 9293 of file parse.c.

char * sisslVersion ( char *  filetext,
int  size,
int  isML,
int  isPS 
)
static

Check for SISSL versions.

Parameters
filetextFile content
sizeFile size
isMLFile is HTML/XML
isPSFile is PostScript
Returns
Return SISSL license shortname

Definition at line 7245 of file parse.c.

void spdxReference ( char *  filetext,
int  size,
int  isML,
int  isPS 
)

SPDX license references

Note that many license references have been detected earlier:

  • BSD, CNRI-Python, Intel and MIT variants,
  • CC_BY_SA, CECILL, MPL, ZPL, AGPL, GPL and LGPL versions
  • ICU, TCL

Definition at line 10296 of file parse.c.

Variable Documentation

struct { ... } kludge

Regex match related data

int lDebug = 0
static

set this to non-zero for more debugging

Definition at line 191 of file parse.c.

int lDiags = 0
static

set this to non-zero for printing diagnostics

Definition at line 192 of file parse.c.

char licStr[myBUFSIZ]
static

Detected licenses are stored here in a form ',BSD,MIT' etc

Definition at line 176 of file parse.c.

char ltsr[NFOOTPRINTS]
static

License Text Search Results, a bytemask for each possible match string

Definition at line 178 of file parse.c.

int pd
static

Flag for whether we've checked for a public domain "license"

Definition at line 187 of file parse.c.