FOSSology  3.2.0rc1
Open Source License Compliance by Open Source Software
doctorBuffer_utils.c
Go to the documentation of this file.
1 /***************************************************************
2  Copyright (C) 2006-2014 Hewlett-Packard Development Company, L.P.
3  Copyright (C) 2014, Siemens AG
4 
5  This program is free software; you can redistribute it and/or
6  modify it under the terms of the GNU General Public License
7  version 2 as published by the Free Software Foundation.
8 
9  This program is distributed in the hope that it will be useful,
10  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  GNU General Public License for more details.
13 
14  You should have received a copy of the GNU General Public License along
15  with this program; if not, write to the Free Software Foundation, Inc.,
16  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
17 
18  ***************************************************************/
19 #include "doctorBuffer_utils.h"
20 #define INVISIBLE (int) '\377'
21 
22 #include "nomos.h"
23 #include "list.h"
24 #include "util.h"
25 #include "nomos_regex.h"
26 
36 int compressDoctoredBuffer( char* textBuffer)
37 {
38  int previous=strlen(textBuffer);
39 
40  if(cur.docBufferPositionsAndOffsets)
41  g_array_free(cur.docBufferPositionsAndOffsets, TRUE);
42  cur.docBufferPositionsAndOffsets=collapseInvisible(textBuffer,INVISIBLE);
43 // cur.docBufferPositionsAndOffsets=collapseSpaces(textBuffer);
44 
45  int after = strlen(textBuffer);
46 
47  return previous - after;
48 }
49 
/**
 * \brief Blank out markup delimiters in a buffer while keeping the text
 *
 * Walks the buffer once and overwrites selected characters with a space:
 *  - a '<' that is not followed by '<' or ' ' (unless it starts "<string"),
 *  - a '&' (unless it starts "&copy;"),
 *  - the '>' closing a tag and the ';' closing an entity,
 *  - a '!' directly after a blanked '<' (this also ends the "inside a tag"
 *    state, so text of an HTML comment is not treated as tag content),
 *  - any other '<' or '>' seen outside a tag/entity.
 * Characters inside a tag or entity are currently left untouched.
 *
 * \param[in,out] buf NUL-terminated buffer, modified in place; NULL is a no-op
 */
void removeHtmlComments(char* buf)
{
  int inTag = 0;    /* a tag was opened and its '>' has not been seen yet */
  int inEntity = 0; /* a '&' was seen; cleared by ';' or an end-of-line */
  char* pos;

  if (buf == NULL)
  {
    return;
  }

  for (pos = buf; *pos; pos++)
  {
    /* Start of a tag */
    if ((*pos == '<') && (pos[1] != '<') && (pos[1] != ' '))
    {
#if (DEBUG>5) && defined(DOCTOR_DEBUG)
      int x = strncasecmp(pos, "<string", 7);
      printf("CHECK: %c%c%c%c%c%c%c == %d\n", *pos,
          pos[1], pos[2], pos[3], pos[4],
          pos[5], pos[6], x);
#endif  /* DEBUG>5 && DOCTOR_DEBUG */
      if (strncasecmp(pos, "<string", 7) != 0)
      {
        *pos = ' ';
        if ((pos[1] != '-') || (pos[2] != '-'))
        {
          inTag = 1;
        }
      }
      continue;
    }

    /* Start of a character entity */
    if (*pos == '&')
    {
#if (DEBUG>5) && defined(DOCTOR_DEBUG)
      int x = strncasecmp(pos, "&copy;", 6);
      printf("CHECK: %c%c%c%c%c%c == %d\n", *pos,
          pos[1], pos[2], pos[3], pos[4],
          pos[5], x);
#endif  /* DEBUG>5 && DOCTOR_DEBUG */
      if (strncasecmp(pos, "&copy;", 6) != 0)
      {
        *pos = ' ';
        inEntity = 1;
      }
      continue;
    }

    /* End of a tag */
    if (inTag && (*pos == '>'))
    {
      *pos = ' ';
      inTag = 0;
      continue;
    }

    /* End of an entity */
    if (inEntity && (*pos == ';'))
    {
      *pos = ' ';
      inEntity = 0;
      continue;
    }

    /* An entity never spans an end-of-line */
    if (isEOL(*pos))
    {
      inEntity = 0;
      continue;
    }

    /* Don't remove text in an HTML comment (e.g., turn the flag off) */
    if ((*pos == '!') && inTag && (pos != buf) && (pos[-1] == ' '))
    {
      *pos = ' ';
      inTag = 0;
      continue;
    }

    /* Content of a tag/entity is deliberately kept as it is */
    if (inTag || inEntity)
    {
      continue;
    }

    /* Stray angle brackets outside of any tag */
    if ((*pos == '<') || (*pos == '>'))
    {
      *pos = ' ';
    }
  }
}
124 
131 void removeLineComments(char* buf)
132 {
133  char* cp;
134  char* MODULE_LICENSE = "MODULE_LICENSE";
135  cp = buf;
136  while (idxGrep(_UTIL_BOL_MAGIC, cp, REG_ICASE | REG_NEWLINE | REG_EXTENDED))
137  {
138 #ifdef DOCTOR_DEBUG
139  dumpMatch(cp, "Found \"comment\"-text");
140 #endif /* DOCTOR_DEBUG */
141  cp += cur.regm.rm_so;
142  switch (*cp)
143  {
144  case '>':
145  *cp++ = ' ';
146  break;
147  case '@': /* texi special processing */
148  *cp++ = INVISIBLE;
149  if (strncasecmp(cp, "author", 6) == 0)
150  {
151  (void) memset(cp, ' ', 6);
152  cp += 6;
153  }
154  else if (strncasecmp(cp, "comment", 7) == 0)
155  {
156  (void) memset(cp, ' ', 7);
157  cp += 7;
158  }
159  else if (strncasecmp(cp, "center", 6) == 0)
160  {
161  (void) memset(cp, ' ', 6);
162  cp += 6;
163  }
164  else if (strncasecmp(cp, "rem", 3) == 0)
165  {
166  (void) memset(cp, ' ', 3);
167  cp += 3;
168  }
169  else if (*cp == 'c')
170  {
171  *cp++ = INVISIBLE;
172  if (strncasecmp(cp, " essay", 6) == 0)
173  {
174  (void) memset(cp, ' ', 6);
175  cp += 6;
176  }
177  }
178  break;
179  case '/': /* c++ style comment // */
180  if (cp && cp[0])
181  {
183  if (strstr(cp, MODULE_LICENSE) && '/' == cp[0])
184  {
185  (void) memset(cp, INVISIBLE, strlen(cp));
186  cp += strlen(cp);
187  }
188  else
189  {
190  (void) memset(cp, INVISIBLE, 2);
191  cp += 2;
192  }
193  }
194  break;
195  case '\\': /* c++ style comment // */
196  if (strncasecmp(cp + 1, "par ", 3) == 0)
197  {
198  (void) memset(cp, ' ', 4);
199  }
200  cp += 4;
201  break;
202  case 'r':
203  case 'R': /* rem */
204  case 'd':
205  case 'D': /* dnl */
206  (void) memset(cp, INVISIBLE, 3);
207  cp += 3;
208  break;
209  case 'x':
210  case 'X': /* xcomm */
211  (void) memset(cp, INVISIBLE, 5);
212  cp += 5;
213  break;
214  case 'c':
215  case 'C': /* comment */
216  (void) memset(cp, INVISIBLE, 7);
217  cp += 7;
218  break;
219  case '%': /* %%copyright: */
220  (void) memset(cp, INVISIBLE, 12);
221  cp += 12;
222  break;
223  }
224  }
225 }
226 
231 void cleanUpPostscript(char* buf)
232 {
233  char* cp;
234  char* x;
235  cp = buf;
236  while (idxGrep(_UTIL_POSTSCR, cp, REG_EXTENDED | REG_NEWLINE))
237  {
238 #ifdef DOCTOR_DEBUG
239  dumpMatch(cp, "FOUND postscript-thingy");
240 #endif /* DOCTOR_DEBUG */
241  x = cp + cur.regm.rm_so;
242  cp += cur.regm.rm_eo;
243  while (x < cp)
244  {
245  *x++ = ' '/*INVISIBLE*/;
246  }
247  }
248 }
249 
/**
 * \brief Remove groff/troff font-size indicators, the literal string
 *        backslash-n and all backslashes
 *
 * Every backslash is overwritten with a space. If the backslash introduces a
 * font-size indicator ("\s", an optional '+' or '-', then any number of
 * digits) or the two-character sequence "\n", the whole sequence is
 * overwritten with spaces. The buffer length does not change.
 *
 * \param[in,out] buf NUL-terminated buffer, modified in place
 */
void removeBackslashesAndGTroffIndicators(char* buf)
{
  char* cp;
  char* x;

  for (cp = buf; *cp; cp++)
  {
    if (*cp != '\\')
    {
      continue;
    }

    /* x ends up one past the last character belonging to the sequence */
    x = cp + 1;
    if (*x == 's')
    {
      x++;
      if ((*x == '+') || (*x == '-'))
      {
        x++;
      }
      /* <ctype.h> functions need a value representable as unsigned char */
      while (isdigit((unsigned char) *x))
      {
        x++;
      }
    }
    else if (*x == 'n')
    {
      x++;
    }
    memset(cp, /*INVISIBLE*/' ', (size_t) (x - cp));
  }
}
286 
297 {
298  char* cp;
299  for (cp = buf; /*cp < end &&*/*cp; cp++)
300  {
301  if ((*cp == '\302') && (*(cp + 1) == '\251'))
302  {
303  cp += 1;
304  continue;
305  }
306  if (*cp & (char) 0x80)
307  {
308  *cp = INVISIBLE;
309  continue;
310  }
311  switch (*cp)
312  {
313  /*
314  Convert eol-characters AND some other miscellaneous
315  characters into spaces (due to comment-styles, etc.)
316  */
317  case '\a':
318  case '\t':
319  case '\n':
320  case '\r':
321  case '\v':
322  case '\f':
323  case '[':
324  case ']':
325  case '{':
326  case '}':
327  case '*':
328  case '=':
329  case '#':
330  case '$':
331  case '|':
332  case '%':
333  case '!':
334  case '?':
335  case '`':
336  case '"':
337  case '\'':
338  *cp = ' ';
339  break;
340  /* allow + only within the regex " [Mm]\+ " */
341  case '+':
342  if (*(cp + 1) == 0 || *(cp + 1) == ' ' || *(cp + 1) == '\t' || *(cp + 1) == '\n' || *(cp + 1) == '\r')
343  break;
344  else if (cp > buf + 1 && (*(cp - 1) == 'M' || *(cp - 1) == 'm') && *(cp - 2) == ' ' && *(cp + 1) == ' ')
345  {
346  /* no-op */
347  }
348  else
349  {
350  *cp = ' ';
351  }
352  break;
353  case '(':
354  if ((*(cp + 1) == 'C' || *(cp + 1) == 'c') && *(cp + 2) == ')')
355  {
356  cp += 2;
357  continue;
358  }
359  else
360  {
361  *cp = ' ';
362  }
363  break;
364  case ')':
365  case ',':
366  case ':':
367  case ';':
368  if (!isCR)
369  {
370  *cp = ' ';
371  }
372  break;
373  case '.':
374  if (!isCR)
375  {
376  *cp = INVISIBLE;
377  }
378  break;
379  case '<':
380  if (strncasecmp(cp, "<string", 7) == 0)
381  {
382  (void) strncpy(cp, " ", 7);
383  }
384  break;
385  /* CDB - Big #ifdef 0 left out */
386  case '\001':
387  case '\002':
388  case '\003':
389  case '\004':
390  case '\005':
391  case '\006':
392  case '\016':
393  case '\017':
394  case '\020':
395  case '\021':
396  case '\022':
397  case '\023':
398  case '\024':
399  case '\025':
400  case '\026':
401  case '\027':
402  case '\030':
403  case '\031':
404  case '\032':
405  case '\033':
406  case '\034':
407  case '\035':
408  case '\036':
409  case '\037':
410  case '~':
411  *cp = INVISIBLE;
412  break;
413 #ifdef DOCTOR_DEBUG
414  case ' ': case '/': case '-': case '@': case '&':
415  case '>': case '^': case '_':
416  case INVISIBLE:
417  break;
418  default:
419  if (!isalpha(*cp) && !isdigit(*cp))
420  {
421  printf("DEBUG: \\0%o @ %ld\n",
422  *cp & 0xff, cp-buf);
423  }
424  break;
425 #endif /* DOCTOR_DEBUG */
426  }
427  }
428 }
429 
437 void dehyphen(char* buf)
438 {
439  char* cp;
440 
441  for (cp = buf; idxGrep(_UTIL_HYPHEN, cp, REG_ICASE); /*nada*/)
442  {
443 #ifdef DOCTOR_DEBUG
444  char* x;
445  x = cp + cur.regm.rm_so;
446  while ((x > cp) && !isspace(*x))
447  {
448  x--;
449  }
450  printf("Hey! hyphenated-word [");
451  for (++x; x <= (cp + cur.regm.rm_eo); x++)
452  {
453  printf("%c", *x);
454  }
455  while (!isspace(*x))
456  {
457  printf("%c", *x++);
458  }
459  printf("]\n");
460 
461 #endif /* DOCTOR_DEBUG */
462  cp += cur.regm.rm_so + 1;
463  *cp++ = INVISIBLE;
464  while (isspace(*cp))
465  {
466  *cp++ = INVISIBLE;
467  }
468  }
469 
470 }
471 
478 void removePunctuation(char* buf)
479 {
480  char* cp;
481  char* x;
482  for (cp = buf; idxGrep(_UTIL_MISCPUNCT, cp, REG_EXTENDED); /*nada*/)
483  {
484  x = cp + cur.regm.rm_so;
485  cp += cur.regm.rm_eo - 1; /* leave ' ' alone */
486  while (x < cp)
487  {
488  *x++ = ' ';
489  }
490  cp++;
491  }
492  for (cp = buf; idxGrep(_UTIL_LATEX, cp, REG_ICASE); /*nada*/)
493  {
494  x = cp + cur.regm.rm_so;
495  cp += cur.regm.rm_eo;
496  while (x <= cp)
497  {
498  *x++ = ' ';
499  }
500  cp++;
501  }
502 }
503 
516 void ignoreFunctionCalls(char* buf)
517 {
518  char* cp;
519  char* x;
520  for (cp = buf; idxGrep(_UTIL_PRINT, cp, REG_ICASE); /*nada*/)
521  {
522  x = cp + cur.regm.rm_so;
523  cp += (cur.regm.rm_eo - 1);
524  if ((x > buf) && ((*(x - 1) == 'r') || (*(x - 1) == 't')))
525  {
526  continue;
527  }
528  while (x < cp)
529  {
530  *x++ = ' ';
531  }
532  cp++;
533  }
534 }
535 
541 void convertSpaceToInvisible(char* buf)
542 {
543  char* cp;
544  for (cp = buf; *cp; /*nada*/)
545  {
546  if (*cp++ == ' ')
547  {
548  while (*cp)
549  {
550  if (*cp == ' ')
551  {
552  *cp++ = INVISIBLE;
553  }
554  else if (*cp == INVISIBLE)
555  {
556  cp++;
557  }
558  else
559  {
560  break;
561  }
562  }
563  }
564  }
565 }
566 
/**
 * \brief Convert a buffer of multiple stuff to text-only, separated by spaces
 *
 * We really only care about text "in a license" here, so comments and other
 * unwanted punctuation are stripped out. The individual steps are delegated
 * to the helper functions of this file and run in a fixed order.
 *
 * \param[in,out] buf  NUL-terminated buffer, modified in place
 * \param[in]     isML When non-zero, markup is handled by removeHtmlComments()
 * \param[in]     isPS When non-zero, cleanUpPostscript() is run
 * \param[in]     isCR Passed on to
 *                     convertWhitespaceToSpaceAndRemoveSpecialChars()
 */
void doctorBuffer(char *buf, int isML, int isPS, int isCR)
{
#if defined(PROC_TRACE) || defined(DOCTOR_DEBUG)
  traceFunc("== doctorBuffer(%p, %d, %d, %d)\n", buf, isML, isPS, isCR);
#endif  /* PROC_TRACE || DOCTOR_DEBUG */

  /*
   * convert a buffer of multiple *stuff* to text-only, separated by spaces
   * We really only care about text "in a license" here, so strip out
   * comments and other unwanted punctuation.
   */
#ifdef  DOCTOR_DEBUG
  printf("***** Processing %p (%d data bytes)\n", buf, (int)strlen(buf));
  printf("----- [Dr-BEFORE:] -----\n%s\n[==END==]\n", buf);
#endif  /* DOCTOR_DEBUG */
  /*
   * step 1: take care of embedded HTML/XML and special HTML-chars like
   * &quot; and &nbsp; -- but DON'T remove the text in an HTML comment.
   * There might be licensing text/information in the comment!
   *****
   * Later on (in parseLicenses()) we search for URLs in the raw-text
   */
  if (isML)
  {
#ifdef  DOCTOR_DEBUG
    printf("DEBUG: markup-languange directives found!\n");
#endif  /* DOCTOR_DEBUG */
    removeHtmlComments(buf);
  }
  /*
   * step 2: remove comments that start at the beginning of a line, like
   * ^dnl, ^xcomm, ^comment, and //
   */
  removeLineComments(buf);
  /*
   * Step 3 - strip out crap at end-of-line on postscript documents
   */
  if (isPS)
  {
    cleanUpPostscript(buf);
#ifdef  DOCTOR_DEBUG
    printf("DEBUG: postscript stuff detected!\n");
#endif  /* DOCTOR_DEBUG */
  }
  /*
   * - step 4: remove groff/troff font-size indicators, the literal
   *     string backslash-n and all backslashes, ala:
   *==>  perl -pe 's,\\s[+-][0-9]*,,g;s,\\s[0-9]*,,g;s/\\n//g;' |
   */
  removeBackslashesAndGTroffIndicators(buf);
  /*
   * - step 5: convert white-space to real spaces, and remove
   *     unnecessary punctuation, ala:
   *==>  tr -d '*=+#$|%.,:;!?()\\][\140\047\042' | tr '\011\012\015' ' '
   *****
   * NOTE: we purposely do NOT process backspace-characters here.  Perhaps
   * there's an improvement in the wings for this?
   */
  convertWhitespaceToSpaceAndRemoveSpecialChars(buf, isCR);
  /*
   * Look for hyphenations of words, to compress both halves into a sin-
   * gle (sic) word.  Regex == "[a-z]- [a-z]".
   *****
   * NOTE: not sure this will work based on the way we strip punctuation
   * out of the buffer above -- work on this later.
   */
  dehyphen(buf);
  /*
   * - step 6: clean up miscellaneous punctuation, ala:
   *==>  perl -pe 's,[-_/]+ , ,g;s/print[_a-zA-Z]* //g;s/  / /g;'
   */
  removePunctuation(buf);
  /*
   * Ignore function calls to print routines: only concentrate on what's being
   * printed (sometimes programs do print licensing information) -- but don't
   * ignore real words that END in 'print', like footprint and fingerprint.
   * Here, we take a risk and just look for a 't' (in "footprint"), or for an
   * 'r' (in "fingerprint").  If someone has ever coded a print routine that
   * is named 'rprint' or 'tprint', we're spoofed.
   */
  ignoreFunctionCalls(buf);
  /*
   * Convert the regex ' [X ]+' (where X is really the character #defined as
   * INVISIBLE) to a single space (and a string of INVISIBLE characters).
   */
  convertSpaceToInvisible(buf);
  /*
   * garbage collect: eliminate all INVISIBLE characters in the buffer
   */
#ifdef  DOCTOR_DEBUG
  int n =
#else
  (void)
#endif
  compressDoctoredBuffer(buf);

#ifdef  DOCTOR_DEBUG
  printf("***** Now buffer %p contains %d bytes (%d clipped)\n", buf,
      (int)strlen(buf), n);
  printf("+++++ [Dr-AFTER] +++++:\n%s\n[==END==]\n", buf);
#endif  /* DOCTOR_DEBUG */
  return;
}
695 
#ifdef DOCTORBUFFER_OLD
/**
 * \brief Legacy single-function implementation of doctorBuffer()
 *
 * Only compiled when DOCTORBUFFER_OLD is defined. Performs all clean-up steps
 * inline: markup handling, line comments, postscript, groff/troff indicators,
 * white-space and punctuation, de-hyphenation, print-routine names, and
 * finally the removal of all INVISIBLE characters.
 *
 * \param[in,out] buf  NUL-terminated buffer, modified (and shortened) in place
 * \param[in]     isML When non-zero, markup delimiters are blanked first
 * \param[in]     isPS When non-zero, matches of _UTIL_POSTSCR are blanked
 * \param[in]     isCR When non-zero, ') , : ; .' are left untouched in step 5
 */
void doctorBuffer_old(char *buf, int isML, int isPS, int isCR)
{
  printf("Doctor Buffer old \n");
  char *cp;
  char *x;
  int f = 0;
  int g = 0;
  int n;
  char *MODULE_LICENSE = "MODULE_LICENSE";

#if defined(PROC_TRACE) || defined(DOCTOR_DEBUG)
  traceFunc("== doctorBuffer(%p, %d, %d, %d)\n", buf, isML, isPS, isCR);
#endif  /* PROC_TRACE || DOCTOR_DEBUG */

  /*
   * convert a buffer of multiple *stuff* to text-only, separated by spaces
   * We really only care about text "in a license" here, so strip out
   * comments and other unwanted punctuation.
   */
#ifdef  DOCTOR_DEBUG
  printf("***** Processing %p (%d data bytes)\n", buf, (int)strlen(buf));
  printf("----- [Dr-BEFORE:] -----\n%s\n[==END==]\n", buf);
#endif  /* DOCTOR_DEBUG */
  /*
   * step 1: take care of embedded HTML/XML and special HTML-chars like
   * &quot; and &nbsp; -- but DON'T remove the text in an HTML comment.
   * There might be licensing text/information in the comment!
   *****
   * Later on (in parseLicenses()) we search for URLs in the raw-text
   */
  if (isML) {
#ifdef  DOCTOR_DEBUG
    printf("DEBUG: markup-languange directives found!\n");
#endif  /* DOCTOR_DEBUG */
    f = 0;
    g = 0;
    for (cp = buf; cp && *cp; cp++) {
      if ((*cp == '<') &&
          (*(cp+1) != '<') &&
          (*(cp+1) != ' ')) {
#if (DEBUG>5) && defined(DOCTOR_DEBUG)
        int cmp = strncasecmp(cp, "<string", 7);
        printf("CHECK: %c%c%c%c%c%c%c == %d\n", *cp,
            *(cp+1), *(cp+2), *(cp+3), *(cp+4),
            *(cp+5), *(cp+6), cmp);
#endif  /* DEBUG>5 && DOCTOR_DEBUG */
        if (strncasecmp(cp, "<string", 7)) {
          *cp = ' ';
          if (*(cp+1) != '-' || *(cp+2) != '-') {
            f = 1;
          }
        }
      } else if (*cp == '&') {
#if (DEBUG>5) && defined(DOCTOR_DEBUG)
        int cmp = strncasecmp(cp, "&copy;", 6);
        printf("CHECK: %c%c%c%c%c%c == %d\n", *cp,
            *(cp+1), *(cp+2), *(cp+3), *(cp+4),
            *(cp+5), cmp);
#endif  /* DEBUG>5 && DOCTOR_DEBUG */
        if (strncasecmp(cp, "&copy;", 6)) {
          *cp = ' ';
          g = 1;
        }
      } else if (f && (*cp == '>')) {
        *cp = ' ';
        f = 0;
      } else if (g && (*cp == ';')) {
        *cp = ' ';
        g = 0;
      } else if (isEOL(*cp)) {
        g = 0;
      }
      /* Don't remove text in an HTML comment (e.g., turn the flag off) */
      else if ((*cp == '!') &&
          f &&
          (cp != buf) &&
          (*(cp-1) == ' ')) {
        *cp = ' ';
        f = 0;
      } else if (f || g) {
        /* content of a tag/entity is deliberately kept as it is */
      } else if ((*cp == '<') || (*cp == '>')) {
        *cp = ' ';
      }
    }
  }
  /*
   * step 2: remove comments that start at the beginning of a line, like
   * ^dnl, ^xcomm, ^comment, and //
   */
  cp = buf;
  while (idxGrep(_UTIL_BOL_MAGIC, cp, REG_ICASE|REG_NEWLINE|REG_EXTENDED)) {
#ifdef  DOCTOR_DEBUG
    dumpMatch(cp, "Found \"comment\"-text");
#endif  /* DOCTOR_DEBUG */
    cp += cur.regm.rm_so;
    switch (*cp) {
      case '>':
        *cp++ = ' ';
        break;
      case '@': /* texi special processing */
        *cp++ = INVISIBLE;
        if (strncasecmp(cp, "author", 6) == 0) {
          (void) memset(cp, ' ', 6);
          cp += 6;
        } else if (strncasecmp(cp, "comment", 7) == 0) {
          (void) memset(cp, ' ', 7);
          cp += 7;
        } else if (strncasecmp(cp, "center", 6) == 0) {
          (void) memset(cp, ' ', 6);
          cp += 6;
        }
        else if (strncasecmp(cp, "rem", 3) == 0) {
          (void) memset(cp, ' ', 3);
          cp += 3;
        } else if (*cp == 'c') {
          *cp++ = INVISIBLE;
          if (strncasecmp(cp, " essay", 6) == 0) {
            (void) memset(cp, ' ', 6);
            cp += 6;
          }
        }
        break;
      case '/': /* c++ style comment // */
        if(cp && cp[0])
        {
          if (strstr(cp, MODULE_LICENSE) && '/' == cp[0])
          {
            (void) memset(cp, INVISIBLE, strlen(cp));
            cp += strlen(cp);
          }
          else {
            (void) memset(cp, INVISIBLE, 2);
            cp += 2;
          }
        }
        break;
      case '\\': /* backslash directive; "\par" is blanked, 4 bytes skipped */
        if (strncasecmp(cp+1, "par ", 3) == 0) {
          (void) memset(cp, ' ', 4);
        }
        cp += 4;
        break;
      case 'r':
      case 'R': /* rem */
      case 'd':
      case 'D': /* dnl */
        (void) memset(cp, INVISIBLE, 3);
        cp += 3;
        break;
      case 'x':
      case 'X': /* xcomm */
        (void) memset(cp, INVISIBLE, 5);
        cp += 5;
        break;
      case 'c':
      case 'C': /* comment */
        (void) memset(cp, INVISIBLE, 7);
        cp += 7;
        break;
      case '%': /* %%copyright: */
        (void) memset(cp, INVISIBLE, 12);
        cp += 12;
        break;
    }
  }
  /*
   * Step 3 - strip out crap at end-of-line on postscript documents
   */
  if (isPS) {
#ifdef  DOCTOR_DEBUG
    printf("DEBUG: postscript stuff detected!\n");
#endif  /* DOCTOR_DEBUG */
    cp = buf;
    while (idxGrep(_UTIL_POSTSCR, cp, REG_EXTENDED|REG_NEWLINE)) {
#ifdef  DOCTOR_DEBUG
      dumpMatch(cp, "FOUND postscript-thingy");
#endif  /* DOCTOR_DEBUG */
      x = cp + cur.regm.rm_so;
      cp += cur.regm.rm_eo;
      while (x < cp) {
        *x++ = ' '/*INVISIBLE*/;
      }
    }
  }
  /*
   * - step 4: remove groff/troff font-size indicators, the literal
   *     string backslash-n and all backslashes, ala:
   *==>  perl -pe 's,\\s[+-][0-9]*,,g;s,\\s[0-9]*,,g;s/\\n//g;' |
   */
  for (cp = buf; *cp; cp++) {
    if (*cp == '\\') {
      x = cp + 1;
      if (*x && (*x == 's')) {
        x++;
        if (*x && ((*x == '+') || (*x == '-'))) {
          x++;
        }
        /* <ctype.h> functions need a value representable as unsigned char */
        while (*x && isdigit((unsigned char) *x)) {
          x++;
        }
      } else if (*x && *x == 'n') {
        x++;
      }
      memset(cp, /*INVISIBLE*/' ', (size_t) (x-cp));
    }
  }
  /*
   * - step 5: convert white-space to real spaces, and remove
   *     unnecessary punctuation, ala:
   *==>  tr -d '*=+#$|%.,:;!?()\\][\140\047\042' | tr '\011\012\015' ' '
   *****
   * NOTE: we purposely do NOT process backspace-characters here.  Perhaps
   * there's an improvement in the wings for this?
   */
  for (cp = buf; /*cp < end &&*/ *cp; cp++) {
    if ((*cp == '\302') && (*(cp+1) == '\251')) {
      /*
       * Keep the two-byte UTF-8 copyright sign. Advance by one only: the
       * loop increment skips the second byte. Advancing by two would skip
       * the following character and could step over the terminating NUL.
       */
      cp += 1;
      continue;
    }
    if (*cp & (char) 0x80) {
      *cp = INVISIBLE;
      continue;
    }
    switch (*cp) {
      /*
        Convert eol-characters AND some other miscellaneous
        characters into spaces (due to comment-styles, etc.)
       */
      case '\a': case '\t': case '\n': case '\r':
      case '\v': case '\f': case '[': case ']':
      case '{': case '}': case '*': case '=':
      case '#': case '$': case '|': case '%': case '!':
      case '?': case '`': case '"': case '\'':
        *cp = ' ';
        break;
        /* allow + only within the regex " [Mm]\+ " */
      case '+':
        if (*(cp+1) == 0 || *(cp+1) == ' ' || *(cp+1) == '\t' || *(cp+1) == '\n' || *(cp+1) == '\r') break;
        else if (cp > buf+1 && (*(cp-1) == 'M' ||
            *(cp-1) == 'm') && *(cp-2) == ' ' &&
            *(cp+1) == ' ') {
          f = 0; /* no-op */
        }
        else {
          *cp = ' ';
        }
        break;
      case '(':
        if ((*(cp+1) == 'C' || *(cp+1) == 'c') &&
            *(cp+2) == ')') {
          cp += 2;
          continue;
        }
        else {
          *cp = ' ';
        }
        break;
      case ')': case ',': case ':': case ';':
        if (!isCR) {
          *cp = ' ';
        }
        break;
      case '.':
        if (!isCR) {
          *cp = INVISIBLE;
        }
        break;
      case '<':
        if (strncasecmp(cp, "<string", 7) == 0) {
          /*
           * Blank all seven matched bytes. strncpy() from a shorter source
           * would pad with NUL bytes and cut the buffer off at this point.
           */
          (void) memset(cp, ' ', 7);
        }
        break;
        /* CDB - Big #ifdef 0 left out */
      case '\001': case '\002': case '\003': case '\004':
      case '\005': case '\006': case '\016': case '\017':
      case '\020': case '\021': case '\022': case '\023':
      case '\024': case '\025': case '\026': case '\027':
      case '\030': case '\031': case '\032': case '\033':
      case '\034': case '\035': case '\036': case '\037':
      case '~':
        *cp = INVISIBLE;
        break;
#ifdef  DOCTOR_DEBUG
      case ' ': case '/': case '-': case '@': case '&':
      case '>': case '^': case '_':
      case INVISIBLE:
        break;
      default:
        if (!isalpha((unsigned char) *cp) && !isdigit((unsigned char) *cp)) {
          printf("DEBUG: \\0%o @ %ld\n",
              (unsigned) (*cp & 0xff), (long) (cp-buf));
        }
        break;
#endif  /* DOCTOR_DEBUG */
    }
  }
  /*
   * Look for hyphenations of words, to compress both halves into a sin-
   * gle (sic) word.  Regex == "[a-z]- [a-z]".
   *****
   * NOTE: not sure this will work based on the way we strip punctuation
   * out of the buffer above -- work on this later.
   */
  for (cp = buf; idxGrep(_UTIL_HYPHEN, cp, REG_ICASE); /*nada*/) {
#ifdef  DOCTOR_DEBUG
    x = cp + cur.regm.rm_so;
    while ((x > cp) && !isspace((unsigned char) *x)) {
      x--;
    }
    printf("Hey! hyphenated-word [");
    /* never print or step beyond the terminating NUL */
    for (++x; (x <= (cp + cur.regm.rm_eo)) && *x; x++) {
      printf("%c", *x);
    }
    while (*x && !isspace((unsigned char) *x)) {
      printf("%c", *x++);
    }
    printf("]\n");

#endif  /* DOCTOR_DEBUG */
    cp += cur.regm.rm_so + 1;
    *cp++ = INVISIBLE;
    while (isspace((unsigned char) *cp)) {
      *cp++ = INVISIBLE;
    }
  }
  /*
   * - step 6: clean up miscellaneous punctuation, ala:
   *==>  perl -pe 's,[-_/]+ , ,g;s/print[_a-zA-Z]* //g;s/  / /g;'
   */
  for (cp = buf; idxGrep(_UTIL_MISCPUNCT, cp, REG_EXTENDED); /*nada*/) {
    x = cp + cur.regm.rm_so;
    cp += cur.regm.rm_eo - 1; /* leave ' ' alone */
    while (x < cp) {
      *x++ = ' ';
    }
    cp++;
  }
  for (cp = buf; idxGrep(_UTIL_LATEX, cp, REG_ICASE); /*nada*/) {
    x = cp + cur.regm.rm_so;
    cp += cur.regm.rm_eo;
    /* also blank the character behind the match, but keep the terminator */
    while ((x <= cp) && *x) {
      *x++ = ' ';
    }
    if (*cp == '\0') {
      /* match ended at end of buffer: nothing left to scan */
      break;
    }
    cp++;
  }
  /*
   * Ignore function calls to print routines: only concentrate on what's being
   * printed (sometimes programs do print licensing information) -- but don't
   * ignore real words that END in 'print', like footprint and fingerprint.
   * Here, we take a risk and just look for a 't' (in "footprint"), or for an
   * 'r' (in "fingerprint").  If someone has ever coded a print routine that
   * is named 'rprint' or 'tprint', we're spoofed.
   */
  for (cp = buf; idxGrep(_UTIL_PRINT, cp, REG_ICASE); /*nada*/) {
    x = cp + cur.regm.rm_so;
    cp += (cur.regm.rm_eo - 1);
    if ((x > buf) && ((*(x-1) == 'r') || (*(x-1) == 't'))) {
      continue;
    }
    while (x < cp) {
      *x++ = ' ';
    }
    cp++;
  }
  /*
   * Convert the regex ' [X ]+' (where X is really the character #defined as
   * INVISIBLE) to a single space (and a string of INVISIBLE characters).
   */
  for (cp = buf; *cp; /*nada*/) {
    if (*cp++ == ' ') {
      while (*cp) {
        if (*cp == ' ') {
          *cp++ = INVISIBLE;
        } else if (*cp == INVISIBLE) {
          cp++;
        } else {
          break;
        }
      }
    }
  }
  /*
   * garbage collect: eliminate all INVISIBLE characters in the buffer
   */
  x = cp = buf;
  n = 0;
  while (/*cp < end &&*/ *cp) {
    while (/*cp < end &&*/ *cp == INVISIBLE) {
      n++;
      cp++;
    }
    if (*cp) {
      *x++ = *cp++;
    }
  }
  *x = NULL_CHAR;
#ifdef  DOCTOR_DEBUG
  printf("***** Now buffer %p contains %d bytes (%d clipped)\n", buf,
      (int)strlen(buf), n);
  printf("+++++ [Dr-AFTER] +++++:\n%s\n[==END==]\n", buf);
#endif  /* DOCTOR_DEBUG */
  return;
}
#endif
1103 
1104 
1105 
void convertSpaceToInvisible(char *buf)
void removeBackslashesAndGTroffIndicators(char *buf)
Remove groff/troff font-size indicators, the literal string backslash-n and all backslashes.
void doctorBuffer(char *buf, int isML, int isPS, int isCR)
Convert a buffer of multiple stuff to text-only, separated by spaces.
GArray * collapseInvisible(char *text, char invisible)
Definition: nomos_gap.c:30
int compressDoctoredBuffer(char *textBuffer)
garbage collect: eliminate all INVISIBLE characters in the buffer
#define NULL_CHAR
NULL character.
Definition: nomos.h:247
void ignoreFunctionCalls(char *buf)
Ignore function calls to print routines.
void dehyphen(char *buf)
void convertWhitespaceToSpaceAndRemoveSpecialChars(char *buf, int isCR)
Convert white-space to real spaces, and remove unnecessary punctuation.
int idxGrep(int index, char *data, int flags)
compile a regex, and perform the search (on data?)
Definition: nomos_regex.c:216
#define isEOL(x)
Check if x points to a EOL character.
Definition: nomos.h:253
void removePunctuation(char *buf)
Clean up miscellaneous punctuation.
void removeLineComments(char *buf)
Remove comments that start at the beginning of a line.
void cleanUpPostscript(char *buf)
Remove newlines from buffer.
void removeHtmlComments(char *buf)
Remove HTML comments from buffer without removing comment text.
Nomos header file.