19 #include "doctorBuffer_utils.h" 20 #define INVISIBLE (int) '\377' 25 #include "nomos_regex.h" 38 int previous=strlen(textBuffer);
40 if(cur.docBufferPositionsAndOffsets)
41 g_array_free(cur.docBufferPositionsAndOffsets, TRUE);
45 int after = strlen(textBuffer);
47 return previous - after;
61 for (cp = buf; cp && *cp; cp++)
63 if ((*cp ==
'<') && (*(cp + 1) !=
'<') && (*(cp + 1) !=
' '))
65 #if (DEBUG>5) && defined(DOCTOR_DEBUG) 66 int x = strncasecmp(cp,
"<string", 7);
67 printf(
"CHECK: %c%c%c%c%c%c%c == %d\n", *cp,
68 *(cp+1), *(cp+2), *(cp+3), *(cp+4),
71 if (strncasecmp(cp,
"<string", 7))
74 if (*(cp + 1) !=
'-' || *(cp + 2) !=
'-')
82 #if (DEBUG>5) && defined(DOCTOR_DEBUG) 83 int x = strncasecmp(cp,
"©", 6);
84 printf(
"CHECK: %c%c%c%c%c%c == %d\n", *cp,
85 *(cp+1), *(cp+2), *(cp+3), *(cp+4),
88 if (strncasecmp(cp,
"©", 6))
94 else if (f && (*cp ==
'>'))
99 else if (g && (*cp ==
';'))
109 else if ((*cp ==
'!') && f && (cp != buf) && (*(cp - 1) ==
' '))
118 else if ((*cp ==
'<') || (*cp ==
'>'))
134 char* MODULE_LICENSE =
"MODULE_LICENSE";
136 while (
idxGrep(_UTIL_BOL_MAGIC, cp, REG_ICASE | REG_NEWLINE | REG_EXTENDED))
139 dumpMatch(cp,
"Found \"comment\"-text");
141 cp += cur.regm.rm_so;
149 if (strncasecmp(cp,
"author", 6) == 0)
151 (void) memset(cp,
' ', 6);
154 else if (strncasecmp(cp,
"comment", 7) == 0)
156 (void) memset(cp,
' ', 7);
159 else if (strncasecmp(cp,
"center", 6) == 0)
161 (void) memset(cp,
' ', 6);
164 else if (strncasecmp(cp,
"rem", 3) == 0)
166 (void) memset(cp,
' ', 3);
172 if (strncasecmp(cp,
" essay", 6) == 0)
174 (void) memset(cp,
' ', 6);
183 if (strstr(cp, MODULE_LICENSE) &&
'/' == cp[0])
185 (void) memset(cp, INVISIBLE, strlen(cp));
190 (void) memset(cp, INVISIBLE, 2);
196 if (strncasecmp(cp + 1,
"par ", 3) == 0)
198 (void) memset(cp,
' ', 4);
206 (void) memset(cp, INVISIBLE, 3);
211 (void) memset(cp, INVISIBLE, 5);
216 (void) memset(cp, INVISIBLE, 7);
220 (void) memset(cp, INVISIBLE, 12);
236 while (
idxGrep(_UTIL_POSTSCR, cp, REG_EXTENDED | REG_NEWLINE))
239 dumpMatch(cp,
"FOUND postscript-thingy");
241 x = cp + cur.regm.rm_so;
242 cp += cur.regm.rm_eo;
261 for (cp = buf; *cp; cp++)
266 if (*x && (*x ==
's'))
269 if (*x && ((*x ==
'+') || (*x ==
'-')))
273 while (*x && isdigit(*x))
278 else if (*x && *x ==
'n')
282 memset(cp,
' ', (
size_t) (x - cp));
299 for (cp = buf; *cp; cp++)
301 if ((*cp ==
'\302') && (*(cp + 1) ==
'\251'))
306 if (*cp & (
char) 0x80)
342 if (*(cp + 1) == 0 || *(cp + 1) ==
' ' || *(cp + 1) ==
'\t' || *(cp + 1) ==
'\n' || *(cp + 1) ==
'\r')
344 else if (cp > buf + 1 && (*(cp - 1) ==
'M' || *(cp - 1) ==
'm') && *(cp - 2) ==
' ' && *(cp + 1) ==
' ')
354 if ((*(cp + 1) ==
'C' || *(cp + 1) ==
'c') && *(cp + 2) ==
')')
380 if (strncasecmp(cp,
"<string", 7) == 0)
382 (void) strncpy(cp,
" ", 7);
414 case ' ':
case '/':
case '-':
case '@':
case '&':
415 case '>':
case '^':
case '_':
419 if (!isalpha(*cp) && !isdigit(*cp))
421 printf(
"DEBUG: \\0%o @ %ld\n",
441 for (cp = buf;
idxGrep(_UTIL_HYPHEN, cp, REG_ICASE); )
445 x = cp + cur.regm.rm_so;
446 while ((x > cp) && !isspace(*x))
450 printf(
"Hey! hyphenated-word [");
451 for (++x; x <= (cp + cur.regm.rm_eo); x++)
462 cp += cur.regm.rm_so + 1;
482 for (cp = buf;
idxGrep(_UTIL_MISCPUNCT, cp, REG_EXTENDED); )
484 x = cp + cur.regm.rm_so;
485 cp += cur.regm.rm_eo - 1;
492 for (cp = buf;
idxGrep(_UTIL_LATEX, cp, REG_ICASE); )
494 x = cp + cur.regm.rm_so;
495 cp += cur.regm.rm_eo;
520 for (cp = buf;
idxGrep(_UTIL_PRINT, cp, REG_ICASE); )
522 x = cp + cur.regm.rm_so;
523 cp += (cur.regm.rm_eo - 1);
524 if ((x > buf) && ((*(x - 1) ==
'r') || (*(x - 1) ==
't')))
544 for (cp = buf; *cp; )
554 else if (*cp == INVISIBLE)
593 #if defined(PROC_TRACE) || defined(DOCTOR_DEBUG) 594 traceFunc(
"== doctorBuffer(%p, %d, %d, %d)\n", buf, isML, isPS, isCR);
603 printf(
"***** Processing %p (%d data bytes)\n", buf, (
int)strlen(buf));
604 printf(
"----- [Dr-BEFORE:] -----\n%s\n[==END==]\n", buf);
616 printf(
"DEBUG: markup-languange directives found!\n");
633 printf(
"DEBUG: postscript stuff detected!\n");
689 printf(
"***** Now buffer %p contains %d bytes (%d clipped)\n", buf,
690 (
int)strlen(buf), n);
691 printf(
"+++++ [Dr-AFTER] +++++:\n%s\n[==END==]\n", buf);
696 #ifdef DOCTORBUFFER_OLD 697 void doctorBuffer_old(
char *buf,
int isML,
int isPS,
int isCR)
699 printf(
"Doctor Buffer old \n");
705 char *MODULE_LICENSE =
"MODULE_LICENSE";
707 #if defined(PROC_TRACE) || defined(DOCTOR_DEBUG) 708 traceFunc(
"== doctorBuffer(%p, %d, %d, %d)\n", buf, isML, isPS, isCR);
717 printf(
"***** Processing %p (%d data bytes)\n", buf, (
int)strlen(buf));
718 printf(
"----- [Dr-BEFORE:] -----\n%s\n[==END==]\n", buf);
729 printf(
"DEBUG: markup-languange directives found!\n");
733 for (cp = buf; cp && *cp; cp++) {
737 #if (DEBUG>5) && defined(DOCTOR_DEBUG) 738 int x = strncasecmp(cp,
"<string", 7);
739 printf(
"CHECK: %c%c%c%c%c%c%c == %d\n", *cp,
740 *(cp+1), *(cp+2), *(cp+3), *(cp+4),
741 *(cp+5), *(cp+6), x);
743 if (strncasecmp(cp,
"<string", 7)) {
745 if (*(cp+1) !=
'-' || *(cp+2) !=
'-') {
749 }
else if (*cp ==
'&') {
750 #if (DEBUG>5) && defined(DOCTOR_DEBUG) 751 int x = strncasecmp(cp,
"©", 6);
752 printf(
"CHECK: %c%c%c%c%c%c == %d\n", *cp,
753 *(cp+1), *(cp+2), *(cp+3), *(cp+4),
756 if (strncasecmp(cp,
"©", 6)) {
760 }
else if (f && (*cp ==
'>')) {
763 }
else if (g && (*cp ==
';')) {
766 }
else if (
isEOL(*cp)) {
770 else if ((*cp ==
'!') &&
778 }
else if ((*cp ==
'<') || (*cp ==
'>')) {
788 while (
idxGrep(_UTIL_BOL_MAGIC, cp, REG_ICASE|REG_NEWLINE|REG_EXTENDED)) {
790 dumpMatch(cp,
"Found \"comment\"-text");
792 cp += cur.regm.rm_so;
799 if (strncasecmp(cp,
"author", 6) == 0) {
800 (void) memset(cp,
' ', 6);
802 }
else if (strncasecmp(cp,
"comment", 7) == 0) {
803 (void) memset(cp,
' ', 7);
805 }
else if (strncasecmp(cp,
"center", 6) == 0) {
806 (void) memset(cp,
' ', 6);
809 else if (strncasecmp(cp,
"rem", 3) == 0) {
810 (void) memset(cp,
' ', 3);
812 }
else if (*cp ==
'c') {
814 if (strncasecmp(cp,
" essay", 6) == 0) {
815 (void) memset(cp,
' ', 6);
824 if (strstr(cp, MODULE_LICENSE) &&
'/' == cp[0])
826 (void) memset(cp, INVISIBLE, strlen(cp));
830 (void) memset(cp, INVISIBLE, 2);
836 if (strncasecmp(cp+1,
"par ", 3) == 0) {
837 (void) memset(cp,
' ', 4);
845 (void) memset(cp, INVISIBLE, 3);
850 (void) memset(cp, INVISIBLE, 5);
855 (void) memset(cp, INVISIBLE, 7);
859 (void) memset(cp, INVISIBLE, 12);
869 printf(
"DEBUG: postscript stuff detected!\n");
872 while (
idxGrep(_UTIL_POSTSCR, cp, REG_EXTENDED|REG_NEWLINE)) {
874 dumpMatch(cp,
"FOUND postscript-thingy");
876 x = cp + cur.regm.rm_so;
877 cp += cur.regm.rm_eo;
888 for (cp = buf; *cp; cp++) {
891 if (*x && (*x ==
's')) {
893 if (*x && ((*x ==
'+') || (*x ==
'-'))) {
896 while (*x && isdigit(*x)) {
899 }
else if (*x && *x ==
'n') {
902 memset(cp,
' ', (
size_t) (x-cp));
913 for (cp = buf; *cp; cp++) {
914 if ((*cp ==
'\302') && (*(cp+1) ==
'\251')) {
918 if (*cp & (
char) 0x80) {
927 case '\a':
case '\t':
case '\n':
case '\r':
928 case '\v':
case '\f':
case '[':
case ']':
929 case '{':
case '}':
case '*':
case '=':
930 case '#':
case '$':
case '|':
case '%':
case '!':
931 case '?':
case '`':
case '"':
case '\'':
936 if (*(cp+1) == 0 || *(cp+1) ==
' ' || *(cp+1) ==
'\t' || *(cp+1) ==
'\n' || *(cp+1) ==
'\r')
break;
937 else if (cp > buf+1 && (*(cp-1) ==
'M' ||
938 *(cp-1) ==
'm') && *(cp-2) ==
' ' &&
947 if ((*(cp+1) ==
'C' || *(cp+1) ==
'c') &&
956 case ')':
case ',':
case ':':
case ';':
967 if (strncasecmp(cp,
"<string", 7) == 0) {
968 (void) strncpy(cp,
" ", 7);
972 case '\001':
case '\002':
case '\003':
case '\004':
973 case '\005':
case '\006':
case '\016':
case '\017':
974 case '\020':
case '\021':
case '\022':
case '\023':
975 case '\024':
case '\025':
case '\026':
case '\027':
976 case '\030':
case '\031':
case '\032':
case '\033':
977 case '\034':
case '\035':
case '\036':
case '\037':
982 case ' ':
case '/':
case '-':
case '@':
case '&':
983 case '>':
case '^':
case '_':
987 if (!isalpha(*cp) && !isdigit(*cp)) {
988 printf(
"DEBUG: \\0%o @ %ld\n",
1002 for (cp = buf;
idxGrep(_UTIL_HYPHEN, cp, REG_ICASE); ) {
1004 x = cp + cur.regm.rm_so;
1005 while ((x > cp) && !isspace(*x)) {
1008 printf(
"Hey! hyphenated-word [");
1009 for (++x; x <= (cp + cur.regm.rm_eo); x++) {
1012 while (!isspace(*x)) {
1018 cp += cur.regm.rm_so + 1;
1020 while (isspace(*cp)) {
1028 for (cp = buf;
idxGrep(_UTIL_MISCPUNCT, cp, REG_EXTENDED); ) {
1029 x = cp + cur.regm.rm_so;
1030 cp += cur.regm.rm_eo - 1;
1036 for (cp = buf;
idxGrep(_UTIL_LATEX, cp, REG_ICASE); ) {
1037 x = cp + cur.regm.rm_so;
1038 cp += cur.regm.rm_eo;
1052 for (cp = buf;
idxGrep(_UTIL_PRINT, cp, REG_ICASE); ) {
1053 x = cp + cur.regm.rm_so;
1054 cp += (cur.regm.rm_eo - 1);
1055 if ((x > buf) && ((*(x-1) ==
'r') || (*(x-1) ==
't'))) {
1067 for (cp = buf; *cp; ) {
1072 }
else if (*cp == INVISIBLE) {
1086 while ( *cp == INVISIBLE) {
1096 printf(
"***** Now buffer %p contains %d bytes (%d clipped)\n", buf,
1097 (
int)strlen(buf), n);
1098 printf(
"+++++ [Dr-AFTER] +++++:\n%s\n[==END==]\n", buf);
void convertSpaceToInvisible(char *buf)
void removeBackslashesAndGTroffIndicators(char *buf)
Remove groff/troff font-size indicators, the literal string backslash-n, and all backslashes.
void doctorBuffer(char *buf, int isML, int isPS, int isCR)
Convert a buffer of multiple stuff to text-only, separated by spaces.
GArray * collapseInvisible(char *text, char invisible)
int compressDoctoredBuffer(char *textBuffer)
Garbage-collect the buffer by eliminating all INVISIBLE characters in it.
#define NULL_CHAR
NULL character.
void ignoreFunctionCalls(char *buf)
Ignore function calls to print routines.
void convertWhitespaceToSpaceAndRemoveSpecialChars(char *buf, int isCR)
Convert white-space to real spaces, and remove unnecessary punctuation.
int idxGrep(int index, char *data, int flags)
Compile a regex and perform the search on data.
#define isEOL(x)
Check if x points to a EOL character.
void removePunctuation(char *buf)
Clean up miscellaneous punctuation.
void removeLineComments(char *buf)
Remove comments that start at the beginning of a line.
void cleanUpPostscript(char *buf)
Remove newlines from buffer.
void removeHtmlComments(char *buf)
Remove HTML comments from buffer without removing comment text.