/* A pseudo newsfeed for innd, designed to detect and remove multiple postings of similar articles. Copyright (C) 1997, 1998 Frank D. Cringle. spam-killer is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ /* To activate the spam-killer, you need to put an entry like this: spam!:!spam,@junk,@control*:B4096/1024,Tx,Wn:/usr/local/bin/inn/spam-killer -b spam! into your newsfeeds file. If more than the repeat limit (default 10) similar postings are detected, local cancels are issued for these articles and for all subseqent similar articles */ #define NDEBUG #include #include #include #include #include #include #include #include #include #include #include #include #include #include "paths.h" #include "configdata.h" #include "logging.h" #define PROGNAME "spam-killer" /* =()<#define PATH_INEWS "@<_PATH_INEWS>@">()= */ #define PATH_INEWS "/usr/local/bin/inn/inews" #define STR_AND_LEN(x,y) { x, sizeof(x)-1, y } #define H_IGN 1 #define H_ESC 2 #define H_DROP 4 #define H_MID 8 static struct {const char *val; int len; int flag; } h_ignore[] = { /* headers that are to be ignored when comparing and escaped when reposting */ STR_AND_LEN("date",H_IGN|H_ESC), STR_AND_LEN("followup-to",H_IGN), STR_AND_LEN("message-id",H_IGN|H_ESC|H_MID), STR_AND_LEN("newsgroups",H_IGN|H_ESC), STR_AND_LEN("nntp-posting-host",H_ESC), STR_AND_LEN("path",H_IGN|H_ESC), STR_AND_LEN("xref",H_IGN|H_DROP), STR_AND_LEN("x-trace",H_IGN|H_DROP), STR_AND_LEN("nntp-posting-date",H_IGN|H_ESC) }; static void process(FILE *f); static void check(char *fname); static void *xmalloc(size_t size); static void save_checkpoint(void); static void load_checkpoint(int load); long hist_size = 8192; long next_free; long msgnum; int limit = 10; int do_repost = 1; int testmode = 0; FILE *expover; #define CP_FILE _PATH_NEWSLIB "/spam-killer.cp" #define CP_MAGIC "CP\r\n" #define CP_MAGIC_SIZE ((sizeof CP_MAGIC) - 1) #define OVERSPAM "out.going/over.spam" static volatile int sig; static void intr(int s) { sig = s; } static void die(int s) { if (testmode) fprintf(stderr, "caught signal %d\n", s); else syslog(L_ERROR, "caught signal %d (%m)", s); if (expover) fclose(expover); while (1) /* we are sick - innd should not restart us */ sleep(10000); } int main(int argc, char **argv) { int c; int do_load_cp = 1; int run_expover = 1; char *backlog = ""; FILE *bf; while ((c = getopt(argc, argv, "b:chl:opt")) != EOF) switch (c) { case 'b': backlog = optarg; break; case 'c': do_load_cp = 0; break; case 'h': hist_size = atoi(optarg); break; case 'l': limit = atoi(optarg); break; case 'o': run_expover = 0; break; case 'p': do_repost = 0; break; case 't': testmode = 1; break; case '?': fputs("Usage: " PROGNAME " {-b -c -h -l -p -t}\n" "\t-b\tname of backlog file (or empty string to disable)\n" "\t-c\tdo not load checkpoint\n" "\t-h nn\tnumber of article signatures to remember\n" "\t-l\tspam threshold (default 10)\n" "\t-o\tdo not batch articles for expireover\n" "\t-p\tdo not repost article to spam group\n" "\t-t\ttest mode\n", stderr); exit(1); } if (!testmode) openlog(PROGNAME, 0, LOG_INN_PROG); signal(SIGILL, die); signal(SIGIOT, die); signal(SIGABRT, die); signal(SIGFPE, die); signal(SIGBUS, die); signal(SIGSEGV, die); load_checkpoint(do_load_cp); signal(SIGINT, intr); signal(SIGPIPE, SIG_IGN); if (chdir(_PATH_SPOOL) < 0) { if (testmode) fputs("cant cd to " _PATH_SPOOL "\n", stderr); else syslog(L_ERROR, "cant cd to " _PATH_SPOOL " (%m)"); exit(1); } if (run_expover) { expover = fopen(OVERSPAM, "w"); if (expover == NULL) { if (testmode) fputs("cant create " OVERSPAM "\n", stderr); else syslog(L_ERROR, "cant create " OVERSPAM " (%m)"); } } if ((bf = fopen(backlog, "r")) != NULL) { unlink(backlog); process(bf); fclose(bf); } process(stdin); if (expover) fclose(expover); save_checkpoint(); exit(sig); } static void process(FILE *in) { char fname[BUFSIZ]; while (fgets(fname, sizeof(fname), in)) { char *p; if (sig) break; if (strncmp(fname, "control", 7) == 0) continue; if (*fname == '!') { if (strncmp(fname, "!flush", 6) == 0 && expover) { fclose(expover); expover = fopen(OVERSPAM, "w"); if (expover == NULL) { if (testmode) fputs("cant recreate " OVERSPAM "\n", stderr); else syslog(L_ERROR, "cant recreate " OVERSPAM " (%m)"); } } continue; } p = fname + strlen(fname); if (p == fname || *--p != '\n') continue; /* ??? */ *p = 0; check(fname); if (sig) break; } } typedef unsigned short crc_t; static crc_t crctab[256] = { 0x0000, 0x1021, 0x2042, 0x3063, 0x4084, 0x50a5, 0x60c6, 0x70e7, 0x8108, 0x9129, 0xa14a, 0xb16b, 0xc18c, 0xd1ad, 0xe1ce, 0xf1ef, 0x1231, 0x0210, 0x3273, 0x2252, 0x52b5, 0x4294, 0x72f7, 0x62d6, 0x9339, 0x8318, 0xb37b, 0xa35a, 0xd3bd, 0xc39c, 0xf3ff, 0xe3de, 0x2462, 0x3443, 0x0420, 0x1401, 0x64e6, 0x74c7, 0x44a4, 0x5485, 0xa56a, 0xb54b, 0x8528, 0x9509, 0xe5ee, 0xf5cf, 0xc5ac, 0xd58d, 0x3653, 0x2672, 0x1611, 0x0630, 0x76d7, 0x66f6, 0x5695, 0x46b4, 0xb75b, 0xa77a, 0x9719, 0x8738, 0xf7df, 0xe7fe, 0xd79d, 0xc7bc, 0x48c4, 0x58e5, 0x6886, 0x78a7, 0x0840, 0x1861, 0x2802, 0x3823, 0xc9cc, 0xd9ed, 0xe98e, 0xf9af, 0x8948, 0x9969, 0xa90a, 0xb92b, 0x5af5, 0x4ad4, 0x7ab7, 0x6a96, 0x1a71, 0x0a50, 0x3a33, 0x2a12, 0xdbfd, 0xcbdc, 0xfbbf, 0xeb9e, 0x9b79, 0x8b58, 0xbb3b, 0xab1a, 0x6ca6, 0x7c87, 0x4ce4, 0x5cc5, 0x2c22, 0x3c03, 0x0c60, 0x1c41, 0xedae, 0xfd8f, 0xcdec, 0xddcd, 0xad2a, 0xbd0b, 0x8d68, 0x9d49, 0x7e97, 0x6eb6, 0x5ed5, 0x4ef4, 0x3e13, 0x2e32, 0x1e51, 0x0e70, 0xff9f, 0xefbe, 0xdfdd, 0xcffc, 0xbf1b, 0xaf3a, 0x9f59, 0x8f78, 0x9188, 0x81a9, 0xb1ca, 0xa1eb, 0xd10c, 0xc12d, 0xf14e, 0xe16f, 0x1080, 0x00a1, 0x30c2, 0x20e3, 0x5004, 0x4025, 0x7046, 0x6067, 0x83b9, 0x9398, 0xa3fb, 0xb3da, 0xc33d, 0xd31c, 0xe37f, 0xf35e, 0x02b1, 0x1290, 0x22f3, 0x32d2, 0x4235, 0x5214, 0x6277, 0x7256, 0xb5ea, 0xa5cb, 0x95a8, 0x8589, 0xf56e, 0xe54f, 0xd52c, 0xc50d, 0x34e2, 0x24c3, 0x14a0, 0x0481, 0x7466, 0x6447, 0x5424, 0x4405, 0xa7db, 0xb7fa, 0x8799, 0x97b8, 0xe75f, 0xf77e, 0xc71d, 0xd73c, 0x26d3, 0x36f2, 0x0691, 0x16b0, 0x6657, 0x7676, 0x4615, 0x5634, 0xd94c, 0xc96d, 0xf90e, 0xe92f, 0x99c8, 0x89e9, 0xb98a, 0xa9ab, 0x5844, 0x4865, 0x7806, 0x6827, 0x18c0, 0x08e1, 0x3882, 0x28a3, 0xcb7d, 0xdb5c, 0xeb3f, 0xfb1e, 0x8bf9, 0x9bd8, 0xabbb, 0xbb9a, 0x4a75, 0x5a54, 0x6a37, 0x7a16, 0x0af1, 0x1ad0, 0x2ab3, 0x3a92, 0xfd2e, 0xed0f, 0xdd6c, 0xcd4d, 0xbdaa, 0xad8b, 0x9de8, 0x8dc9, 0x7c26, 0x6c07, 0x5c64, 0x4c45, 0x3ca2, 0x2c83, 0x1ce0, 0x0cc1, 0xef1f, 0xff3e, 0xcf5d, 0xdf7c, 0xaf9b, 0xbfba, 0x8fd9, 0x9ff8, 0x6e17, 0x7e36, 0x4e55, 0x5e74, 0x2e93, 0x3eb2, 0x0ed1, 0x1ef0 }; #define N_CRC 16 #if (N_CRC&1 || N_CRC < 4) #error N_CRC must be an even number >= 4 #endif struct sig { crc_t s[N_CRC]; }; #ifdef FREQUENCY_CHECK int ftab[N_CRC][0x10000]; #endif #define NOT_FOUND USHRT_MAX typedef unsigned short idx_t; struct hindex { idx_t parent, left, right; short height; }; #define PARENT(tx,n) (history[n].tree[tx].parent) #define LEFT(tx,n) (history[n].tree[tx].left) #define RIGHT(tx,n) (history[n].tree[tx].right) #define HEIGHT(tx,n) (history[n].tree[tx].height) #define HT_VAL(tx,n) (n == NOT_FOUND ? -1 : (int)HEIGHT(tx,n)) struct chain { struct chain *next; char *name; }; struct chain *freechain; idx_t root[4] = { NOT_FOUND, NOT_FOUND, NOT_FOUND, NOT_FOUND }; idx_t basemark; struct history { struct hindex tree[4]; unsigned short hits; idx_t mark; unsigned long num; struct sig signature; struct chain *info; } *history; inline static char * crc(crc_t *val, char *p, char *end) { while (p < end) { *val = crctab[((*val >> 8) & 255)] ^ (*val << 8) ^ *p; if (*p++ == '\n') break; } return p; } #define BALANCE(tx,n) (HT_VAL(tx,RIGHT(tx,n)) - HT_VAL(tx,LEFT(tx,n))) #define SETHEIGHT(tx,n) { \ int hl = HT_VAL(tx,LEFT(tx,n)); \ int hr = HT_VAL(tx,RIGHT(tx,n)); \ HEIGHT(tx,n) = hl < hr ? hr + 1 : hl + 1; \ } inline static void reparent(int tx, int child, int exchild) { int p; if (child == exchild) return; p = PARENT(tx, exchild); if (p == NOT_FOUND) root[tx] = child; else if (LEFT(tx,p) == exchild) LEFT(tx,p) = child; else { assert(RIGHT(tx,p) == exchild); RIGHT(tx,p) = child; } if (child != NOT_FOUND) PARENT(tx,child) = p; } static int rotate_right(int tx, int n) { int nr, t; int left = LEFT(tx,n); if (BALANCE(tx,left) <= 0) { nr = left; reparent(tx,nr,n); PARENT(tx,n) = nr; t = RIGHT(tx,nr); LEFT(tx,n) = t; if (t != NOT_FOUND) PARENT(tx,t) = n; RIGHT(tx,nr) = n; } else { nr = RIGHT(tx,left); reparent(tx,nr,n); PARENT(tx,n) = PARENT(tx,left) = nr; t = RIGHT(tx,nr); LEFT(tx,n) = t; if (t != NOT_FOUND) PARENT(tx,t) = n; t = LEFT(tx,nr); RIGHT(tx,left) = t; if (t != NOT_FOUND) PARENT(tx,t) = left; LEFT(tx,nr) = left; RIGHT(tx,nr) = n; SETHEIGHT(tx,left); } SETHEIGHT(tx,n); SETHEIGHT(tx,nr); return nr; } static int rotate_left(int tx, int n) { int nr, t; int right = RIGHT(tx,n); if (BALANCE(tx,right) >= 0) { nr = right; reparent(tx,nr,n); PARENT(tx,n) = nr; t = LEFT(tx,nr); RIGHT(tx,n) = t; if (t != NOT_FOUND) PARENT(tx,t) = n; LEFT(tx,nr) = n; } else { nr = LEFT(tx,right); reparent(tx,nr,n); PARENT(tx,n) = PARENT(tx,right) = nr; t = LEFT(tx,nr); RIGHT(tx,n) = t; if (t != NOT_FOUND) PARENT(tx,t) = n; t = RIGHT(tx,nr); LEFT(tx,right) = t; if (t != NOT_FOUND) PARENT(tx,t) = right; RIGHT(tx,nr) = right; LEFT(tx,nr) = n; SETHEIGHT(tx,right); } SETHEIGHT(tx, n); SETHEIGHT(tx, nr); return nr; } static void rebalance(int tx, int n) { while (1) { int hl = HT_VAL(tx, LEFT(tx, n)); int hr = HT_VAL(tx, RIGHT(tx, n)); if (hr - hl < -1) n = rotate_right(tx, n); else if (hr - hl > 1) n = rotate_left(tx, n); else { int h = hl > hr ? hl + 1 : hr + 1; if (h == HT_VAL(tx,n)) return; HEIGHT(tx,n) = h; } if (PARENT(tx,n) == NOT_FOUND) { root[tx] = n; return; } n = PARENT(tx,n); } } static int cmp0(int a, int b) { struct sig *sa = &history[a].signature; struct sig *sb = &history[b].signature; return memcmp(sa->s, sb->s, sizeof(*sa)); } static int cmp1(int a, int b) { struct sig *sa = &history[a].signature; struct sig *sb = &history[b].signature; int d = memcmp(sa->s+N_CRC/2, sb->s+N_CRC/2, sizeof(*sa)/2); return d ? d : memcmp(sa->s, sb->s, sizeof(*sa->s)/2); } static int cmp2(int a, int b) { struct sig *sa = &history[a].signature; struct sig *sb = &history[b].signature; int i; for (i = N_CRC-1; i; i--) { int d = sa->s[i] - sb->s[i]; if (d) return d; } return sa->s[0] - sb->s[0]; } static int cmp3(int a, int b) { double d = (double) history[a].hits/(msgnum-history[a].num) - (double) history[b].hits/(msgnum-history[b].num); return d > 0.0 ? 1 : -1; } int (*cmp[4])(int a, int b) = { cmp0, cmp1, cmp2, cmp3 }; static void insert(int n, int tx) { int r = root[tx]; int d = 0; int p = 0; int (*compare)(int, int) = cmp[tx]; PARENT(tx,n) = LEFT(tx,n) = RIGHT(tx,n) = NOT_FOUND; HEIGHT(tx,n) = 0; if (r == NOT_FOUND) { root[tx] = n; return; } while (r != NOT_FOUND) { assert(r < next_free); d = compare(n, r); assert(d != 0); p = r; r = (d < 0) ? LEFT(tx,r) : RIGHT(tx,r); } if (d < 0) LEFT(tx,p) = n; else RIGHT(tx,p) = n; LEFT(tx,n) = RIGHT(tx,n) = NOT_FOUND; PARENT(tx,n) = p; HEIGHT(tx,n) = 0; HEIGHT(tx,p) = -2; rebalance(tx, p); } static void del(int tx, int n, int r) { int t; if (RIGHT(tx,n) != NOT_FOUND) { del(tx,RIGHT(tx,n),r); } else { int p = PARENT(tx,n); if (RIGHT(tx,p) == n) { RIGHT(tx,p) = NOT_FOUND; reparent(tx,n,r); t = LEFT(tx,n); RIGHT(tx,p) = t; if (t != NOT_FOUND) PARENT(tx,t) = p; LEFT(tx,n) = LEFT(tx,r); RIGHT(tx,n) = RIGHT(tx,r); PARENT(tx,LEFT(tx,r)) = PARENT(tx,RIGHT(tx,r)) = n; HEIGHT(tx,n) = HEIGHT(tx,r); HEIGHT(tx,p) = -2; rebalance(tx,p); } else { assert(LEFT(tx,p) == n && p == r); reparent(tx,n,r); RIGHT(tx,n) = RIGHT(tx,r); PARENT(tx,RIGHT(tx,r)) = n; HEIGHT(tx,n) = -2; rebalance(tx,n); } } } static void delete(int tx, int n) { int p; p = PARENT(tx,n); if (HEIGHT(tx,n) == 0) { reparent(tx,NOT_FOUND,n); rebalance(tx,p); return; } else if (RIGHT(tx,n) == NOT_FOUND) { reparent(tx,LEFT(tx,n),n); rebalance(tx,p); } else if (LEFT(tx,n) == NOT_FOUND) { reparent(tx,RIGHT(tx,n),n); rebalance(tx,p); } else del(tx,LEFT(tx,n),n); } static void filter(int in, int out) { int head = 1; char line[BUFSIZ]; FILE *fin, *fout; if ((fin = fdopen(in, "r")) == NULL || (fout = fdopen(out, "w")) == NULL) return; while (fgets(line, sizeof(line), fin)) { if (head) { int i; for (i = 0; i < sizeof(h_ignore) / sizeof(h_ignore[0]); i++) if ((h_ignore[i].flag&(H_ESC|H_MID|H_DROP)) && strncasecmp(line, h_ignore[i].val, h_ignore[i].len) == 0) break; if (i < sizeof(h_ignore) / sizeof(h_ignore[0])) { if (h_ignore[i].flag & H_DROP) continue; if (h_ignore[i].flag & H_MID) { char *p = strchr(line, '<'); if (p) { fwrite(line, 1, p-line+1, fout); fputs("cancel.", fout); fputs(p+1, fout); } } fputs("X-Original-", fout); line[511-sizeof("X-Original-")] = 0; } head &= line[0] != '\n'; } fputs(line, fout); } fclose(fin); fclose(fout); } static void repost(char *name) { int p[2]; int fd; char * const args[] = { "inews", "-h", "-O", "-S", "-R", "-nspam", NULL }; /* skip cancels */ if (strncmp("control", name, 7) == 0) return; /* we ignore errors here */ if ((fd = open(name, O_RDONLY)) < 0) return; switch(FORK()) { case -1: return; case 0: break; default: wait(NULL); /* parent just waits for 1st child */ close(fd); return; } switch(FORK()) { case -1: _exit(1); case 0: break; default: /* 1st child just starts 2nd child and exits quickly */ _exit(0); } if (pipe(p) == -1) _exit(1); switch(FORK()) { case -1: _exit(1); case 0: break; default: close(p[0]); filter(fd,p[1]); /* 2nd child writes filtered article to 3rd child */ _exit(0); } /* 3rd child starts inews */ close(0); close(fd); dup2(p[0],0); close(p[0]); close(p[1]); execv(PATH_INEWS, args); _exit(1); } static void cancel(int n) { struct chain *c = history[n].info; struct chain *c1; if (testmode) { int i; printf("cancel: %4d (%d) ", n, history[n].hits); for (i = 0; i < N_CRC; i++) printf("%04X ", history[n].signature.s[i]); putchar('\n'); } if (!c) return; while (c) { if (testmode) { fputs(c->name, stdout); putchar(' '); } else if (unlink(c->name) == 0) syslog(L_TRACE, "removed spam %s", c->name); else syslog(L_NOTICE, "can't unlink %s (%m)", c->name); if (expover) { fputs(c->name, expover); putc('\n', expover); } free(c->name); c1 = c->next; c->next = freechain; freechain = c; c = c1; } if (testmode) putchar('\n'); else if (expover) fflush(expover); history[n].info = NULL; } static int next_left(int ix, int n) { int x; if (n == NOT_FOUND) return NOT_FOUND; x = LEFT(ix,n); if (x == NOT_FOUND) { int r = n; for (x = PARENT(ix,r); x != NOT_FOUND; r = x, x = PARENT(ix,r)) if (RIGHT(ix,x) == r) { if (r == n) r = x; return r; } return NOT_FOUND; } while (RIGHT(ix,x) != NOT_FOUND) x = RIGHT(ix,x); return x; } static int next_right(int ix, int n) { int x; if (n == NOT_FOUND) return NOT_FOUND; x = RIGHT(ix,n); if (x == NOT_FOUND) { int r = n; for (x = PARENT(ix,r); x != NOT_FOUND; r = x, x = PARENT(ix,r)) if (LEFT(ix,x) == r) { if (r == n) r = x; return r; } return NOT_FOUND; } while (LEFT(ix,x) != NOT_FOUND) x = LEFT(ix,x); return x; } inline static int similar(struct sig *s, int j) { int k, miss; struct sig *t; if (j == NOT_FOUND) return 0; t = &history[j].signature; for (k = miss = 0; k < N_CRC && miss < 3; k++) miss += (s->s[k] != t->s[k]); if (miss > 2) return 0; return history[j].hits; } static int sim_sum(struct sig *s, int n) { int sum; int ix; if (n == NOT_FOUND || history[n].mark != NOT_FOUND) return 0; sum = similar(s, n); if (sum == 0) return 0; if (basemark == NOT_FOUND) basemark = history[n].mark = n; else { history[n].mark = basemark; basemark = n; } for (ix = 0; ix < 3; ix++) { sum += sim_sum(s, next_left(ix,n)); sum += sim_sum(s, next_right(ix,n)); } return sum; } static int find_oldest(void) { int n = root[3]; if (n != NOT_FOUND) while (LEFT(3, n) != NOT_FOUND) n = LEFT(3, n); return n; } static int find(struct sig *this, int r) { while (r != NOT_FOUND) { struct sig *sb = &history[r].signature; int d = memcmp(this->s, sb->s, sizeof(*this)); if (d == 0) return r; if (d < 0) r = LEFT(0,r); else r = RIGHT(0,r); } return r; } #ifndef NDEBUG static void showtree(int ix, int r, int h) { int i; if (r == NOT_FOUND) return; if (RIGHT(ix,r) != NOT_FOUND) showtree(ix, RIGHT(ix,r), h+1); fprintf(stderr, "%d", ix); for (i = 0; i < h; i++) fputs(" ", stderr); fprintf(stderr, "%5d(%d) %5d %5d %04X %04X %04X\n", r, HEIGHT(ix,r), LEFT(ix,r), RIGHT(ix,r), history[r].signature.s[7], history[r].signature.s[8], history[r].signature.s[9]); if (LEFT(ix,r) != NOT_FOUND) showtree(ix, LEFT(ix,r), h+1); } static int checktree(int ix, int r) { int h, hl, hr; if (r == NOT_FOUND) return -1; hl = checktree(ix, LEFT(ix, r)); hr = checktree(ix, RIGHT(ix, r)); assert(LEFT(ix,r) == NOT_FOUND || cmp[ix](LEFT(ix, r), r) < 0); assert(RIGHT(ix,r) == NOT_FOUND || cmp[ix](r, RIGHT(ix, r)) < 0); if (LEFT(ix,r)!=NOT_FOUND) assert(PARENT(ix,LEFT(ix,r)) == r); if (RIGHT(ix,r)!=NOT_FOUND) assert(PARENT(ix,RIGHT(ix,r)) == r); h = 1 + (hl > hr ? hl : hr); assert(h == HEIGHT(ix,r)); return h; } static void testtrees(void) { int ix; for (ix = 0; ix < 3; ix++) { int r = root[ix]; if (r == NOT_FOUND) continue; assert(PARENT(ix,r) == NOT_FOUND); checktree(ix, r); } if ((msgnum & 63) == 0) { if (root[3] == NOT_FOUND) return; assert(PARENT(ix,root[3]) == NOT_FOUND); checktree(ix, root[3]); } } #endif static int countgroups(char *beg, char *end, char *name) { int len = strlen(name); while (beg < end) { if (strncasecmp(beg, name, len) == 0) { int n = 0; char *eoh = beg + len; while (eoh < end-1 && (*eoh != '\n' || *(eoh+1) == '\t' || *(eoh+1) == ' ')) if (*eoh++ == ',') n++; return n+1; } while (beg < end && *beg != '\n') beg++; if (*beg == '\n') beg++; } return 0; } static void check(char *fname) { int fd, sbl, ns, ng, nf; char *fp, *fbase, *body, *fend; int ix, n; struct sig this; struct chain *c; struct stat st; #ifndef NDEBUG testtrees(); #endif if ((fd = open(fname, O_RDONLY)) < 0) return; if (fstat(fd, &st) < 0) { close(fd); return; } fbase = (char *)mmap((caddr_t)NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0); fend = fbase + st.st_size; if (fbase == (char *)-1) { close(fd); return; } for (body = fbase; body < fend; ) { char *p = memchr(body, '\n', fend - body - 2); body = p ? p+1 : fend; if (body[0] == '\n') { body++; break; } } sbl = 0; /* number of significant body lines */ ns = 0; /* seen a non-space */ ng = countgroups(fbase, body, "newsgroups:"); nf = (ng > 3) ? ng : 3; /* only need to count 3 or ng body lines */ for (fp = body; fp < fend && sbl < nf; fp++) { if (*fp == '\n') { sbl += ns; ns = 0; continue; } ns |= (*fp != ' '); } nf = countgroups(fbase, body, "followup-to:"); if (nf == 0) nf = ng; if (ng + nf*nf > 60 || ng > sbl) { close(fd); munmap(fbase, st.st_size); if (testmode) puts(fname); else if (unlink(fname) == 0) syslog(L_TRACE, "removed excessive crosspost %s", fname); else syslog(L_NOTICE, "can't unlink %s (%m)", fname); if (expover) { fputs(fname, expover); putc('\n', expover); fflush(expover); } return; } memset(&this, 0, sizeof this); ix = 0; #define SKIP() { \ char *p = memchr(fp, '\n', fend - fp - 1); \ fp = p ? p+1 : fend; \ continue; \ } if (sbl < 3) { /* treat header lines individually if body is short */ ns = 0; for (fp = fbase; fp < body - 1; ) { int i; for (i = 0; i < sizeof(h_ignore) / sizeof(h_ignore[0]); i++) if ((h_ignore[i].flag&H_IGN) && fend - fp > h_ignore[i].len && strncasecmp(fp, h_ignore[i].val, h_ignore[i].len) == 0) break; if (i < sizeof(h_ignore) / sizeof(h_ignore[0])) SKIP(); fp = crc(this.s+ix, fp, fend); if (++ix >= N_CRC) ix = 0; } } else { /* crunch headers together into signature[0] */ ix = ns = 1; for (fp = fbase; fp < body - 1; ) { int i; for (i = 0; i < sizeof(h_ignore) / sizeof(h_ignore[0]); i++) if ((h_ignore[i].flag&H_IGN) && fend - fp > h_ignore[i].len && strncasecmp(fp, h_ignore[i].val, h_ignore[i].len) == 0) break; if (i < sizeof(h_ignore) / sizeof(h_ignore[0])) SKIP(); fp = crc(this.s, fp, fend); } } for (fp = body; fp < fend; ) { if (fend - fp > 3 && fp[0] == '-' && fp[1] == '-' && fp[3] != ' ' && fp[3] != '\n' && fp[3] != '\t') SKIP(); /* skip multi-part separators, they are often random */ fp = crc(this.s+ix, fp, fend); if (++ix >= N_CRC) ix = ns; } close(fd); munmap(fbase, st.st_size); #ifdef FREQUENCY_CHECK for (ix = 0; ix < N_CRC; ix++) ftab[ix][this.s[ix]]++; #endif ix = 0; if ((n = find(&this, root[0])) != NOT_FOUND) { history[n].hits++; history[n].num = msgnum; } else { /* make room, insert */ if (next_free < hist_size) n = next_free++; else { struct chain *c1; n = find_oldest(); for (c = history[n].info; c; c = c1) { free(c->name); c1 = c->next; c->next = freechain; freechain = c; } for (ix = 0; ix < 4; ix++) delete(ix,n); } memcpy(&history[n].signature, &this, sizeof this); history[n].hits = 1; history[n].info = NULL; history[n].num = msgnum; for (ix = 0; ix < 4; ix++) insert(n, ix); } msgnum++; if (freechain == NULL) { int i; freechain = xmalloc(100*sizeof(struct chain)); for (i = 0; i < 99; i++) freechain[i].next = freechain + i + 1; freechain[99].next = NULL; } c = freechain; freechain = freechain->next; c->next = history[n].info; history[n].info = c; c->name = xmalloc(strlen(fname) + 1); strcpy(c->name, fname); { struct sig *s = &history[n].signature; int sum; int can_it = 0; basemark = NOT_FOUND; sum = sim_sum(s,n); if (sum >= limit) { if (do_repost && sum == limit) repost(fname); can_it = 1; } while (basemark != NOT_FOUND) { int i = history[basemark].mark; if (can_it) cancel(basemark); history[basemark].mark = NOT_FOUND; basemark = i; } } /* periodically re-sort the hit-rate tree, because the sort condition is time-dependent and we want to have quick access to the node with the lowest hit-rate */ if ((msgnum & 63) == 0) { root[3] = NOT_FOUND; for (ix = 0; ix < next_free; ix++) insert(ix, 3); } } static void save_checkpoint(void) { int ch_count, i, tp; FILE *cp; #ifdef FREQUENCY_CHECK for (i = 0; i < N_CRC; i++) { int mx = ftab[i][0]; int mi = 0; int j; for (j = 1; j < 0x10000; j++) if (ftab[i][j] > mx) { mx = ftab[i][j]; mi = j; } printf("ftab[%d][%d] = %d\n", i, mi, mx); } #endif rename(CP_FILE, CP_FILE ".old"); if ((cp = fopen(CP_FILE, "w")) == NULL) { if (testmode) perror(CP_FILE); else syslog(L_ERROR, "cant create " CP_FILE ": %m"); return; } fwrite(CP_MAGIC, CP_MAGIC_SIZE, 1, cp); fwrite(&hist_size, sizeof hist_size, 1, cp); fwrite(&next_free, sizeof next_free, 1, cp); fwrite(&msgnum, sizeof msgnum, 1, cp); fwrite(root, sizeof root, 1, cp); for (ch_count = 1, i = 0; i < next_free; i++) { struct history h = history[i]; if (h.info) { struct chain *p; int c = ch_count; for (p = h.info; p; p = p->next) ch_count++; h.info = (struct chain *) c; } fwrite(&h, sizeof h, 1, cp); } fwrite(&ch_count, sizeof ch_count, 1, cp); for (tp = 0, ch_count = 1, i = 0; i < next_free; i++) { struct chain *p = history[i].info; if (p == NULL) continue; ch_count++; while (p) { struct chain c = *p; c.next = (struct chain *) (c.next ? ch_count++ : 0); c.name = (char *) tp; tp += strlen(p->name) + 1; fwrite(&c, sizeof c, 1, cp); p = p->next; } } for (i = 0; i < next_free; i++) { struct chain *p = history[i].info; while (p) { fputs(p->name, cp); fputc('\n', cp); p = p->next; } } if (!ferror(cp) && fclose(cp) == 0) unlink(CP_FILE ".old"); else if (testmode) perror(CP_FILE); else syslog(L_ERROR, "problem writing checkpoint to " CP_FILE ": %m"); } static void load_checkpoint(int load) { long vals[3]; char magic[CP_MAGIC_SIZE]; idx_t r[4]; int ch_count, i; struct chain *chainp; FILE *cp; if (!load) goto alloc; sleep(2); /* allow time for previous incarnation to dump its checkpoint */ if ((cp = fopen(CP_FILE, "r")) == NULL || fread(magic, CP_MAGIC_SIZE, 1, cp) != 1 || memcmp(magic, CP_MAGIC, CP_MAGIC_SIZE) != 0 || fread(vals, sizeof(long), 3, cp) != 3) goto fail; if (fread(r, sizeof(idx_t), 4, cp) != 4) goto fail; if (vals[0] > hist_size) { syslog(L_ERROR, "checkpoint history size(%d) > requested(%d)", vals[0], hist_size); hist_size = vals[0]; } history = xmalloc(hist_size * sizeof(*history)); memset(history, 0, hist_size * sizeof(*history)); if (fread(history, sizeof(*history), (unsigned) vals[1], cp) != vals[1] || fread(&ch_count, sizeof ch_count, 1, cp) != 1) goto fail; ch_count--; chainp = xmalloc(ch_count * sizeof(*chainp)); if (fread(chainp, sizeof(*chainp), (unsigned) ch_count, cp) != ch_count) goto fail; for (i = 0; i < vals[1]; i++) { struct chain *p; if (history[i].info) { history[i].info = chainp + (int) history[i].info - 1; for (p = history[i].info; p->next; p = p->next) p->next = chainp + (int) p->next - 1; } } for (i = 0; i < ch_count; i++) { char buf[BUFSIZ]; unsigned int len; if (fgets(buf, BUFSIZ, cp) == NULL) goto fail; len = strlen(buf); if (buf[len-1] != '\n') goto fail; buf[len-1] = 0; chainp[i].name = xmalloc(len); memcpy(chainp[i].name, buf, len); } fclose(cp); next_free = vals[1]; msgnum = vals[2]; memcpy(root, r, sizeof(r)); for (i = 0; i < hist_size; i++) history[i].mark = NOT_FOUND; return; fail: if (testmode) perror(CP_FILE); else syslog(L_ERROR, "cant read " CP_FILE ": %m"); if (history) free(history); if (cp) fclose(cp); alloc: history = xmalloc(hist_size * sizeof(*history)); memset(history, 0, hist_size * sizeof(*history)); for (i = 0; i < hist_size; i++) history[i].mark = NOT_FOUND; } static void * xmalloc(size_t size) { void *p = malloc(size); if (p == NULL) { if (testmode) fputs("insufficient memory", stderr); else syslog(L_ERROR, "insufficient memory"); exit(1); } return p; }