| 1 | |
|---|
| 2 | /* setup filesnames |
|---|
| 3 | */ |
|---|
| 4 | |
|---|
| 5 | if (!*colcdb) sprintf(colcdb,"%s.c",collectiondb); |
|---|
| 6 | if (!*colndb) sprintf(colndb,"%s.n",collectiondb); |
|---|
| 7 | if (!*colxdb) sprintf(colxdb,"%s.x",collectiondb); |
|---|
| 8 | if (!*colvdb) sprintf(colvdb,"%s.v",collectiondb); |
|---|
| 9 | |
|---|
| 10 | |
|---|
| 11 | /* collection processing |
|---|
| 12 | */ |
|---|
| 13 | /* alloc///colcdb/colndb/coljdb/docvdb_V3 */ |
|---|
| 14 | for (irec=maxnrec; irec--; ) { if (!vrecp[irec]) /* ja' decrementado */ break; } |
|---|
| 15 | if (irec<0L) fatal("wtrig2/irec"); |
|---|
| 16 | recallok(irec,awtmaxmfrl); |
|---|
| 17 | |
|---|
| 18 | /* alloc///colvdb - w2rcol.c */ |
|---|
| 19 | for (irectv=maxnrec; irectv--; ) { if (!vrecp[irectv]) /* ja' decrementado */ break; } |
|---|
| 20 | if (irectv<0L) fatal("wtrig2/irectv"); |
|---|
| 21 | recallok(irectv,awtmaxmfrl); |
|---|
| 22 | |
|---|
| 23 | |
|---|
#if JDIN440
/* Assumes jd comes in TAG440: read record 1 of the collection db and
 * set readcolljd when fieldx() finds occurrence 1 of TAG440 in it.
 */
RECORD(irec, collectiondb, 1);
if (RECrc != RCNORMAL) {
    fatal("wtrig2/COLLECTION/RCNORMAL");
}
if (fieldx(irec, TAG440, 1) >= 0) {
    readcolljd = 1;
}
#endif
| 32 | |
|---|
| 33 | /* get collection size, classes, width, etc from file |
|---|
| 34 | */ |
|---|
| 35 | RECORD(irec,colcdb,1); |
|---|
| 36 | if (RECrc != RCNORMAL) fatal("wtrig2/COLLECTION_SIZE/RCNORMAL"); |
|---|
| 37 | |
|---|
| 38 | /* get collection size: N */ |
|---|
| 39 | readcollection_size=atol(recfield(awtdatabuff,irec,TAG2,1,"")); |
|---|
| 40 | /* get max postings in inverteddb: HN */ |
|---|
| 41 | readpostingson_size=atol(recfield(awtdatabuff,irec,TAG3,1,"")); |
|---|
| 42 | |
|---|
| 43 | /* get other parms */ |
|---|
| 44 | for (iocc=1; *recfield(awtdatabuff,irec,TAG1,iocc,""); iocc++) { |
|---|
| 45 | char *q; |
|---|
| 46 | q="^nK^v"; |
|---|
| 47 | p=strstr(awtdatabuff,q); if (p) readcategories=atol(p+strlen(q)); |
|---|
| 48 | q="^nwidth^v"; |
|---|
| 49 | p=strstr(awtdatabuff,q); if (p) readwidth=atoi(p+strlen(q)); |
|---|
| 50 | q="^nmaxprim^v"; |
|---|
| 51 | p=strstr(awtdatabuff,q); if (p) readmaxprim=atol(p+strlen(q)); |
|---|
| 52 | q="^nmaxtv^v"; |
|---|
| 53 | p=strstr(awtdatabuff,q); if (p) readmaxtv=atoi(p+strlen(q)); |
|---|
| 54 | q="^nnterms^v"; |
|---|
| 55 | p=strstr(awtdatabuff,q); if (p) readnterms=atoi(p+strlen(q)); |
|---|
| 56 | q="^nlanguages^v"; |
|---|
| 57 | p=strstr(awtdatabuff,q); if (p) readnlangs=atoi(p+strlen(q)); |
|---|
| 58 | q="^nbin^v"; |
|---|
| 59 | p=strstr(awtdatabuff,q); if (p) readwritebin=atoi(p+strlen(q)); |
|---|
| 60 | |
|---|
| 61 | q="^nmaxrf^v"; |
|---|
| 62 | p=strstr(awtdatabuff,q); if (p) readmaxrf=atof(p+strlen(q)); |
|---|
| 63 | |
|---|
| 64 | q="^ndmfn^v"; // when collection was wtrig1'ed |
|---|
| 65 | p=strstr(awtdatabuff,q); if (p) readcollmfntag=atoi(p+strlen(q)); |
|---|
| 66 | |
|---|
| 67 | if (readpostingson_size < 1) readpostingson_size=readcollection_size*readmaxrf; |
|---|
| 68 | |
|---|
| 69 | #if ALLOWUPCASE |
|---|
| 70 | q="^ncase^v"; |
|---|
| 71 | p=strstr(awtdatabuff,q); if (p) { |
|---|
| 72 | char *x=strchr(q=p+strlen(q),'^'); |
|---|
| 73 | if (x) { |
|---|
| 74 | char c= *x; |
|---|
| 75 | *x='\0'; |
|---|
| 76 | memset(xisiswctab,0x00,sizeof(xisisuctab)); |
|---|
| 77 | xisiswctot=0; |
|---|
| 78 | if (strcmp(q,"ascii") == 0) { |
|---|
| 79 | memcpy(xisisuctab,isisuctab,sizeof(xisisuctab)); |
|---|
| 80 | for (; isisactab[xisiswctot]; xisiswctot=xisiswctot+1) xisiswctab[isisactab[xisiswctot]]=1; |
|---|
| 81 | } |
|---|
| 82 | if (strcmp(q,"ansi") == 0) { |
|---|
| 83 | memcpy(xisisuctab,ansiuctab,sizeof(xisisuctab)); |
|---|
| 84 | for (; ansiactab[xisiswctot]; xisiswctot=xisiswctot+1) xisiswctab[ansiactab[xisiswctot]]=1; |
|---|
| 85 | } |
|---|
| 86 | else { |
|---|
| 87 | unsigned char acbuff[256]; |
|---|
| 88 | if (!loaductb(NULL,isisuctab,q)) fatal(q); |
|---|
| 89 | if (!loadactb(NULL,acbuff,q)) fatal(q); |
|---|
| 90 | for (; acbuff[xisiswctot]; xisiswctot=xisiswctot+1) xisiswctab[acbuff[xisiswctot]]=1; |
|---|
| 91 | } |
|---|
| 92 | *x=c; |
|---|
| 93 | } |
|---|
| 94 | } |
|---|
| 95 | #endif |
|---|
| 96 | } |
|---|
| 97 | // dbxflush(colcdb); /* close/flush */ // should work.. |
|---|
| 98 | |
|---|
| 99 | |
|---|
| 100 | if (readcollection_size < 1) fatal("wtrig2/invalid readcollection_size"); |
|---|
| 101 | if (readpostingson_size < 1) fatal("wtrig2/invalid readpostingson_size"); |
|---|
| 102 | if (readcategories < 1) fatal("wtrig2/invalid readcategories"); |
|---|
| 103 | if (readwidth < 1) fatal("wtrig2/invalid readwidth"); |
|---|
| 104 | if (readmaxprim < 1) fatal("wtrig2/invalid readmaxprim"); |
|---|
| 105 | if (readmaxtv < 1) fatal("wtrig2/invalid readmaxtv"); |
|---|
| 106 | if (readnterms < 1) fatal("wtrig2/invalid readnterms"); |
|---|
| 107 | if (readnlangs < 0) fatal("wtrig2/invalid readnlangs"); |
|---|
| 108 | if (readnterms < 1) fatal("wtrig2/invalid readnterms"); |
|---|
| 109 | if (readwritebin < 1) fatal("wtrig2/invalid readwritebin"); |
|---|
| 110 | if (readmaxrf < 0) fatal("wtrig2/invalid readmaxrf"); |
|---|
| 111 | if (readcollmfntag < 0) fatal("wtrig2/invalid readcollmfntag"); |
|---|
| 112 | if (readwidth > MAXPARMWIDTH) fatal("wtrig2/invalid readwidth"); |
|---|
| 113 | |
|---|
| 114 | /* set */ |
|---|
| 115 | parmcollmfntag=readcollmfntag; |
|---|
| 116 | parmmaxrf=readmaxrf; |
|---|
| 117 | parmmaxtv=readmaxtv; |
|---|
| 118 | parmmaxrel=PARMMAXREL; |
|---|
| 119 | parmminsim=PARMMINSIM; |
|---|
| 120 | parmmaxsim=PARMMAXSIM; |
|---|
| 121 | |
|---|
| 122 | /* get */ |
|---|
| 123 | #include "w2pcox.c" |
|---|
| 124 | /////////////////////////////////////////////////////////////////////////////////////////////////// |
|---|
| 125 | |
|---|
| 126 | /* chk */ |
|---|
| 127 | if (parmcollmfntag > readcollmfntag) parmcollmfntag=readcollmfntag; |
|---|
| 128 | if (parmmaxrf > readmaxrf) parmmaxrf=readmaxrf; |
|---|
| 129 | if (parmmaxtv > readmaxtv) parmmaxtv=readmaxtv; |
|---|
| 130 | if (parmmaxrel > readcollection_size) parmmaxrel=readcollection_size; |
|---|
| 131 | if (parmminsim > parmmaxsim) parmminsim=PARMMINSIM; |
|---|
| 132 | if (parmminsim > parmmaxsim) parmmaxsim=PARMMAXSIM; |
|---|
| 133 | |
|---|
| 134 | /* keep parms in coll structure */ |
|---|
| 135 | collcollmfntag=parmcollmfntag; |
|---|
| 136 | collmaxrf=parmmaxrf; |
|---|
| 137 | collmaxtv=parmmaxtv; |
|---|
| 138 | collmaxrel=parmmaxrel; |
|---|
| 139 | collminsim=parmminsim; |
|---|
| 140 | collmaxsim=parmmaxsim; |
|---|
| 141 | |
|---|
| 142 | collcolljd=readcolljd; |
|---|
| 143 | |
|---|
| 144 | |
|---|
| 145 | /* check collection size x collection maxmfn */ |
|---|
| 146 | COLLECTION_SIZE=readcollection_size; |
|---|
| 147 | if (parmtrace & 0x02) { |
|---|
| 148 | printf(" 2| |%"_LD_"\n",COLLECTION_SIZE); |
|---|
| 149 | } |
|---|
| 150 | |
|---|
| 151 | |
|---|
| 152 | /* allocate COLLECTION's term frequencies |
|---|
| 153 | */ |
|---|
| 154 | COLLECTION= (LONGX *) loadfile(NULL,'@',"",NULL, readnterms*sizeof(LONGX), '\0'); |
|---|
| 155 | memset(COLLECTION,0x00,readnterms*sizeof(float)); |
|---|
| 156 | |
|---|
| 157 | /* allocate COLLECTION's term global weights |
|---|
| 158 | */ |
|---|
| 159 | GLOBALW= (float *)loadfile(NULL,'@',"",NULL, readmaxprim*sizeof(float), '\0'); |
|---|
| 160 | memset(COLLECTION,0x00,readnterms*sizeof(float)); |
|---|
| 161 | |
|---|
| 162 | |
|---|
| 163 | /* allocate bsrch table |
|---|
| 164 | */ |
|---|
| 165 | table=bsrchalloc(readnterms,readwidth,&tabentries); |
|---|
| 166 | |
|---|
| 167 | if (readnterms > readcategories) fatal("wtrig2/invalid categories or nterms"); |
|---|
| 168 | if (readnterms > readmaxprim) fatal("wtrig2/invalid maxprim or nterms"); |
|---|
| 169 | |
|---|
| 170 | hwidth=(readwidth+1); |
|---|
| 171 | |
|---|
| 172 | /* and load term keys - distribution / COLLECTION / GLOBALW |
|---|
| 173 | */ |
|---|
| 174 | if (parmtrace2) printf("+++ readnterms=%"_LD_"\n",readnterms); |
|---|
| 175 | if (parmtrace2) printf("+++ readwidth=%"_LD_"\n",readwidth); |
|---|
| 176 | /* load collection terms in the hashing table |
|---|
| 177 | */ |
|---|
| 178 | count=0; |
|---|
| 179 | for (mfn=1; ; mfn++) { |
|---|
| 180 | RECORD(irec,colndb,mfn); |
|---|
| 181 | if (RECrc == RCEOF) break; |
|---|
| 182 | if (RECrc != RCNORMAL) continue; |
|---|
| 183 | for (dirp=MFRdir, xdir=0, loop=MFRnvf; loop--; dirp++, xdir++) { |
|---|
| 184 | int found; |
|---|
| 185 | LONGX ndocs; |
|---|
| 186 | LONGX hidx; |
|---|
| 187 | |
|---|
| 188 | if (dirp->tag != TAG3) continue; |
|---|
| 189 | if (dirp->len == 0) continue; // wtrig1 |
|---|
| 190 | for (keylen=0, p=fldp=FIELDP(xdir), left=dirp->len; left; ) { |
|---|
| 191 | if (*p == '^') break; |
|---|
| 192 | keylen++; p++; left--; |
|---|
| 193 | } |
|---|
| 194 | ndocs=0; |
|---|
| 195 | if (left > 2) if (*p++ == '^') if (*p++ == 'n') { /* ^n */ |
|---|
| 196 | left=left-2; |
|---|
| 197 | while (left) { |
|---|
| 198 | if (!isdigit(*p)) break; |
|---|
| 199 | ndocs=ndocs*10+((int)(*p)-(int)'0'); |
|---|
| 200 | p++; left--; |
|---|
| 201 | } |
|---|
| 202 | } |
|---|
| 203 | |
|---|
| 204 | bsrchindex(table,tabentries,readwidth,fldp,keylen,&found); |
|---|
| 205 | if (found) fatal("wtrig2/duplicated collection term"); |
|---|
| 206 | hidx=bsrchstore(table,readnterms,&tabentries,readwidth,fldp,keylen); |
|---|
| 207 | if (hidx < 0 || ndocs>COLLECTION_SIZE) fatal("wtrig2/bsrchstore/bug"); |
|---|
| 208 | |
|---|
| 209 | if (ndocs) { |
|---|
| 210 | double lognum = (double)COLLECTION_SIZE; |
|---|
| 211 | double logden = (double)ndocs; |
|---|
| 212 | COLLECTION[hidx]=ndocs; |
|---|
| 213 | GLOBALW[hidx] = (float)sqrt(log(lognum/logden)); |
|---|
| 214 | if (parmtrace & 0x02) printf(" 2|%s|%"_LD_"|%f\n",table+hidx*hwidth,COLLECTION[hidx],GLOBALW[hidx]); |
|---|
| 215 | } |
|---|
| 216 | } |
|---|
| 217 | count++; |
|---|
| 218 | //if (parmtell) if (count%parmtell == 0) fprintf(stderr,"+ %"_LD_"\n",count); |
|---|
| 219 | } |
|---|
| 220 | // dbxflush(colndb); /* close/flush */ |
|---|
| 221 | if (parmtrace2) printf("+++ tabentries=%"_LD_"\n",tabentries); |
|---|
| 222 | if (parmtell) /*if (count%parmtell == 0)*/ fprintf(stderr,"+ %"_LD_" terms in collection \n",count); |
|---|
| 223 | |
|---|
| 224 | |
|---|
| 225 | |
|---|
| 226 | /* load index |
|---|
| 227 | */ |
|---|
| 228 | /* alloc///colxdb */ |
|---|
| 229 | for (ifrec=maxnrec; ifrec--; ) { if (!vrecp[ifrec]) /* ja' decrementado */ break; } |
|---|
| 230 | if (ifrec<0L) fatal("wtrig2/ifrec"); |
|---|
| 231 | //recallok(ifrec,awtmaxmfrl); |
|---|
| 232 | recallok(ifrec,BUFSIZ+readpostingson_size*sizeof(LONGX)); |
|---|
| 233 | |
|---|
| 234 | /* mstsetup col.x */ /* AOT, 04/08/2005 */ |
|---|
| 235 | if (parmtell) fprintf(stderr,"+ loading %s term indexes..\n",colxdb); |
|---|
| 236 | mstsetup(colxdb,LONGX_MAX,LONGX_MAX); |
|---|
| 237 | RECORD(ifrec,colxdb,0L); |
|---|
| 238 | if (parmtell) fprintf(stderr,"+ %"_LD_" term indexes loaded\n",MF0nxtmfn-1); |
|---|
| 239 | |
|---|
| 240 | |
|---|
| 241 | |
|---|
| 242 | /* document processing |
|---|
| 243 | */ |
|---|
| 244 | /* alloc///docvdb */ |
|---|
| 245 | for (inirec=maxnrec; inirec--; ) { if (!vrecp[inirec]) /* ja' decrementado */ break; } |
|---|
| 246 | if (inirec<0L) fatal("wtrig2/inirec"); |
|---|
| 247 | recallok(inirec,awtmaxmfrl); |
|---|
| 248 | |
|---|
| 249 | /* alloc///docydb ctl */ |
|---|
| 250 | for (outcrec=maxnrec; outcrec--; ) { if (!vrecp[outcrec]) /* ja' decrementado */ break; } |
|---|
| 251 | if (outcrec<0L) fatal("wtrig2/outcrec"); |
|---|
| 252 | recallok(outcrec,sizeof(M0STRU)); |
|---|
| 253 | |
|---|
| 254 | /* alloc///docydb */ |
|---|
| 255 | for (outirec=maxnrec; outirec--; ) { if (!vrecp[outirec]) /* ja' decrementado */ break; } |
|---|
| 256 | if (outirec<0L) fatal("wtrig2/outirec"); |
|---|
| 257 | recallok(outirec,awtmaxmfrl); // RECsetup in w2setrt.c / w2outx.c + w2out1.c |
|---|
| 258 | |
|---|
| 259 | /* alloc document' term vector |
|---|
| 260 | */ |
|---|
| 261 | docvector= (DOCVEC *)loadfile(NULL,'@',"",NULL,readmaxtv*sizeof(DOCVEC),'\0'); |
|---|
| 262 | docvectcnt=0; |
|---|
| 263 | |
|---|
| 264 | /* allocate area for term index hits |
|---|
| 265 | */ |
|---|
| 266 | hitbytes=COLLECTION_SIZE/8+1; /* bit string size */ |
|---|
| 267 | HITS= (char *)loadfile(NULL,'@',"",NULL,hitbytes,'\0'); |
|---|
| 268 | |
|---|
| 269 | /* allocate DOCUMENT's term frequencies and weights |
|---|
| 270 | */ |
|---|
| 271 | DOCUMENT= (int *) loadfile(NULL,'@',"",NULL, readnterms*sizeof(int), '\0'); |
|---|
| 272 | PRODW= (float *)loadfile(NULL,'@',"",NULL, readnterms*sizeof(float), '\0'); |
|---|
| 273 | WEIGHT= (float *)loadfile(NULL,'@',"",NULL, readnterms*sizeof(float), '\0'); |
|---|
| 274 | |
|---|
| 275 | /* allocate list of related |
|---|
| 276 | */ |
|---|
| 277 | lista=(LISTA *)loadfile(NULL,'@',"",NULL,parmmaxrel*sizeof(LISTA),'\0'); |
|---|
| 278 | |
|---|
| 279 | |
|---|
| 280 | /* Program WTRIG1: trigrams 1A |
|---|
| 281 | */ |
|---|
| 282 | |
|---|
| 283 | /* alloc/// */ |
|---|
| 284 | for (zirec=maxnrec; zirec--; ) { if (!vrecp[zirec]) /* ja' decrementado */ break; } |
|---|
| 285 | if (zirec<0L) fatal("wtrig2/zirec"); |
|---|
| 286 | recallok(zirec,awtmaxmfrl); |
|---|
| 287 | |
|---|
| 288 | /* alloc working areas |
|---|
| 289 | */ |
|---|
| 290 | trigbuff=loadfile(NULL,'@',"",NULL,awtmaxmfrl+awtmaxmfrl,'\0'); |
|---|
| 291 | |
|---|
| 292 | /* allocate list of relevant terms |
|---|
| 293 | */ |
|---|
| 294 | list1a= (LIST1A *)loadfile(NULL,'@',"",NULL, readmaxtv*sizeof(LIST1A), '\0'); |
|---|
| 295 | |
|---|
| 296 | /* |
|---|
| 297 | */ |
|---|