/* ==----------------------------- w21.c -----------------------------== */


| 4 | int loadtrig (int cmd, WTFUN_ARRAY *awtfp, int coll, char *text, int textlen, float parmmaxrf ) |
|---|
| 5 | { |
|---|
| 6 | WTFUN_XWT *xwtp;//=awtfp->vwtfp[coll]; |
|---|
| 7 | |
|---|
| 8 | RECSTRU *recp; |
|---|
| 9 | LONGX docmfn=1L; |
|---|
| 10 | |
|---|
| 11 | int LIST1A_DOCVEC=1; |
|---|
| 12 | |
|---|
| 13 | LONGX hidx; |
|---|
| 14 | char *h; |
|---|
| 15 | int found; |
|---|
| 16 | |
|---|
| 17 | FFI loop; |
|---|
| 18 | DIRSTRU *dirp; |
|---|
| 19 | int xdir; |
|---|
| 20 | char *fldp; |
|---|
| 21 | int len; |
|---|
| 22 | |
|---|
| 23 | int parmtrace=cmd; |
|---|
| 24 | int parmtrace2=cmd; |
|---|
| 25 | int parmdebug=0; |
|---|
| 26 | int parmdebug2=0; |
|---|
| 27 | |
|---|
| 28 | LONGX tell=0; |
|---|
| 29 | |
|---|
| 30 | |
|---|
| 31 | LONGX countok=0; |
|---|
| 32 | LONGX xcount=0; |
|---|
| 33 | LONGX ycount=0; |
|---|
| 34 | |
|---|
| 35 | LIST1A *hdr,*tail,*avail,*l; |
|---|
| 36 | |
|---|
| 37 | LONGX zcount,zcount2; |
|---|
| 38 | int *docump; |
|---|
| 39 | |
|---|
| 40 | float soma2,fator,invsqrts2; |
|---|
| 41 | float *prodp; |
|---|
| 42 | |
|---|
| 43 | |
|---|
| 44 | //WTFUN_XWT current |
|---|
| 45 | xwtp=awtfp->vwtfp[coll]; |
|---|
| 46 | |
|---|
| 47 | /* add text |
|---|
| 48 | */ |
|---|
| 49 | RECORD(zirec,"null",1L); /*MFRstatus=ACTIVE;*/ |
|---|
| 50 | sprintf(trigbuff,"D*H1 %"_LD_" ",(LONGX)textlen); |
|---|
| 51 | len=strlen(trigbuff); |
|---|
| 52 | memcpy(trigbuff+len,text,textlen); |
|---|
| 53 | trigbuff[len+textlen]='\0'; |
|---|
| 54 | if (fldupdat(zirec,trigbuff)) fatal("loadtrig/proc add field"); |
|---|
| 55 | // prtleader(recp,1L); |
|---|
| 56 | if (parmdebug) prtfields(recp,1L); |
|---|
| 57 | |
|---|
| 58 | /* extract via Gsplit=tag={trigrams|words} |
|---|
| 59 | */ |
|---|
| 60 | strcpy(trigbuff,"Gsplit=1=trigrams"); |
|---|
| 61 | //printf("+1+ ntrms=%"_LD_"\n",ntrms); |
|---|
| 62 | if (fldupdat(zirec,trigbuff)) fatal("loadtrig/Gsplit=1=trigrams"); |
|---|
| 63 | //printf("+2+ ntrms=%"_LD_"\n",ntrms); |
|---|
| 64 | // prtleader(recp,1L); |
|---|
| 65 | if (parmdebug) prtfields(recp,1L); |
|---|
| 66 | |
|---|
| 67 | |
|---|
| 68 | /* setup list of relevant terms |
|---|
| 69 | */ |
|---|
| 70 | memset(list1a,0x00,readmaxtv*sizeof(LIST1A)); |
|---|
| 71 | for (l=list1a, avail=list1a+1, loop=0; (LONGX)loop<(readmaxtv-1); loop++, l++, avail++) l->avail=avail; |
|---|
| 72 | hdr=tail=avail=list1a; |
|---|
| 73 | zcount=zcount2=0; |
|---|
| 74 | |
|---|
| 75 | /* zero and build f(t) |
|---|
| 76 | */ |
|---|
| 77 | memset(DOCUMENT,0x00,readnterms*sizeof(int)); |
|---|
| 78 | for (dirp=MFRdir, xdir=0, loop=MFRnvf; loop--; dirp++, xdir++) { |
|---|
| 79 | //FFI ucloop; |
|---|
| 80 | //unsigned char *p; |
|---|
| 81 | if (dirp->tag != 1) continue; |
|---|
| 82 | fldp=FIELDP(xdir); |
|---|
| 83 | /* convert words to upper case */ |
|---|
| 84 | //if (!trigrams) if (parmuppercase) for (p=(unsigned char *)fldp, ucloop=dirp->len; ucloop--; p++) *p=isisuctab[*p]; |
|---|
| 85 | |
|---|
| 86 | /* search term in collection via bsrchindex */ |
|---|
| 87 | hidx=bsrchindex(table,tabentries,readwidth,fldp,dirp->len,&found); |
|---|
| 88 | if (parmdebug) { |
|---|
| 89 | printf(" .|%"_LD_"|%d|%d|",docmfn,xdir,dirp->len); |
|---|
| 90 | for (len=0; (FFI)len<dirp->len; len++) printf("%c",fldp[len]); |
|---|
| 91 | printf("|%d|%"_LD_"\n",found,hidx); |
|---|
| 92 | } |
|---|
| 93 | |
|---|
| 94 | /* skip absent terms in collection */ |
|---|
| 95 | if (!found) continue; |
|---|
| 96 | #if 1 //DISCART |
|---|
| 97 | /* optimize term processing */ |
|---|
| 98 | if (parmmaxrf != 1.0) { |
|---|
| 99 | if ((float)COLLECTION[hidx]/COLLECTION_SIZE > parmmaxrf) continue; |
|---|
| 100 | } |
|---|
| 101 | #endif |
|---|
| 102 | /* count term */ |
|---|
| 103 | DOCUMENT[hidx]++; |
|---|
| 104 | } |
|---|
| 105 | |
|---|
| 106 | /* calculate localweight x globalweight |
|---|
| 107 | */ |
|---|
| 108 | memset(PRODW,0x00,readnterms*sizeof(float)); |
|---|
| 109 | soma2=0; |
|---|
| 110 | for (docump=DOCUMENT, h=table, hidx=0; hidx < tabentries; hidx++, h+=hwidth, docump++) { |
|---|
| 111 | if (*h) if (*docump) { |
|---|
| 112 | double logarg = (double)(1 + *docump); |
|---|
| 113 | float localweight; /* l(t) - trigram local weight */ |
|---|
| 114 | localweight = (float)log(logarg); |
|---|
| 115 | PRODW[hidx] = localweight*GLOBALW[hidx]; |
|---|
| 116 | fator=PRODW[hidx]; |
|---|
| 117 | soma2+=fator*fator; |
|---|
| 118 | if (parmtrace & 0x08) printf(" 8|%"_LD_"|%s|%d\n",docmfn,h,*docump); |
|---|
| 119 | if (parmtrace & 0x10) printf("16|%"_LD_"|%s|%f\n",docmfn,h,localweight); |
|---|
| 120 | if (parmtrace & 0x20) printf("32|%"_LD_"|%s|%f\n",docmfn,h,PRODW[hidx]); |
|---|
| 121 | } |
|---|
| 122 | } |
|---|
| 123 | |
|---|
| 124 | /* calculate normalization factor = sqrt of sum of squares |
|---|
| 125 | */ |
|---|
| 126 | if (soma2) invsqrts2=1/sqrt(soma2); |
|---|
| 127 | if (parmtrace & 0x40) printf("64|%"_LD_"| |%f|%f\n",docmfn,soma2,invsqrts2); |
|---|
| 128 | |
|---|
| 129 | /* skip empty vectors */ |
|---|
| 130 | if (!soma2) { |
|---|
| 131 | ycount++; |
|---|
| 132 | // continue; |
|---|
| 133 | return 0; |
|---|
| 134 | } |
|---|
| 135 | |
|---|
| 136 | /* calculate final term weight = local x global x normalization |
|---|
| 137 | */ |
|---|
| 138 | for (prodp=PRODW, h=table, hidx=0; hidx < readnterms; hidx++, h+=hwidth, prodp++) { |
|---|
| 139 | if (*h) if (*prodp) { |
|---|
| 140 | LIST1A *l,*prev; |
|---|
| 141 | float nextw; |
|---|
| 142 | float prodw= *prodp; |
|---|
| 143 | float termweight; /* w(t) - trigram normalized weight */ |
|---|
| 144 | termweight = prodw * invsqrts2; |
|---|
| 145 | prev=NULL; |
|---|
| 146 | l=NULL; if (hdr->weight) l=hdr; |
|---|
| 147 | while (l) { |
|---|
| 148 | if (termweight < l->weight) { prev=l; l=l->next; } |
|---|
| 149 | else break; |
|---|
| 150 | } |
|---|
| 151 | nextw=((l)?l->weight:0.0); |
|---|
| 152 | if (avail) { |
|---|
| 153 | LIST1A *nextavail=avail->avail; |
|---|
| 154 | avail->weight=termweight; avail->hidx=hidx; |
|---|
| 155 | avail->prev=prev; |
|---|
| 156 | avail->next=l; |
|---|
| 157 | if (prev) prev->next=avail; |
|---|
| 158 | if (nextw) l->prev=avail; |
|---|
| 159 | if (!avail->prev) hdr=avail; |
|---|
| 160 | if (!avail->next) tail=avail; |
|---|
| 161 | avail=nextavail; |
|---|
| 162 | if (parmtrace2) printf("64a|"); |
|---|
| 163 | zcount++; |
|---|
| 164 | } |
|---|
| 165 | else { |
|---|
| 166 | if (termweight > tail->weight) { |
|---|
| 167 | LIST1A *last=tail; |
|---|
| 168 | LIST1A *repl=last; |
|---|
| 169 | LIST1A *lnext=NULL; |
|---|
| 170 | if (l) lnext=l->next; |
|---|
| 171 | if (last->prev) { |
|---|
| 172 | tail=last->prev; |
|---|
| 173 | tail->next=NULL; |
|---|
| 174 | } |
|---|
| 175 | repl->weight=termweight; repl->hidx=hidx; |
|---|
| 176 | repl->prev=prev; |
|---|
| 177 | if (prev) prev->next=repl; |
|---|
| 178 | if (l) if (lnext) { |
|---|
| 179 | repl->next=l; |
|---|
| 180 | l->prev=repl; |
|---|
| 181 | } |
|---|
| 182 | if (!repl->prev) hdr=repl; |
|---|
| 183 | if (!repl->next) tail=repl; |
|---|
| 184 | if (parmtrace2) printf("64b|"); |
|---|
| 185 | zcount2++; |
|---|
| 186 | } |
|---|
| 187 | } |
|---|
| 188 | if (parmtrace2) { |
|---|
| 189 | printf("%"_LD_"|%s|%f\n",docmfn,h,termweight); |
|---|
| 190 | for (l=hdr; l; l=l->next) { |
|---|
| 191 | char *p; |
|---|
| 192 | if (!l) break; |
|---|
| 193 | p=table+(l->hidx*hwidth); |
|---|
| 194 | printf("%"_LD_"|%"_LD_"=%s|%f p=%"_LD_"|n=%"_LD_"|a=%"_LD_" hdr=%"_LD_"|lst=%"_LD_"|prev=%"_LD_"\n",docmfn,l->hidx,p,l->weight, |
|---|
| 195 | (l->prev)?((LIST1A *)(l->prev))->hidx:-1, |
|---|
| 196 | (l->next)?((LIST1A *)(l->next))->hidx:-1, |
|---|
| 197 | (l->avail)?((LIST1A *)(l->avail))->hidx:-1, |
|---|
| 198 | (hdr->hidx)?hdr->hidx:-1, |
|---|
| 199 | (tail->hidx)?tail->hidx:-1, |
|---|
| 200 | (prev)?prev->hidx:-1); |
|---|
| 201 | } |
|---|
| 202 | printf("\n"); |
|---|
| 203 | } |
|---|
| 204 | } /* end if h */ |
|---|
| 205 | } /* end for hidx */ |
|---|
| 206 | |
|---|
| 207 | if (parmdebug2) printf("+++ 1 +++|\n"); |
|---|
| 208 | /* copy term vector to DOCVEC structure (w2rdoc.c compliance) |
|---|
| 209 | */ |
|---|
| 210 | if (LIST1A_DOCVEC) { |
|---|
| 211 | DOCVEC *dv=docvector; |
|---|
| 212 | docvectcnt=0; |
|---|
| 213 | for (l=hdr; l; l=l->next) { |
|---|
| 214 | if (!l->weight) break; |
|---|
| 215 | hidx=l->hidx; h=table+hidx*hwidth; |
|---|
| 216 | /* list term vector */ |
|---|
| 217 | if (parmtrace & 0x40) printf("64|%"_LD_"|%s|%f\n",docmfn,h,l->weight); |
|---|
| 218 | /* copy term vector */ |
|---|
| 219 | dv->weight=l->weight; |
|---|
| 220 | dv->hidx=l->hidx; |
|---|
| 221 | dv->foundp=table+dv->hidx*hwidth; |
|---|
| 222 | dv->keylen=strlen(dv->foundp); |
|---|
| 223 | memcpy(dv->key,dv->foundp,dv->keylen); dv->key[dv->keylen]='\0'; |
|---|
| 224 | docvectcnt++; dv++; |
|---|
| 225 | } |
|---|
| 226 | } |
|---|
| 227 | if (parmdebug2) printf("+++ 2 +++|\n"); |
|---|
| 228 | |
|---|
| 229 | /* doc done |
|---|
| 230 | */ |
|---|
| 231 | countok++; |
|---|
| 232 | if (tell) if (countok%tell == 0) fprintf(stderr,"++ %"_LD_"+%"_LD_" %"_LD_" %"_LD_"+%"_LD_"\n",countok,xcount,ycount,zcount,zcount2); |
|---|
| 233 | |
|---|
| 234 | return 1; |
|---|
| 235 | |
|---|
| 236 | } /* end of loadtrig */ |
|---|