PEST-find

mathog at seqaxp.bio.caltech.edu mathog at seqaxp.bio.caltech.edu
Wed Apr 17 16:17:00 EST 1996


In article <kai-1404962322490001 at nips087003.nips.ac.jp>, kai at nips.ac.jp (Nobuyuki Kai) writes:
>I'm looking for info on Macintosh software doing automatic 
>serches for PEST regions. Can anyone help me? 

After my signature you will find a translation of the original pestfind
program from BasicA to ANSI C.  If you can find an ANSI C Mac compiler
it should do the trick for you.

Regards,

David Mathog
mathog at seqaxp.bio.caltech.edu
Manager, sequence analysis facility, biology division, Caltech 
**********************************************************************
/*pestfind.c
  1-NOV-1995, David Mathog, Biology Division, Caltech

  Find PEST sequences from a single protein sequence.

  REQUIRED Input format for sequence:  plain sequence on one or more lines.  No
  numbers or spaces should be present.  Sequences longer than 10000 amino
  acids will not run correctly. A terminal "*" is allowed, but not required.
  The only allowed Amino Acid designations are: ARNDCQEGHILKMFPSTWYV
  Ambiguous residues, such as 'X' will cause the program to abort.

  This is an ANSI C version, translated from the original Basic program
  provided by Martin Rechsteiner, and incorporating the corrections 
  provided to him by Bob Stellwagen.  It should be quite portable.
  (It compiles without errors or warnings in DECC 4.1 for /standard=ansi89
  and /standard=portable.)

  Compilation/Use  on OpenVMS:
    $ cc/standard=ansi89/prefix=all pestfind
    $ link pestfind 
    $ run pestfind

  Compilation/Use  on Unix (Irix 5.3, anyway):
    $ cc -xansi -o pestfind pestfind.c
    % pestfind

  For reference purposes, here is the original program:

1 REM PESTFIND
100 ON ERROR GOTO 485
105 CLS:WIDTH 40:KEY OFF:GOTO 330
110 CNT=0:HLD=1
115 FOR Q = HLD TO X
120 IF Q=1 THEN 130 ELSE 125
125 IF A$(Q)="R" OR A$(Q)="K" OR A$(Q)="H" THEN 130 ELSE 320
130 CNT=0: HND=Q :HLD=Q
135 FOR B = 1 TO 23 : R(B) = 0 :NEXT
140 TFD(21) = 0
145 FOR I = (Q+1) TO X
150 IF A$(I)="H" OR A$(I)="K" OR A$(I)="R" THEN 165 ELSE 155
155 CNT = CNT + 1
160 NEXT I
165 IF CNT < WS THEN 170 ELSE 175
170 HLD = I : GOTO 320
175 FOR K = HLD+1 TO I-1
180 FOR N = 1 TO 20
185 IF L$(N) = A$(K) THEN 195 ELSE 190
190 NEXT N
195 R(N)=R(N)+T(N)
200 NEXT K
205 IF R(15) = 0 OR R(4)+R(7) = 0 OR R(16)+R(17) = 0 THEN 210 ELSE 235
210 IF R$ = "N" THEN 230 ELSE 215
215 LPRINT "INVALID PEST SEQUENCE: ";N$;" ";Q;"-";I;"  (WS=";WS;")"
220 FOR B=HND TO I: LPRINT A$(B);:NEXT
225 LPRINT:LPRINT "--------------------------------------------------": LPRINT
230 HLD = I : GOTO 320
235 FOR J = 1 TO 20 : R(21)=R(21)+R(J):NEXT
240 R(22)=(((R(7)+R(15)+R(16)+R(17))-(T(7)+T(15)+T(17)))/R(21))*100
245 R(23)=(((R(4)+R(7)+R(15)+R(16)+R(17))-(T(7)+T(15)+T(17)))/R(21))*100
250 FOR J = 1 TO 20 :TFD(21) = TFD(21)+(TFD(J)*(R(J)/R(21))):NEXT
255 DSC2 = -1*((-.55*R(23))+(.5*TFD(21)))
260 IF DSC2 >0 THEN 265 ELSE 300
265 LPRINT "POTENTIAL PEST SEQUENCE: ";N$;" ";Q;"-";I;"  (WS=";WS;")"
270 FOR B= HND TO I: LPRINT A$(B);" ";:NEXT B
275 LPRINT:LPRINT:LPRINT "THE MOLE FRACTION OF PEDST IS: ";R(23)
280 LPRINT "THE HYDROPHOBICITY INDEX IS ";TFD(21)
285 LPRINT:LPRINT "THE PEST-FIND SCORE IS ";: LPRINT CHR$(14) DSC2
290 LPRINT:LPRINT CHR$(14) "POSSIBLE PEST SEQUENCE":LPRINT "-------------------------------------------------":LPRINT
295 GOTO 315
300 LPRINT "POOR PEST SEQUENCE: ";N$;" ";Q;"-";I;"  (WS=";WS;")"
305 FOR B= HND TO I: LPRINT A$(B);" ";:NEXT B
310 LPRINT:LPRINT "THE PEST-FIND SCORE IS ";: LPRINT CHR$(14) DSC2 :LPRINT "-------------------------------------------------"
315 HLD = HLD + I
320 NEXT Q
325 CLS:LPRINT:LPRINT "END PEST SEARCH OF ";N$:LPRINT:LPRINT:LPRINT: IF D$ = "P" GOTO 435 ELSE RUN
330 DIM R(24): DIM T(22):DIM TFD(23):DIM L$(23): DIM A$(1500)
335 FOR I = 1 TO 20 : READ L$(I):NEXT I
340 FOR I = 1 TO 20 : READ T(I):NEXT I
345 FOR I = 1 TO 20 : READ TFD(I):NEXT
350 PRINT "************** PEST FIND ***************":PRINT:PRINT
355 INPUT "ENTER PROTEIN NAME: ",N$
360 CLS:LOCATE 10,5:INPUT "PRINT INVALID PEST SEQUENCES";R$
365 CLS:PRINT:PRINT:INPUT "WHAT IS THE MINIMUM NUMBER OF AAs       BETWEEN POSITIVE FLANKS";WS
370 PRINT:PRINT:INPUT "ENTRY FROM SCREEN(S) OR FROM PROGRAM(P)";D$
375 IF D$="S" THEN 380 ELSE 435
380 CLS:PRINT:PRINT:PRINT"ENTER ONE LETTER AMINO ACID CODE FOR    SEQUENCE (* TO END)":PRINT:PRINT:PRINT
385 FOR X = 1 TO 1500
390 PRINT"AMINO ACID ";X;" : ";:INPUT A$(X)
395 IF A$(X)="*" THEN GOTO 405 ELSE 400
400 NEXT
405 LPRINT CHR$(14) "PEST SEARCH: ",N$
410 LPRINT "--------------------------------------------------"
415 LET X = X-1:GOTO 110
420 DATA A,R,N,D,C,Q,E,G,H,I,L,K,M,F,P,S,T,W,Y,V
425 DATA 71,156,114,115,103,128,129,57,137,113,113,128,131,147,97,87,101,186,163,99
430 DATA 63,0,10,10,70,10,10,41,13,90,82,6,64,72,29,36,38,36,32,87
435 READ N$
440 CLS:PRINT:PRINT:PRINT:PRINT:PRINT"READING ",N$
445 FOR B=1 TO 1500
450 READ A$(B)
455 IF A$(B)= "*" THEN GOTO 465 ELSE 460
460 NEXT B
465 X = B-1
470 LPRINT CHR$(14) "PEST SEARCH: ",N$
475 LPRINT "--------------------------------------------------"
480 GOTO 110
485 IF ERR=4 THEN RUN

*/
#include  <ctype.h>
#include  <stdlib.h>
#include  <stdio.h>
#ifdef __VMS
#include  <unixio.h>
#endif /* __VMS */
#include  <string.h>
int main(){

/* references to "was something" indicate the corresponding BASIC variable */

int i,j,k,n;           /* assorted loop variables */
int query;             /* loop variable, marks the beginning of a query, was q*/
int last;              /* loop variable, marks the end of a query, was i*/
int piece;             /* length of individual line read from input sequence */
int remaining;         /* bytes free in buffer for input sequence */
int seqlen;            /* length of AA sequence being searched */
int flank_dist;        /* distance between positive flanks, was WS*/
int aanum[256];        /* lookup table, aa_name -> position */
float score[20];       /* pest score, was array R() */
float sum_score;       /* score summation, was R(21) */
float weight_percent;  /* weight percent of PEDST, was R(23) */
float pf_score;        /* PESTfind score, was DSC2 */
float weight[20];      /* weight in grams of 1 mole of this amino acid, was T() */
float phobe[20];       /* normalized hydrophobicity value for this amino acid, was TFD()*/
float phobe_index;     /* hydrophobicity index, was TFD(21) */
char  invalid;         /* if 'y' then print invalid PEST sequences, else no, was N$*/
char  aa_name[20];     /* single letter name for this amino acid, was L$() */
char  sequence[10000]; /* a protein sequence in single letter code, was A$() */
char  line[1000];      /* a place to format output lines */
char  infile[1000];    /* input file name */
char  outfile[1000];   /* output file name */
FILE *ifile,*ofile;    /* file pointer for input file */

/* Initialize the arrays.  Use this long form because it's easier to
   check the value at a given position than in more compact forms.
   Note that the hydrophobicity value for tyrosine used here is 58, not
   32 as in the original program - following a correction from
   Bob Stellwagen (USC).
*/

   for(i=0;i<256;i++)aanum[i] = -1;
      
   i = 0;  aa_name[i]='A';  aanum[(int) 'A']=i;  weight[i]= 71;  phobe[i]=63;
   i = 1;  aa_name[i]='R';  aanum[(int) 'R']=i;  weight[i]=156;  phobe[i]= 0;
   i = 2;  aa_name[i]='N';  aanum[(int) 'N']=i;  weight[i]=114;  phobe[i]=10;
   i = 3;  aa_name[i]='D';  aanum[(int) 'D']=i;  weight[i]=115;  phobe[i]=10;
   i = 4;  aa_name[i]='C';  aanum[(int) 'C']=i;  weight[i]=103;  phobe[i]=70;
   i = 5;  aa_name[i]='Q';  aanum[(int) 'Q']=i;  weight[i]=128;  phobe[i]=10;
   i = 6;  aa_name[i]='E';  aanum[(int) 'E']=i;  weight[i]=129;  phobe[i]=10;
   i = 7;  aa_name[i]='G';  aanum[(int) 'G']=i;  weight[i]= 57;  phobe[i]=41;
   i = 8;  aa_name[i]='H';  aanum[(int) 'H']=i;  weight[i]=137;  phobe[i]=13;
   i = 9;  aa_name[i]='I';  aanum[(int) 'I']=i;  weight[i]=113;  phobe[i]=90;
   i =10;  aa_name[i]='L';  aanum[(int) 'L']=i;  weight[i]=113;  phobe[i]=82;
   i =11;  aa_name[i]='K';  aanum[(int) 'K']=i;  weight[i]=128;  phobe[i]= 6;
   i =12;  aa_name[i]='M';  aanum[(int) 'M']=i;  weight[i]=131;  phobe[i]=64;
   i =13;  aa_name[i]='F';  aanum[(int) 'F']=i;  weight[i]=147;  phobe[i]=72;
   i =14;  aa_name[i]='P';  aanum[(int) 'P']=i;  weight[i]= 97;  phobe[i]=29;
   i =15;  aa_name[i]='S';  aanum[(int) 'S']=i;  weight[i]= 87;  phobe[i]=36;
   i =16;  aa_name[i]='T';  aanum[(int) 'T']=i;  weight[i]=101;  phobe[i]=38;
   i =17;  aa_name[i]='W';  aanum[(int) 'W']=i;  weight[i]=186;  phobe[i]=36;
   i =18;  aa_name[i]='Y';  aanum[(int) 'Y']=i;  weight[i]=163;  phobe[i]=58;
   i =19;  aa_name[i]='V';  aanum[(int) 'V']=i;  weight[i]= 99;  phobe[i]=87;

   fprintf(stdout,"************** PEST FIND ***************\n\n\n");
   fprintf(stdout,"Enter the name of the amino acid sequence file to process: ");
   scanf("%s",&infile[0]);
   fprintf(stdout,"Enter the minimum number of AAs between positive flanks:   ");
   scanf("%d",&flank_dist);
   fprintf(stdout,"Print invalid pest sequences (y/n):                        ");
   scanf("%s",&invalid);
   invalid=tolower(invalid);
   fprintf(stdout,"Enter a name for the output file:                          ");
   scanf("%s",&outfile[0]);
   fprintf(stdout,"\n\n");

/* Read in the sequence file, if possible.
   Put an "^" on each end - that will be used to enforce the PEST
   definition that N and C termini act as begin/end for PEST sequences. */

   ofile = fopen(outfile,"w");
   if(!ofile){
      fprintf(stderr,"Fatal error: the file >%s< could not be created!\n",outfile);
      fprintf(stderr,"PESTFIND run aborted\n");
      exit(0);
      }

   ifile = fopen(infile,"r");
   if(!ifile){
      fprintf(stderr,"Fatal error: the file >%s< does not exist or is unreadable!\n"
         ,infile);
      fprintf(stderr,"PESTFIND run aborted\n");
      exit(0);
      }
   else{
      remaining=9999;
      seqlen=1;
      sequence[0]='^';
      while(fgets(&sequence[seqlen],remaining,ifile)){
         piece = strlen(&sequence[seqlen]);
         seqlen = seqlen + piece - 1;
         remaining = remaining - piece;
         }
      }
/*  Remove trailing "*" and any spaces after it, if present.  Add a
    trailing '^' to enforce the N/C rule. */
   for(i=0;i<seqlen;i++){
      if(sequence[i] == '*'){
         sequence[i]='^';
         sequence[i+1]='\0';
         seqlen = i+1;
         break;
         }
      }
      if(sequence[seqlen-1] != '^'){
         sequence[seqlen] = '^';
         seqlen++;
         }

/* Convert lower to uppper, and flag anything else wrong with this sequence*/

   for(i=0;i<seqlen;i++){
      sequence[i]=toupper(sequence[i]);
      for(j=0;j<20;j++){if(sequence[i] == aa_name[j])break;}
      if(j == 20 && i != 0 && i != seqlen-1){
         fprintf(stderr,"Fatal error: the file >%s< is invalid!\n",infile);
         fprintf(stderr,"Reason: >%c< is not in the single letter AA code\n"
                ,sequence[i]);
         fprintf(stderr,"PESTFIND run aborted\n");
         exit(0);
      }        
   }

/* Time for the actual calculation */

/* First find the beginning of the possible PEST region, defined as the
   N terminus or an R, K, or H amino acid.  The beginning will be 
   marked by the variable "query" */

   query=0;
   while(query<seqlen){
      if(  sequence[query] == '^' || 
           sequence[query] == 'R' || 
           sequence[query] == 'K' || 
           sequence[query] == 'H' ){
         }
      else{
         continue;   /* on query loop */
         }


      for(last=query+1;last<seqlen;last++){
         if(  sequence[last] == '^' || 
              sequence[last] == 'R' || 
              sequence[last] == 'K' || 
              sequence[last] == 'H')break;
         }    /*  loop on last*/

      if(last - query + 1 < flank_dist){
         query = last;
         continue;   /* to bottom of query loop*/
         }
      else{

/* initialize the accumulators */

         for(j=0;j<20;j++)score[j]=0;
         phobe_index=0;
         sum_score=0;

/* Add up occupancy * weight for each type of AA in this region*/

         for(k=query+1;k<last;k++){
            for(j=0;j<20;j++){
               if(aa_name[j] == sequence[k]){
                  score[j] = score[j] + weight[j];
                  break;
                  }
               } /* loop on j */
            }    /* loop on k */

         if(   (score[aanum[(int) 'P']] == 0)
            || (score[aanum[(int) 'D']] + score[aanum[(int) 'E']] == 0)
            || (score[aanum[(int) 'S']] + score[aanum[(int) 'T']] == 0)){
            if(invalid == 'y'){
               fprintf(ofile,"Invalid PEST sequence %d-%d (flank_dist=%d)\n",
                       query,last,last-query-1);
               strncpy(line,&sequence[query],last-query+1);
               line[last-query+1] = '\0';
               fprintf(ofile,"%s\n",line);
               fprintf(ofile,"---------------------------------\n\n");
               }
	    query = last;
            continue;  /* to the bottom of the query loop*/
            }
         else{
	    for(j=0;j<20;j++)sum_score=sum_score + score[j];
            weight_percent =(100.0 / sum_score) *
                      (( score[aanum[(int) 'D']]   + 
                         score[aanum[(int) 'E']]   +
                         score[aanum[(int) 'T']]   +
                         score[aanum[(int) 'S']]   +
                         score[aanum[(int) 'P']] )
                                       -
                       ( weight[aanum[(int) 'E']] +
                         weight[aanum[(int) 'P']] +
                         weight[aanum[(int) 'T']] ));

/* Presumably the original point of subtracting these was to remove them
because they were required, ie, there must have been a P, E/D or S/T to
have reached this point.  Still, it should really check for E vs. D or S
vs. T and subtracted the right one! Similarly, the three required ones
should not go into sum_score, but they do, which artificially lowers the
weight percent value.  But doing any of this would change the values 
reported by the C version relative to the Basic version.*/ 

	    for(j=0;j<20;j++)phobe_index = phobe_index +
                        (phobe[j] * score[j] / sum_score);
            pf_score = (0.55 * weight_percent) - (0.5 * phobe_index);
            if(pf_score > 0){
               fprintf(ofile,"Potential PEST sequence %d-%d (flank_dist=%d)\n",
                       query,last,last-query-1);
               strncpy(line,&sequence[query],last-query+1);
               line[last-query+1] = '\0';
               fprintf(ofile,"  %s\n",line);
               fprintf(ofile,"  The weight percent of PEDST is: %f\n",weight_percent);
               fprintf(ofile,"  The hydrophobicity index is: %f\n",phobe_index);
               fprintf(ofile,"  The PEST-FIND score is: %f\n",pf_score);
               fprintf(ofile,"---------------------------------\n\n");
               }
            else {
               fprintf(ofile,"Poor PEST sequence %d-%d (flank_dist=%d)\n",
                       query,last,last-query-1);
               strncpy(line,&sequence[query],last-query+1);
               line[last-query+1] = '\0';
               fprintf(ofile,"  %s\n",line);
               fprintf(ofile,"  The best PEST-FIND score is: %f\n",pf_score);
               fprintf(ofile,"---------------------------------\n\n");
               }
            query = last;
            continue;  /* to bottom of query loop */
            }
         query++;
         }
      }          /* loop on query */
      fprintf(stderr,"PESTFIND run completed normally\n");
}





More information about the Bio-soft mailing list