C code for Von Heijne signal sequence algorithm

Robert C. Colgrove robin at cco.caltech.edu
Wed Dec 15 12:42:34 EST 1993


Hi bionetters.
I got so many requests for the signal sequence program I mentioned
that I will repost it. Feel free to use, distribute or archive. Just
send me email if you do and cite the program as, "Colgrove, RC, PhD
Dissertation, UCSF dept of Biochemistry, 1990."

This program implements the Von Heijne algorithm to find potential signal
sequences (reference is at the end of the code). It does a very good job
at finding real signals, even internal and atypical signals. I have used it
to:
	-provide evidence that some newly cloned gene encode a secreted protein
	-identify weak of unusual internal signal sequences
	-identify mutations that would likely affect secretion 
	-examine the statistical distribution of signal sequences in real
		protein sequences and in random sequences.

The program with called with no arguments prints the matrices used and the
Von Heijne reference. With one argument, it reads that argument as a file
name containing a one-letter amino acid code and prints to the standard 
output one line for each residue showing the residue number, the one-letter
code, and a score for how likely a peptidase site that position would be.
At the last line, it prints the position of the best site and its score and
tells whether it is a likely signal. To plot a graph of the scores, calling
the program with an extra dummy argument (eg signal filename dummyarg) produces
output that can be piped to UNIX plot and graph. For example the script:

signal $1 s > infile
graph -l "signal sequence profile for "$1 < infile | plot -T4014 > outfile
cat outfile
rm infile outfile

makes a nice graph of the signal sequence profile ofthe filename given in $1.
I have also included the code for a variant of the program, signal.filtered,
which prints a zero score for all poor signal positions. This produces a much
prettier graph where only the signals show up as clean spikes.
	I have only run these programs on UNIX machine but the C code should
be portable to anything. Good luck and by all means let me know if you find
these useful.

-robin
RC Colgrove
colgrove at xtal10.harvard.edu

::::::::::::::::::::::::::::::::::::::::::::::::
signal.c
:::::::::::::::::::::::::::::::::::::::::::::::

#include <stdio.h>
#include <math.h>
#define aminotype "ACDEFGHIKLMNPQRSTVWY"
#define SAMPLE 450
#define seqfile argv[1]
float expect[] = {14.5,4.5,8.9,10.0,5.6,12.1,3.4,7.4,11.3,12.1,
                  2.7,7.1,7.4,6.3,7.6,11.4,9.7,11.1,1.8,5.6};
float freq[20][15] =
        {       16,13,14,15,20,18,18,17,25,15,47,6,80,18,6,
                3,6,9,7,9,14,6,8,5,6,19,3,9,8,3,
                0,0,0,0,0,0,0,0,5,3,0,5,0,10,11,
                0,0,0,1,0,0,0,0,3,7,0,7,0,13,14,
                13,9,11,11,6,7,18,13,4,5,0,13,0,6,4,
                4,4,3,6,3,13,3,2,19,34,5,7,39,10,7,
                0,0,0,0,0,1,1,0,5,0,0,6,0,4,2,
                15,15,8,6,11,5,4,8,5,1,10,5,0,8,7,
                0,0,0,1,0,0,1,0,0,4,0,2,0,11,9,
                71,68,72,79,78,45,64,49,10,23,8,20,1,8,4,
                0,3,7,4,1,6,2,2,0,0,0,1,0,1,2,
                0,1,0,1,1,0,0,0,3,3,0,10,0,4,7,
                2,0,2,0,0,4,1,8,20,14,0,1,3,0,22,
                0,0,0,1,0,6,1,0,10,8,0,18,3,19,10,
                2,0,0,0,0,1,0,0,7,4,0,15,0,12,9,
                9,3,8,6,13,10,15,16,26,11,23,17,20,15,10,
                2,10,5,4,5,13,7,7,12,6,17,8,6,3,10,
                20,25,15,18,13,15,11,27,0,12,32,3,0,8,17,
                4,3,3,1,1,2,5,3,1,3,0,9,0,2,0,
                0,1,4,0,0,1,3,1,1,2,0,5,0,1,7   };
main(argc,argv)
int argc;
char *argv[];
{
        int i,j,k,shift;
        float weight[20][15],score[1000],max;
        FILE *fp, *fopen();
        char c, seqbuff[15];
        if(argc==1)
        {
                printf("\n\t\tSIGNAL SEQUENCE FINDER\n");
                printf("\n\nFREQUENCY MATRIX\n\n");
                printf("\t");
                for(i=0;i<16;i++)
                {
                        if(i>3) printf(" ");
                        if(i>12) printf(" ");
                        if(i==13) i++;
                        printf("%d ",i-13);
                }
                printf("\tEXPECT\n\n");
                for(j=0;j<20;j++)
                {
                        printf("%c\t",aminotype[j]);
                        for(i=0;i<15;i++) printf(" %2.0f ",freq[j][i]);
                        printf("\t%4.1f\n",expect[j]);
                }
        }
        for(j=0;j<20;j++)
        {
                for(i=0;i<15;i++)
                {
                        if(freq[j][i]==0)
                        {
                                freq[j][i]=1;
                                if(i==10||i==12) freq[j][i]=expect[j]/SAMPLE;
                        }
                        weight[j][i]=log(freq[j][i]/expect[j]);
                }
        }
        if(argc==1)
        {
                printf("\n\nWEIGHT MATRIX (x 100)\n\n");
                printf("  ");
                for(i=0;i<16;i++)
                {
                        if(i>3) printf(" ");
                        if(i>12) printf(" ");
                        if(i==13) i++;
                        printf("  %d",i-13);
                }
                printf("\n\n");
                for(j=0;j<20;j++)
                {
                        printf("%c  ",aminotype[j]);
                        for(i=0;i<15;i++) printf("%4.0f ",100*(weight[j][i]));
                        printf("\n");
                }
           printf("\nmethod from: Gunnar von Heijne, NAR 14, p.4683, 1986\n");
        }
        if(argc!=1)
        {
                if((fp=fopen(seqfile,"r"))==NULL) printf("no such file\n");
                if(argc==2)
        printf("\n\nSCORE PROFILE of %s\n\ncleave\tscore\tresidue\n\n",seqfile);
                while((c=getc(fp))!=EOF&&c!=NULL)
                {
                        if(c==' '||c=='\t'||c=='\n') continue;
                        seqbuff[14]=c;
                        for(i=0;i<15;i++)
                         for(j=0;j<20;j++)
                          if(seqbuff[i]==aminotype[j]) score[k] += weight[j][i];
                         if(score[k]>max)
                        {
                                max=score[k];
                                shift=k;
                        }
                        printf("%d\t%4.2f",k-1,score[k]);
                        if(argc==2) printf("\t%c",seqbuff[12]);
                        printf("\n");
                        for(i=0;i<14;i++) seqbuff[i]=seqbuff[i+1];
                        k++;
                }
        if(argc==2)
          {
           printf("\nmax score is %4.2f, cleaved after res %d\n\n",max,shift-1);
                 if(max<0.0) printf("improbable signal sequence\n");
           if(max>=0.0&&max<5.0) printf("possible but ambiguous signal\n");
           if(max>=5.0&&max<10.0) printf("probable signal sequence\n");
           if(max>=10.0) printf("highly probable signal sequence\n");
          }
        }
}


::::::::::::::::::::::::::::::::::::::::::::::::::::::;;;
signal.filtered.c
:::::::::::::::::::::::::::::::::::::::::::::::::::::::: 


#include <stdio.h>
#include <math.h>
#define aminotype "ACDEFGHIKLMNPQRSTVWY"
#define SAMPLE 450
#define seqfile argv[1]
float expect[] = {14.5,4.5,8.9,10.0,5.6,12.1,3.4,7.4,11.3,12.1,
                  2.7,7.1,7.4,6.3,7.6,11.4,9.7,11.1,1.8,5.6};
float freq[20][15] =
        {       16,13,14,15,20,18,18,17,25,15,47,6,80,18,6,
                3,6,9,7,9,14,6,8,5,6,19,3,9,8,3,
                0,0,0,0,0,0,0,0,5,3,0,5,0,10,11,
                0,0,0,1,0,0,0,0,3,7,0,7,0,13,14,
                13,9,11,11,6,7,18,13,4,5,0,13,0,6,4,
                4,4,3,6,3,13,3,2,19,34,5,7,39,10,7,
                0,0,0,0,0,1,1,0,5,0,0,6,0,4,2,
                15,15,8,6,11,5,4,8,5,1,10,5,0,8,7,
                0,0,0,1,0,0,1,0,0,4,0,2,0,11,9,
                71,68,72,79,78,45,64,49,10,23,8,20,1,8,4,
                0,3,7,4,1,6,2,2,0,0,0,1,0,1,2,
                0,1,0,1,1,0,0,0,3,3,0,10,0,4,7,
                2,0,2,0,0,4,1,8,20,14,0,1,3,0,22,
                0,0,0,1,0,6,1,0,10,8,0,18,3,19,10,
                2,0,0,0,0,1,0,0,7,4,0,15,0,12,9,
                9,3,8,6,13,10,15,16,26,11,23,17,20,15,10,
                2,10,5,4,5,13,7,7,12,6,17,8,6,3,10,
                20,25,15,18,13,15,11,27,0,12,32,3,0,8,17,
                4,3,3,1,1,2,5,3,1,3,0,9,0,2,0,
                0,1,4,0,0,1,3,1,1,2,0,5,0,1,7   };
main(argc,argv)
int argc;
char *argv[];
{
        int i,j,k,shift;
        float weight[20][15],score[1000],max;
        FILE *fp, *fopen();
        char c, seqbuff[15];
        if(argc==1)
        {
                printf("\n\t\tSIGNAL SEQUENCE FINDER\n");
                printf("\n\nFREQUENCY MATRIX\n\n");
                printf("\t");
                for(i=0;i<16;i++)
                {
                        if(i>3) printf(" ");
                        if(i>12) printf(" ");
                        if(i==13) i++;
                        printf("%d ",i-13);
                }
                printf("\tEXPECT\n\n");
                for(j=0;j<20;j++)
                {
                        printf("%c\t",aminotype[j]);
                        for(i=0;i<15;i++) printf(" %2.0f ",freq[j][i]);
                        printf("\t%4.1f\n",expect[j]);
                }
        }
        for(j=0;j<20;j++)
        {
                for(i=0;i<15;i++)
                {
                        if(freq[j][i]==0)
                        {
                                freq[j][i]=1;
                                if(i==10||i==12) freq[j][i]=expect[j]/SAMPLE;
                        }
                        weight[j][i]=log(freq[j][i]/expect[j]);
                }
        }
        if(argc==1)
        {
                printf("\n\nWEIGHT MATRIX (x 100)\n\n");
                printf("  ");
                for(i=0;i<16;i++)
                {
                        if(i>3) printf(" ");
                        if(i>12) printf(" ");
                        if(i==13) i++;
                        printf("  %d",i-13);
                }
                printf("\n\n");
                for(j=0;j<20;j++)
                {
                        printf("%c  ",aminotype[j]);
                        for(i=0;i<15;i++) printf("%4.0f ",100*(weight[j][i]));
                        printf("\n");
                }
           printf("\nmethod from: Gunnar von Heijne, NAR 14, p.4683, 1986\n");
        }
        if(argc!=1)
        {
                if((fp=fopen(seqfile,"r"))==NULL) printf("no such file\n");
                if(argc==2)
        printf("\n\nSCORE PROFILE of %s\n\ncleave\tscore\tresidue\n\n",seqfile);
                while((c=getc(fp))!=EOF&&c!=NULL)
                {
                        if(c==' '||c=='\t'||c=='\n') continue;
                        seqbuff[14]=c;
                        for(i=0;i<15;i++)
                         for(j=0;j<20;j++)
                          if(seqbuff[i]==aminotype[j]) score[k] += weight[j][i];
                        if(score[k]>max)
                        {
                                max=score[k];
                                shift=k;
                        }
                        if(k>1)
                        {
                          if(argc>2&&score[k]<0)
                                printf("%d\t0",k-1);
                          else
                          {
                                if(argc>2) printf("%4.2f\t0\n",k-1.25);
                                printf("%d\t%4.2f", k-1, score[k]);
                                if(argc>2) printf("\n%4.2f\t0",k-0.75);
                          }
                        }
                        if(argc==2) printf("\t%c",seqbuff[12]);
                        printf("\n");
                        for(i=0;i<14;i++) seqbuff[i]=seqbuff[i+1];
                        k++;
                }
          if(argc==2)
          {
           printf("\nmax score is %4.2f, cleaved after res %d\n\n",max,shift-1);
                    if(max<0.0) printf("improbable signal sequence\n");
           if(max>=0.0&&max<5.0) printf("possible but ambiguous signal\n");
           if(max>=5.0&&max<10.0) printf("probable signal sequence\n");
           if(max>=10.0) printf("highly probable signal sequence\n");
          }
          else
          {
           printf("%d\t0\t\" \"\n",k-1);
           printf("%d\t%4.2f\t\<res%d,score%4.2f\>\n",shift-1,max,shift-1,max);
           printf("%d\t2.5\tpossible\n",k);
           printf("%d\t7.5\tprobable\n",k);
           printf("%d\t12.5\tdefinite\n",k);
          }
        }
}




More information about the Bio-soft mailing list