AA alignments to DNA alignments!?

Bill Pearson wrp at cyclops.micr.Virginia.EDU
Thu Jun 24 08:47:47 EST 1993


>In Message-Id: <9306231808.AA00773 at net.bio.net>
>ahouse at hydra.rose.brandeis.edu (Jeremy John Ahouse) describes a
>problem that I have been complaining about for years.  Since it makes
>so much more biological sense to align amino acids...


>>    I have done a series of multiple alignments.  The alignments
>>were done with inferred amino acid sequences.  Now that I am happy
>>with the alignments I want to go back to the mRNA sequence (which I
>>have) for some of the clustering and parsimony analysis.  I want to
>>enforce the alignments (gaps, etc...) from the aa's on the
>>nucleotide alignment.
>>
>>    I know I can do this by hand, but are there any tools that help with
>>this, that you all know about?

Here is a little program that I call "mrtrans" that will do what you
want.  It expects the aligned protein and cDNA sequences to be in "fasta"
format, e.g.

	>Seqid1 - comment line
	agctagct...
	>Seqid2 - comment line
	agctagct...

This is a unix "shar" file. It is also available for anonymous ftp at
virginia.edu in pub/fasta/mrtrans.shar.

Bill Pearson

================
#! /bin/sh
# This is a shell archive.  Remove anything before this line, then unpack
# it by saving it into a file and typing "sh file".  To overwrite existing
# files, type "sh file -c".  You can also feed this as standard input via
# unshar, or by typing "sh <file", e.g..  If this archive is complete, you
# will see the following message at the end:
#		"End of shell archive."
# Contents:  Makefile aamap.gbl faatran.c mrtrans.1 mrtrans.c
#   uascii.gbl upam.gbl
# Wrapped by wrp at cyclops.micr.Virginia.EDU on Thu Jun 24 09:44:30 1993
PATH=/bin:/usr/bin:/usr/ucb ; export PATH
if test -f 'Makefile' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'Makefile'\"
else
echo shar: Extracting \"'Makefile'\" \(240 characters\)
sed "s/^X//" >'Makefile' <<'END_OF_FILE'
X
XCFLAGS= -O -DBIGMEM
XOBJECTS= mrtrans.o faatran.o
X
Xmrtrans : ${OBJECTS}
X	cc ${CFLAGS} ${OBJECTS} -o mrtrans
X
Xmrtrans.o : mrtrans.c uascii.gbl upam.gbl
X	cc ${CFLAGS} -c mrtrans.c 
X
Xfaatran.o : faatran.c aamap.gbl
X	cc ${CFLAGS} -c faatran.c
X
END_OF_FILE
if test 240 -ne `wc -c <'Makefile'`; then
    echo shar: \"'Makefile'\" unpacked with wrong size!
fi
# end of 'Makefile'
fi
if test -f 'aamap.gbl' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'aamap.gbl'\"
else
echo shar: Extracting \"'aamap.gbl'\" \(422 characters\)
sed "s/^X//" >'aamap.gbl' <<'END_OF_FILE'
X
X/*	aamap.gbl	character and number translations */
X
Xchar aacmap[64]={
X	'K','N','K','N','T','T','T','T','R','S','R','S','I','I','M','I',
X	'Q','H','Q','H','P','P','P','P','R','R','R','R','L','L','L','L',
X	'E','D','E','D','A','A','A','A','G','G','G','G','V','V','V','V',
X	'X','Y','X','Y','S','S','S','S','X','C','W','C','L','F','L','F'
X	};
X
Xint aamap[64];	/* integer aa values */
Xint aamapr[64]; /* reverse sequence map */
X
X
END_OF_FILE
if test 422 -ne `wc -c <'aamap.gbl'`; then
    echo shar: \"'aamap.gbl'\" unpacked with wrong size!
fi
# end of 'aamap.gbl'
fi
if test -f 'faatran.c' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'faatran.c'\"
else
echo shar: Extracting \"'faatran.c'\" \(1287 characters\)
sed "s/^X//" >'faatran.c' <<'END_OF_FILE'
X/*	aatran.c	translates from nt to aa, 1 char codes */
X/*	modified July 2, 1987 for all 6 frames */
X/*	23 Jan 1991	fixed bug for short sequences */
X
X/* 	this mapping is not alphabet independent */
X
X#include "aamap.gbl"
X#define XTERNAL
X#include "upam.gbl"
X#include "uascii.gbl"
X
X/* tnt is used only by aatran.c. It must be consistent with lascii and
Xthe nt alphabet. It uses 3,3 because T and U are considered separately
X*/
Xint tnt[]={0,1,2,3,3,0,1,0,0,1,2,0,0,0,1,0,0};
X
Xaatran(ntseq,aaseq,maxs,frame)
X	char *ntseq, *aaseq;
X	int maxs, frame;
X{
X	int iaa, im, nna;
X	register int *nnp;
X	register char *nts0;
X	register int *aamp;
X	register char *aap;
X
X	iaa=nna=(maxs-frame)/3;
X	if (nna <= 0 ) {
X	  aaseq[0]=EOSEQ;
X	  return 0;
X	}
X
X	nnp = tnt;
X	if (frame < 3) {
X		aamp = aamap;
X		nts0 = &ntseq[frame];
X		aap = aaseq;
X		while (nna--) {
X			im = nnp[*nts0++]<<4;
X			im += nnp[*nts0++]<<2;
X			im += nnp[*nts0++];
X			*aap++ = aamp[im];
X			}
X		}
X	else {
X		aamp = aamapr;
X		nts0 = &ntseq[maxs-(frame-3)];
X		aap = aaseq;
X		while (nna--) {
X			im = nnp[*--nts0]<<4;
X			im += nnp[*--nts0]<<2;
X			im += nnp[*--nts0];
X			*aap++ = aamp[im];
X			}
X		}
X	aaseq[iaa]=EOSEQ;
X	return iaa;
X	}
X
X
Xaainit()
X{
X	int i,j;
X
X	for (i=0; i<64; i++) {
X		aamap[i]=aascii[aacmap[i]];
X		aamapr[i]=aascii[aacmap[(~i)&63]];
X		}
X	}
END_OF_FILE
if test 1287 -ne `wc -c <'faatran.c'`; then
    echo shar: \"'faatran.c'\" unpacked with wrong size!
fi
# end of 'faatran.c'
fi
if test -f 'mrtrans.1' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'mrtrans.1'\"
else
echo shar: Extracting \"'mrtrans.1'\" \(1488 characters\)
sed "s/^X//" >'mrtrans.1' <<'END_OF_FILE'
X.TH MRTRANS 1 local
X.SH NAME
X.B mrtrans
X\- produce align cDNA sequences from aligned protein sequences
X
X.SH SYNOPSIS
X.B mrtrans
Xprotein-sequence-library cDNA-sequence-library > aligned-cDNA-sequences
X
X.SH DESCRIPTION
X.B mrtrans
Xis a simple program that allows you to produce aligned cDNA sequences from
Xaligned protein sequences.  This can be very useful for phylogeny programs,
Xe.g. in PHYLIP \- dnadist, dnapars, dnaml, etc.  In general, it is
Xbetter to use protein sequences for multiple alignments, but to use
XDNA sequences for phylogeny.  This can be time consuming when there
Xare gaps in the aligned protein sequences.
X.PP
X.B mrtrans
Xtakes a protein sequence library and a DNA sequence library.  It reads
Xthe first protein sequence and the first DNA sequence, translates the
XDNA sequence in each of the three frames, compares the protein
Xsequence to the translated DNA sequence to find the protein coding
Xregion, and then writes out the DNA sequence that encoded the protein.
XBoth libraries should be in Pearson/FASTA format. The sequences must
Xbe in the same order in both libraries.  The protein library may
Xinclude '-' characters to specify alignments.  Each '-' character in
Xthe protein library is ignored during the sequence comparison but
Xreplaced by '---' in the DNA sequence output.
X.PP
X.B
Xmrtrans
Xfinds the coding regions for contiguous sequences only.  It will not
Xsplice together different exons to produce a coding sequence.
X.SH AUTHOR
XBill Pearson
X.br
Xwrp at virginia.EDU
END_OF_FILE
if test 1488 -ne `wc -c <'mrtrans.1'`; then
    echo shar: \"'mrtrans.1'\" unpacked with wrong size!
fi
# end of 'mrtrans.1'
fi
if test -f 'mrtrans.c' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'mrtrans.c'\"
else
echo shar: Extracting \"'mrtrans.c'\" \(4152 characters\)
sed "s/^X//" >'mrtrans.c' <<'END_OF_FILE'
X/* mrtrans.c	copyright (c) 1992 William R. Pearson
X
X   mrtrans takes a file of non-interleaved aligned protein sequences
X   and the cDNA sequences from which they were derived and constructs
X   the multiply aligned DNA sequences
X
X   usage: mrtrans aligned-protein-file cDNA-sequence-file > aligned-cdna file
X*/
X
X#include <stdio.h>
X#include <stdlib.h>
X#include <string.h>
X
X#ifdef BIGMEM
X#define MAXAA 10000
X#define MAXNT 40000	/* MAXNT > 3*MAXAA to allow gaps */
X#else
X#define MAXAA 1000
X#define MAXNT 4000	/* MAXNT > 3*MAXAA to allow gaps */
X#endif
X
X#include "uascii.gbl"
X#include "upam.gbl"
X
Xmain(argc,argv)
X  int argc; char **argv;
X{
X  char pfname[80], dfname[80], ofname[80];
X  char *bp, *aa0, *nt0, *nt1;
X  int na, nn, no, n, i;
X  FILE *pfile, *dfile;
X  char pstring[11], dstring[11];
X
X  if ((aa0=calloc((size_t)MAXAA,sizeof(char)))==NULL) {
X   fprintf(stderr," cannot allocate sequence memory: %d\n", MAXAA);
X    exit(1);
X  }
X
X  if ((nt0=calloc((size_t)3*MAXNT,sizeof(char)))==NULL) {
X   fprintf(stderr," cannot allocate sequence memory: %d\n", 3*MAXNT);
X    exit(1);
X  }
X
X  if ((nt1=calloc((size_t)MAXNT,sizeof(char)))==NULL) {
X   fprintf(stderr," cannot allocate sequence memory: %d\n", MAXNT);
X    exit(1);
X  }
X
X  aainit();
X
X  if (argc < 3) {
X    fprintf(stderr,"enter aligned protein file name:");
X    fgets(pfname,sizeof(pfname),stdin);
X    if ((bp=strchr(pfname,'\n'))!=NULL) *bp='\0';
X
X    fprintf(stderr,"enter cDNA file name:");
X    fgets(dfname,sizeof(dfname),stdin);
X    if ((bp=strchr(dfname,'\n'))!=NULL) *bp='\0';
X  }
X  else {
X    strncpy(pfname,argv[1],sizeof(pfname));
X    strncpy(dfname,argv[2],sizeof(dfname));
X  }
X
X  fprintf(stderr," constructing DNA sequence alignment from %s using %s\n",
X	  pfname,dfname);
X
X  if ((pfile=fopen(pfname,"r"))==NULL) {
X    fprintf(stderr," cannot open %s\n",pfname);
X    exit(1);
X  }
X
X  if ((dfile=fopen(dfname,"r"))==NULL) {
X    fprintf(stderr," cannot open %s\n",dfname);
X    exit(1);
X  }
X
X  while ((na=getlib(aa0,MAXAA,pstring,aascii,pfile))>0) {
X
X    if ((nn=getlib(nt0,3*MAXNT,dstring,nascii,dfile))<= 0) {
X      fprintf(stderr,"cannot read %s from %s\n",pstring,dfname);
X      exit(1);
X    }
X
X    if (strncmp(pstring,dstring,10)!=0) 
X      fprintf(stderr," warning: %s != %s\n",pstring,dstring);
X
X    if ((no = makealign(aa0,na,nt0,nn,nt1,MAXNT))<=0) {
X      fprintf(stderr," could not translate %s from %s\n",pstring,dstring);
X      continue;
X    }
X
X    fprintf(stdout,">%s from %s\n",pstring,dstring);
X    for (n=0; n<no; n++) {
X      fputc(nt1[n],stdout);
X      if (n%60==59) fputc('\n',stdout);
X    }
X    if (no%60 != 0) fputc('\n',stdout);
X  }
X}
X
Xint getlib(seq,maxseq,desc,map,fd)
X     char *seq;
X     int maxseq;
X     char *desc;
X     int *map;
X     FILE *fd;
X{
X  char dline[512], *bp, *sptr;
X  int ch;
X
X  if (feof(fd)) return -1;
X  while(fgets(dline,sizeof(dline),fd))
X    if (dline[0] == '>') break;
X  strncpy(desc,dline,9);
X  desc[10]='\0';
X
X  if ((bp=strchr(desc,'\n'))!=NULL) *bp='\0';
X  if ((bp=strchr(desc,' '))!=NULL) *bp='\0';
X
X  sptr=seq;
X  while ((ch=fgetc(fd))!=EOF && ch!='>' && (int)(sptr-seq)<maxseq) {
X    if (ch=='-') *sptr++ = '-';
X    else if ((ch=map[ch])<NA) *sptr++ = ch;
X  }
X 
X  ungetc(ch,fd);
X
X  return (int)(sptr-seq);
X}
X
Xint makealign(aa,na,nti,nn,out,maxout)
X     char *aa, *nti, *out;
X    



More information about the Bio-soft mailing list