hash function for 64 codons

Andrew Rambaut andrew.rambaut at zoo.ox.ac.uk_remove_this_
Mon Jun 18 03:55:33 EST 2001


in article 50f9e4f.0106130946.29302695 at posting.google.com, Bin Li at
bli at ualberta.ca wrote on 14/6/01 9:58 AM:

> I am new in computational biology. I am writing a simple program to translate
> a DNA sequence to encoded amino acid sequence. I guess there is probably a
> good hash function out there that hashes the 64 codons, which I assume is the
> fastest way to do translation. Anybody knows about it?

I use a combination of look-up tables and hashing:

//: This table maps 7-bit ASCII nucleotide characters to state codes (0-4).
// Valid nucleotides are AaCcGgTtUu, mapped to 2211330000
// (i.e. T/U = 0, C = 1, A = 2, G = 3 - the same T, C, A, G order used by
// the genetic code tables).
// All other characters are mapped to 4.
// The table has exactly 128 entries, so it must only be indexed with
// values in the range 0-127.
const int kNucleotideNumbers[128]={
     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,   // 0-15
     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,   // 16-31
     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,   // 32-47
     4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,   // 48-63
     4, 2, 4, 1, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4,   // 64-79
     4, 4, 4, 4, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,   // 80-95
     4, 2, 4, 1, 4, 4, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4,   // 96-111
     4, 4, 4, 4, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4    // 112-127
};

//: Identifiers for the supported genetic codes.
// Every enumerator from kUniversalCode onwards is the row index of the
// matching translation string in kCodeTable; the trailing comments give
// the label that row carries there.
enum GeneticCode {
    kNonCoding = -1,          // below the first row: has no entry in kCodeTable
    kUniversalCode = 0,       // Universal
    kVertebrateMtCode,        // Vertebrate Mitochondrial
    kYeastCode,               // Yeast
    kMoldProtozoanMtCode,     // Mold Protozoan Mitochondrial
    kMycoplasmaCode,          // Mycoplasma
    kInvertebrateMtCode,      // Invertebrate Mitochondrial
    kCiliateCode,             // Ciliate
    kEchinodermMtCode,        // Echinoderm Mitochondrial
    kEuplotidNucCode,         // Euplotid Nuclear
    kBacterialCode,           // Bacterial
    kAltYeastCode,            // Alternative Yeast
    kAscidianMtCode,          // Ascidian Mitochondrial
    kFlatwormMtCode,          // Flatworm Mitochondrial
    kBlepharismaNucCode,      // Blepharisma Nuclear
    numGeneticCodes           // number of codes above (kNonCoding not counted)
};

//: The genetic code tables, one row per GeneticCode value from
// kUniversalCode up to (but not including) numGeneticCodes, in enum order.
// Nucleotides go T, C, A, G - the order given by the Genbank web site
// With the first codon position most significant (i.e. TTT, TTC, TTA, TTG,
// TCT, etc.), so with T=0, C=1, A=2, G=3 the column for a codon is
// 16*first + 4*second + third.
// Each row holds 64 one-letter amino acid codes ('*' marks a stop codon)
// followed by the string's terminating NUL, hence the row length of 65.
const char kCodeTable[numGeneticCodes][65]={
    // Universal
    "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
    // Vertebrate Mitochondrial
    "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",
    // Yeast
    "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
    // Mold Protozoan Mitochondrial
    "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
    // Mycoplasma
    "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
    // Invertebrate Mitochondrial
    "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG",
    // Ciliate
    "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
    // Echinoderm Mitochondrial
    "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
    // Euplotid Nuclear
    "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
    // Bacterial
    "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
    // Alternative Yeast
    "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
    // Ascidian Mitochondrial
    "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG",
    // Flatworm Mitochondrial
    "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
    // Blepharisma Nuclear
    "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG"
};


// Maps one nucleotide character to its state code: 0-3 for T/U, C, A, G
// (either case) or 4 for anything else. Characters outside the 128-entry
// kNucleotideNumbers table (including negative values where plain char is
// signed) are reported as 4 instead of being used as an index.
static int nucleotideState(char nucleotide)
{
    const unsigned char index = static_cast<unsigned char>(nucleotide);
    return (index < 128) ? kNucleotideNumbers[index] : 4;
}

// This function takes the first, second and third nucleotides in the
// codon and returns the one-letter amino acid code for it under the
// current genetic code (`geneticCode`, defined outside this function).
// Despite its name it returns the translated residue, not the codon index.
//
// Returns 'X' (the IUPAC symbol for an unknown residue) instead of reading
// out of bounds when:
//   - any of the three characters is not one of AaCcGgTtUu, or
//   - geneticCode does not select a row of kCodeTable (e.g. kNonCoding).
char getCodonNumber(char first, char second, char third)
{
    const char kUnknownAminoAcid = 'X';

    // Reject a genetic code that has no row in kCodeTable.
    const int tableCount =
        static_cast<int>(sizeof(kCodeTable) / sizeof(kCodeTable[0]));
    const int table = static_cast<int>(geneticCode);
    if (table < 0 || table >= tableCount)
        return kUnknownAminoAcid;

    const int firstState  = nucleotideState(first);
    const int secondState = nucleotideState(second);
    const int thirdState  = nucleotideState(third);

    // State 4 means "not a nucleotide"; letting it through would produce
    // a codon number of up to 84, beyond the 64 codons stored in each row.
    if (firstState > 3 || secondState > 3 || thirdState > 3)
        return kUnknownAminoAcid;

    // Two bits per position, first position most significant (0-63).
    const int codonNumber = (firstState << 4) + (secondState << 2) + thirdState;

    return kCodeTable[table][codonNumber];
}



-- 
===================================================================
  Andrew Rambaut,             EMAIL - andrew.rambaut at zoo.ox.ac.uk
  Zoology Department,           WWW - http://evolve.zoo.ox.ac.uk/
  University of Oxford,         TEL - +44 1865 271261
  South Parks Road, Oxford, UK  FAX - +44 1865 271249
===================================================================





More information about the Bio-soft mailing list