Genbank search by mail from unix

Thon de Boer deboer at bio.vu.nl
Mon Sep 9 10:18:03 EST 1991


Don Gilbert writes:
> 
> Does anyone have a Unix shell script for sending search requests to
> the search servers at genbank.bio.net? It would be nice if it
> did things like convert sequence to proper format (maybe w/
> readseq), decide if it is nucleic or protein and prompt user
> for proper set of search libraries.
> 
> Thanks,  Don 
> -- 
> Don Gilbert                                     gilbert at bio.indiana.edu
> biocomputing office, biology dept., indiana univ., bloomington, in 47405
> 
> 

I made a small shell script which can read in a sequence file (DNA Strider,
FASTA or simple format) and decide whether it is DNA or PROTEIN.
It then presents you with a choice of which database(s) to search.

There are two versions of the program.
One uses the FASTA mailserver and is called MAILFASTA,
the other one uses the BLAST mailserver and is called.........
MAILBLAST :-)
The two programs, plus some extras, can be found in the attached
shell script, which also contains a C program that will
be compiled by the shell script.

Remove everything above the "Remove everything above this line" marker
below and save the shell script.
Then type "sh < shell_script_name"
and voila, you have the files.
The shell script contains the following files :
mailfasta : The FASTA mailserver shellscript
mailblast : The BLAST mailserver shellscript
getentry  : An entry retrieval shell script
cid(.c)   : A C program (source) which determines if the file is DNA or AA

It is not programmed very nicely and uses the /tmp directory to store some
temporary files, but I hope you can use this.

Thon de Boer

------ Remove everything above this line ----------
#! /bin/sh
# This is a shell archive.  Remove anything before this line, then unpack
# it by saving it into a file and typing "sh file".  To overwrite existing
# files, type "sh file -c".  You can also feed this as standard input via
# unshar, or by typing "sh <file", e.g..  If this archive is complete, you
# will see the following message at the end:
#		"End of shell archive."
# Contents:  mailfasta cid.c getentry mailblast
# Wrapped by deboer at bio.vu.nl on Mon Sep  9 16:56:13 1991
PATH=/bin:/usr/bin:/usr/ucb ; export PATH
if test -f 'mailfasta' -a "${1}" != "-c" ; then 
  echo shar: Will not clobber existing file \"'mailfasta'\"
else
echo shar: Extracting \"'mailfasta'\" \(7344 characters\)
sed "s/^X//" >'mailfasta' <<'END_OF_FILE'
X#!/bin/csh
X# Change the MAIL variables below, if your routing is different.
X# These settings are for sites connected to the InterNet.
Xset MAIL_GENBANK = SEARCH at GENBANK.BIO.NET
Xset MAIL_EMBL = FASTA at EMBL.BITNET
Xecho ' '
Xecho ' '
Xecho '                                 MailFasta'
Xecho '                                 *********'
Xecho By Thon de Boer
Xecho Department of Microbiological Physiology
Xecho Vrije universiteit AMSTERDAM  Holland.
Xecho This program is in the Public Domain
Xecho 'Send comments to deboer at bio.vu.nl (email).'
Xecho 'Version 1.2     30 July 1990'
Xecho ' '
Xecho Use \'mailfasta -q\' to get the queue from the fasta mail server
Xecho " or 'mailfasta [sequencefile1 sequencefile2 ..]'"
Xecho ' '
Xecho This program will read a sequence file and mail it to
Xecho 'GenBank (or EMBL) were it will be scanned against the sequence databases'
Xecho 'using the FASTA program'
Xecho ' '
Xecho ' '
Xset stop = false
Xwhile ($stop == false)
Xif ($1 == '') then
X  set argument = empty
Xelse
X  set argument = $1
Xendif
Xswitch ($argument)
Xcase -q:
X        echo QUEUE | mail $MAIL_GENBANK
X        echo Queue command send
X        exit
X        breaksw
Xcase -*:
X        echo mailfasta: Unknow flag \'$argument\'
X        echo Use \'mailfasta -q\' to get the queue from the fasta mail server
X        echo " or 'mailfasta [sequencefile1 sequencefile2 ..]'"
X        exit
X        breaksw
Xendsw
Xif ($argument == empty) then
Xset good=false
Xwhile ($good == false)
Xecho Enter the filename wich contains the sequence
Xset file=$<
Xif ($file != '') then
X  if !(-f $file) then
X    echo mailfasta: File \'$file\' does not exist
X  else
X    echo ' '
X    more $file
X    echo ' '
X    echo 'Is this the right file ? ([yes]/no)'
X    set choice = $<
X    switch ($choice)
X    case n*:
X         breaksw
X    default:
X         set good=true
X         breaksw
X    endsw
X  endif
Xendif
Xend
Xelse if !(-f $argument) then
X  echo mailfasta: File \'$argument\' does not exist
X  exit
Xelse
X  set file = $argument
Xendif
Xif {(cid < $file)} then
X      set type = DNA
X    else
X      set type = PROTEIN
X    endif
Xecho ' '
Xecho Now using $type file: $file
Xecho '*********'
Xgrep '>' $file
Xgrep ';' $file
Xecho ' '
Xendif
Xset db1 = (genbank/all genbank/new genpept/all genpept/new embl/all embl/new swiss-prot/all nbrf genbank/primate genbank/rodent genbank/other_mammalian genbank/other_vertebrate genbank/invertebrate genbank/plant genbank/organelle genbank/bacterial)
Xset db2 = (genbank/structural_rna genbank/viral genbank/phage genbank/synthetic genbank/unannotated)
Xset db = ($db1 $db2)
Xecho 'Which database(s) do you want to search ?'
Xecho ' '
Xif ($type == DNA) echo '    1   All GenBank sequences (including new seq since latest release)'
Xif ($type == DNA) echo '    2   The new GenBank entries'
Xif ($type == PROTEIN) echo '    3   All translated protein reading frames from GenBank'
Xif ($type == PROTEIN) echo '    4   The new entries of translated protein reading frames'
Xif ($type == DNA) echo '    5   All EMBL sequences (including the new sequences)'
Xif ($type == DNA) echo '    6   The new EMBL entries'
Xif ($type == PROTEIN) echo '    7   All SWISS-PROT protein entries'
Xif ($type == PROTEIN) echo '    8   All NBRF/PIR protien entries (results return very slow) '
Xecho ' '
Xif ($type == DNA) then
Xecho ' GenBank subdivisions (for faster searching)'
Xecho '    9   The primate sequences            16  The bacterial sequences'
Xecho '    10  The rodent sequences             17  The structural RNA sequences'
Xecho '    11  The other mammalian sequences    18  The viral sequences'
Xecho '    12  The other vertebrate seq         19  The phage sequences'
Xecho '    13  The invertebrate sequences       20  The synthetic sequences'
Xecho '    14  The plant sequences              21  The unannotated sequences'
Xecho '    15  The organelle sequences'
Xecho ' '
Xendif
Xset good=false
Xwhile ($good == false)
X  echo 'Enter the number(s) of your choice (separated by a <SPACE>)'
X  set choice=$<
X  foreach i ($choice)
X  if ($type == DNA) then
X    switch ($i)
X    case 1:
X            set good = true
X            breaksw
X    case 2:
X            set good = true
X            breaksw
X    case 5:
X            set good = true
X            breaksw
X    case 6
X            set good = true
X            breaksw
X    case 9:
X            set good = true
X            breaksw
X    case 10:
X            set good = true
X            breaksw
X    case 11:
X            set good = true
X            breaksw
X    case 12:
X            set good = true
X            breaksw
X    case 13:
X            set good = true
X            breaksw
X    case 14:
X            set good = true
X            breaksw
X    case 15:
X            set good = true
X            breaksw
X    case 16:
X            set good = true
X            breaksw
X    case 17:
X            set good = true
X            breaksw
X    case 18:
X            set good = true
X            breaksw
X    case 19:
X            set good = true
X            breaksw
X    case 20:
X            set good = true
X            breaksw
X    case 21:
X            set good = true
X            breaksw
X    default:
X         echo mailfasta: Invallid choice \'$i\'
X         set good = false
X         breaksw
X    endsw
X  else
X   switch ($i)
X   case 3:
X           set good = true 
X           breaksw
X   case 4:
X           set good = true 
X           breaksw
X   case 7:
X           set good = true 
X           breaksw
X   case 8:
X           set good = true
X           breaksw
X   default:
X         echo mailfasta: Invallid choice \'$i\'
X         set good = false
X         breaksw
X   endsw
X  endif
Xend
Xend
Xset good=false
Xwhile ($good == false)
X  echo ' '
X  echo 'Enter the sensitivity (lower number means more sensitive)'
X  if ($type == DNA) then
X    echo '  (3..6) [4]'
X  else
X    echo '  (1..2) [1]'
X  endif
X  set ktup=$<
X  if ($ktup != '') then
X    if ($type == DNA) then
X      switch ($ktup)
X      case [3-6]:
X         set good=true
X         breaksw
X      default:
X         echo mailfasta: Invalid choice \'$ktup\'
X         breaksw
X    endsw
X    else
X      switch ($ktup)
X      case [1-2]:
X         set good=true
X         breaksw
X      default:
X         echo mailfasta: Invalid choice \'$ktup\'
X         breaksw
X      endsw
X    endif
Xelse
X    set good=true
X  endif
Xend
Xecho ' '
Xecho 'Enter the maximum number of matched sequences [100]'
Xset scores=$<
Xecho ' '
Xecho 'Enter the maximum number of alignments [20]'
Xset align=$<
Xgrep ">" $file > /tmp/mf2$$
Xset grp = `cat /tmp/mf2$$`
Xforeach i ($choice)
X  if ($i != 8) then
X   echo DATALIB $db[$i] > /tmp/mf$$
X   if ($ktup != '') echo KTUP $ktup >> /tmp/mf$$
X   if ($scores != '') echo SCORES $scores >> /tmp/mf$$
X   if ($align != '') echo ALIGNMENTS $align >> /tmp/mf$$
X   echo BEGIN >> /tmp/mf$$
X   if (-z /tmp/mf2$$) echo ">"$file $type file >> /tmp/mf$$
X   grep -v \; $file >> /tmp/mf$$
X   mail $MAIL_GENBANK < /tmp/mf$$
X  else
X   echo LIB nbrf > /tmp/mf$$
X   if ($ktup != '') echo WORD $ktup >> /tmp/mf$$
X   if ($scores != '') echo LIST $scores >> /tmp/mf$$
X   if ($align != '') echo ALIGN $align >> /tmp/mf$$
X   if !(-z /tmp/mf2$$) then
X    echo TITLE $grp >> /tmp/mf$$
X    echo SEQ >> /tmp/mf$$
X    grep -v ">" $file >> /tmp/mf$$
X   else
X    echo TITLE $file $type file >> /tmp/mf$$
X    echo SEQ >> /tmp/mf$$
X    grep -v \; $file >> /tmp/mf$$
X   endif
X  mail $MAIL_E



More information about the Bio-soft mailing list