Suggestions for SRS use w/ nonsequence gene data

Don Gilbert gilbertd at sunflower.bio.indiana.edu
Thu Aug 17 08:16:14 EST 1995


Here are some suggestions for SRS that have arisen from trying to use
it with Drosophila genome data:

1) indexer interface: needs to permit indexing of any character/symbol set.
   Drosophila genes use just about the full ASCII printable symbol set
   (and would use more if possible).
   
   It would also be nice to allow adding data filter functions to the indexer
   that would convert data in various computer formats into data suitable for
   indexing -- e.g., convert special codes like '&bgr;-tubulin' into English
   equivalents like 'beta-tubulin' for indexing.
   
   It would help if the data parsing language for the indexer were not
   so difficult to write accurately.  Unless I spend a lot of
   time testing a new parser, I can't be confident that the indexer
   is getting everything it should.  The recent example of missing
   2,000 of 9,000 entries in the flygene data due to the symbol "\" in 
   gene names makes this point.
   
   
2) query interface: needs to permit any character/symbol set to be valid
   data in the query, and query symbols should be configurable. 
    
   Use of words instead of symbols as query operators
   should be optional at least, and by my preference they would be default.
   E.g., a query like this should be possible:  
   
      databank1  fieldA  some/*![-]()=+messy&^%!%@*#string  
      and
      databank2  fieldB  another%^#*&@P)!Q(@string
      but not
      databank3  fieldC  more#*@(#P*#strings
      

3) output interface:  needs to allow addition of post-processor functions
   to convert data to various human-usable formats.  This is done now in
   part for sequence data and for adding html links, but not in a
   general way that would allow adding output formatting per database
   w/o rewriting the basic SRS code.

   Here is roughly how I did it for flybase data, but it is a hack,
   not a general solution.  The example outputs below show this formatted
   output from the iubio server, versus the computer "star code" output
   from the sanger server.


lynx 'http://iubio.bio.indiana.edu:81/srs/srsc?[FLYGENE-acc:FBgn0003890]'

Gene symbol                  : betaTub97EF
Last update                  : 11 Jul 95
Synonym(s)                   : beta-Tub97EF
                             : beta1t
                             : beta4t
                             : betaTub4
                             : B4t
FlyBase gene id number       : FBgn0003890
Full name                    : betaTubulin97EF
Genetic map position         : 3-[92]
Cytological map position     :
    Located in 97E-F by in situ hybridization (Natzle and McCarthy,
    1984).
Function(s) of product       : beta-tubulin
                             : tubulin
D. mel. DNA/RNA AC no(s)     : X69560
                             : M20419
Phenotypic information       :
    Tubulins are the main
    structural components of microtubules in mitotic and
    meiotic spindles, cilia, flagella, neural processes
    and the cytoskeleton; nontubulin proteins (MAPS or
    microtubule-associated proteins) are involved along
    with tubulins in the formation of specialized
    microtubules (Theurkauf, Baum, Bo and Wensink, 1986; Rudolph,
    Kimble, Hoyle, Subler and Raff, 1987).
    Tubulin proteins are found in a wide variety of
    species from unicellular organisms to man; their
    biochemical and molecular structure is highly
    conserved. The alpha- and beta-subunits from different
 ...
==============================

lynx 'http://www.sanger.ac.uk/srs/srsc?[FLYGENE-acc:FBgn0003890]'

*a &bgr;Tub97EF
*H Last updated 11 Jul 95
*i &bgr;-Tub97EF
*i &bgr;1t
*i &bgr;4t
*i &bgr;Tub4
*i B4t
*z FBgn0003890
*e &bgr;Tubulin97EF
*b 3-[92]
*c Located in 97E-F by in situ hybridization (Natzle and McCarthy, 1984).
*d &bgr;-tubulin
*d tubulin
*g X69560
*g M20419
*p Tubulins are the main
*p structural components of microtubules in mitotic and
*p meiotic spindles, cilia, flagella, neural processes
*p and the cytoskeleton; nontubulin proteins (MAPS or
*p microtubule-associated proteins) are involved along
*p with tubulins in the formation of specialized
*p microtubules (Theurkauf, Baum, Bo and Wensink, 1986; Rudolph,
*p Kimble, Hoyle, Subler and Raff, 1987).
*p Tubulin proteins are found in a wide variety of
*p species from unicellular organisms to man; their
*p biochemical and molecular structure is highly
*p conserved. The &agr;- and &bgr;-subunits from different
...
==========================



Changes needed to add output formatting function for a given
database, in srs/src/srswww.c -------------


/* Flag selecting FlyBase report formatting.  WwwPrintSet assigns it for each
   entry it prints, and WwwPrintField reads it to choose between running the
   field text through fbcode2report and the stock SRS output path. */
Boolean gDoflyb = FALSE; /* dgg */
/* Value passed as the `outformat` argument of fbcode2report by this file.
   NOTE(review): the number 3 presumably matches a format code defined in
   fbgenereport.c -- confirm against that source. */
enum { kPlainText = 3 };
/* Reformats the text in inbuf for output.  Callers in this file treat a
   NULL return as "nothing to print" and free() any non-NULL result.
   NOTE(review): the meaning of `state` is not visible here; both call
   sites pass 1. */
char* fbcode2report( char* inbuf, short outformat, short state);

/* fbcode2report code is from fbgenereport.c available in 
  portable flybase server source at
  http://flybase.bio.indiana.edu:82/1/work/Portable-server/source/flyreports/
*/
      
/* Excerpt of WwwPrintSet showing only the lines touched by the output
   formatting patch; code elided from the original function is marked "...".
   For each entry k of the set that can be opened, either its link table or
   the entry itself is printed.  The patch sets gDoflyb just before
   WwwEntryPrint so that later field printing knows whether to apply the
   FlyBase formatter. */
INT4 WwwPrintSet (char *setName, int firstN, int printN, SCRIPTo *script)
{
...
  for (k=firstN;  k <= lastN && k <= setEntryN;  k++) {
    SetGetID (set, k, &id); 
    entry = EntryOpen (&id);
    if (entry) { 
      if (ParGetNum ("printLinkTable"))
        WwwPrintEntryLinks (entry, script);
      else {
/* --- add for output formatting - dgg --- */    
        /* dgg hack to test formatting calls */
        /* TRUE only when the id's name is exactly "FlybEntry-ID"; the
           databank is hard-coded here rather than configured. */
        gDoflyb= (0== strcmp(id.id_d->nam, "FlybEntry-ID"));
/* ^^^ add for output formatting - dgg ^^^ */    
        WwwEntryPrint (entry, script);
        }
      /* Every successfully opened entry is closed again. */
      EntryClose (&entry);
      }
}


/* Output hook that WwwPrintField installs under the name "printf" while a
   FlyBase entry field is being printed.  The incoming text is passed through
   fbcode2report (plain-text format) and the result is appended to the global
   gBuff.  A NULL result from fbcode2report appends nothing.
   NOTE(review): the append is not bounds-checked against the size of gBuff. */
static void FlybPrintBuff (char *ln)
{
  char *report = fbcode2report (ln, kPlainText, 1);

  if (report == NULL)
    return;

  strcat (gBuff, report);
  free (report);   /* the returned buffer is released by this caller */
}

/* Print the current field of `entry`.
 *
 * entry      - entry whose current field (entry->field) is printed.
 * doPrintAll - when non-zero the field is printed unconditionally; otherwise
 *              only if LibIsField reports it as "active".
 *
 * Three output paths exist:
 *   1. "doInsertHyperLinks" is set and a hyperlink definition matches the
 *      field: the field text is captured in gBuff via a temporary "printf"
 *      hook, run through the hyperlink parser, and then emitted (through
 *      fbcode2report when gDoflyb is set, else through WwwPrintF).
 *   2. No hyperlink matched and gDoflyb is set: the field text is captured
 *      through FlybPrintBuff and written with printf.
 *   3. Otherwise the field is printed directly by EntryFieldPrint.
 *
 * Fix relative to the original: the saved "printf" hook is now restored on
 * the hyperlink path even when EntryFieldPrint fails.  Previously that path
 * returned with WwwPrintBuff still installed.
 */
static void WwwPrintField (ENTRYo *entry, INT4 doPrintAll)
{
  static PRSoST *tokList=NULL;
  LIBoHYPERLINK *hLink;
  INT4           context, (*printSave)(), lineNSave;

#ifdef MAC
  if (gBuff == NULL) gBuff = (char*) malloc( (WWWxMAXLINESIZE+1) * sizeof(char));
#endif

  /* The token list is created on first use and reset on every later call. */
  if (!tokList)
    PrsIniSym (&tokList, 50, 500);
  else
    PrsResetSym (tokList);

  /* Remember the file counter so we can tell below whether printing
     advanced it. */
  lineNSave = entry->file[0]->n;

  if (doPrintAll || LibIsField (entry->field, "active")) {
    if (ParGetNum ("doInsertHyperLinks")) {
      for (context=0; (hLink = LibNextHyperLink (&context));) {
        if (entry->field == hLink->field) {
          gBuff[0] = '\0';  /* reset global buffer */
          printSave = ParGetFunction ("printf");
          ParDefFunction ("printf", (INT4(*)()) WwwPrintBuff);

          if (EntryFieldPrint (entry, 1)) {
            entryCurr = entry; /* set global entry */
            /* The parser runs while the capture hook is still installed. */
            PrsString (gBuff, hLink->parser, tokList, hLink->parse, NULL);
            ParDefFunction ("printf", printSave);

            /* --- add for output formatting - dgg --- */
            if (gDoflyb) {
              char* newbuf= fbcode2report( gBuff, kPlainText, 1);
              if (newbuf) {
                printf ("%s", newbuf);
                free(newbuf);
              }
            }
            else
            /* ^^^ add for output formatting - dgg ^^^ */
              WwwPrintF (gBuff);
          }
          else {
            /* Restore the caller's "printf" hook on the failure path too,
               so later output is not left redirected into gBuff. */
            ParDefFunction ("printf", printSave);
            FilURead (entry->file[0]);
          }
          return;
        }
      }
    }

    /* --- add for output formatting - dgg --- */
    if (gDoflyb) {
      gBuff[0] = '\0';
      printSave = ParGetFunction ("printf");
      ParDefFunction ("printf", (INT4(*)()) FlybPrintBuff);
      EntryFieldPrint (entry,1);
      printf ("%s", gBuff); /*  WwwPrintF (gBuff);*/
      ParDefFunction ("printf", printSave);
    }
    else
    /* ^^^ add for output formatting - dgg ^^^ */
      EntryFieldPrint (entry,1);
  }

  /* If nothing above advanced the file counter, read once so the caller
     still moves forward.  NOTE(review): assumes file[0]->n counts lines
     consumed -- confirm against the FilU* API. */
  if (lineNSave == entry->file[0]->n)
    FilURead (entry->file[0]);

  return;
}

--
-- d.gilbert--biocomputing--indiana u--bloomington--gilbertd at bio.indiana.edu




More information about the Bio-srs mailing list