question about SRS5.0

Tao Jiang jiangt at pku.edu.cn
Wed Dec 11 20:54:42 EST 1996


I am not sure if I am missing something here,
but I always have problems with SRS5.0.

I want to test with only phg.dat of EMBL,
which is a very small data file. With SRS4.08,
indexing takes only 1 min., while with SRS5.0
it takes so long that I cannot wait for it to finish.
I modified the files icarus.c, embl.i and embl.is just
as Thure suggested, compiled icarus.c and ran 'srssection'
and 'srsupdate', but it still hangs (or maybe it would end
after a very long time?).

I attach my embl.i, embl.is at the end, and any suggestions
are greatly appreciated.

-- 
Tao Jiang            Fax: 86-10-62751982
Network Center       Homepage: http://jiangt.pku.edu.cn/~jiangt/
Peking University    Email: jiangt at pku.edu.cn
-------------- next part --------------
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#
#    $RCSfile: embl.i,v $
#    $Revision: 1.11 $
#    $Date: 1996/12/06 22:17:37 $
#    $Author: etzold $
#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

#EMBLNEW_DB:$library:[EMBLNEW group:@SEQUENCE_LIBS 
#  format:@EMBL_FORMAT cachesize:2048 maxNameLen:15 
#  subentries:@EmblnewFeatures_DB
#  files:$file:emnew
#]

# EMBL library definition.
# Binds the library to its group (@SEQUENCE_LIBS), its entry format
# (@EMBL_FORMAT) and its feature subentry library (@EmblFeatures_DB), and
# lists the data files that belong to it.
# Only $file:em_ph is enabled in the files list; every other file entry is
# commented out.
EMBL_DB:$library:[EMBL group:@SEQUENCE_LIBS 
  partSize:1000000
  comment:"  Nucleotide Sequence Database from EMBL"
  subentries:@EmblFeatures_DB
  format:@EMBL_FORMAT 
  cachesize:2048 
  maxNameLen:30 files:{
#    $file:est1
#    $file:est2
#    $file:est3
#    $file:est4
#    $file:est5
#    $file:est6
#    $file:est7
#    $file:fun
#    $file:gss
#    $file:hum1
#    $file:hum2
#    $file:inv
#    $file:mam
#    $file:org
#    $file:patent
#    $file:phg
#    $file:pln
#    $file:pro
#    $file:rod
#    $file:sts
#    $file:syn
#    $file:unc
#    $file:vrl
#    $file:vrt 


#    $file:em_est1
#    $file:em_est2
#    $file:em_est3
#    $file:em_est4
#    $file:em_est5
#    $file:em_est6
#    $file:em_est7
#    $file:em_ba
#    $file:em_fun
#    $file:em_gss
#    $file:em_hum1
#    $file:em_hum2
#    $file:em_in
#    $file:em_om
#    $file:em_or
#    $file:em_ov
#    $file:em_pat
    $file:em_ph
#    $file:em_pl
#    $file:em_pr
#    $file:em_ro
#    $file:em_sts
#    $file:em_sy
#    $file:em_un
#    $file:em_vi
  }
]

# Entry format of the EMBL library.
# Uses the @EMBL_SYNTAX production rules and the original flat-file layout
# (@DAT_FILE plus @SEQ_FILE); the GCG variants are kept as commented-out
# alternatives.
# Each $field names a data field and, where given, the token code it is read
# from (code:), its index type and index token (index:, indexToken:) and the
# token and alignment used for table output (tableToken:, tableFormat:).
# The fields after the Feature Table Fields header all read code ft and carry
# indexId:@SUBENTRY_ID.
# NOTE(review): the indexToken/tableToken values appear to match token names
# produced by the rules in embl.is (id, acc, div, mol, ...) - confirm.
EMBL_FORMAT:$libformat:[syntax:@EMBL_SYNTAX 
  fileType:{@DAT_FILE @SEQ_FILE} #orig format
#  fileType:{@GCGREF_FILE @GCGSEQ_FILE} #GCG format
  fields:{
    $field:[@DF_ID code:id index:id indexToken:id]
    $field:@DF_ALL
    $field:[@DF_Accession code:acc index:str indexToken:acc 
      tableToken:acc tableFormat:left]
    $field:[@DF_Division code:id index:str indexToken:div
      tableToken:div tableFormat:left]
    $field:[@DF_Molecule code:id index:str indexToken:mol
      tableToken:mol tableFormat:left]
#    $field:[@DF_DBOrigin code:acc index:str indexToken:dbOri
#      tableToken:dbOri tableFormat:left]
#    $field:[@DF_AccessionKey code:acc index:str indexToken:accKey
#      tableToken:accKey tableFormat:center]
    $field:[@DF_Description code:des index:str indexToken:des
      tableToken:t_des tableFormat:left]
    $field:[@DF_Keywords code:key index:str indexToken:key
      tableToken:key tableFormat:left]
    $field:[@DF_Organism code:org index:str indexToken:org]
    $field:[@DF_Authors code:ra index:str indexToken:authors
      tableToken:authors tableFormat:left]
    $field:[@DF_Date code:date index:int indexToken:date 
      tableToken:date tableFormat:center]
    $field:[@DF_SeqLength code:sq  index:int indexToken:seqLen 
      tableToken:seqLen tableFormat:right]
    $field:[@DF_LINK code:dr]
#    $field:[@DF_DNASequence token:gcgseq format:embl] #GCG format
    $field:[@DF_DNASequence token:sequence format:embl] #orig format

    $field:[@DF_HeaderField name:'Feature Table Fields']
    $field:[@DF_FtKey code:ft index:str indexToken:ftKey 
      indexId:@SUBENTRY_ID tableToken:ftKey tableFormat:left]
    $field:[@DF_FtQualifier code:ft index:str indexToken:ftQual
      indexId:@SUBENTRY_ID]
    $field:[@DF_PID code:ft index:str indexToken:pid
      indexId:@SUBENTRY_ID]
    $field:[@DF_FtDescription code:ft index:str indexToken:ftDes 
      indexId:@SUBENTRY_ID]
    $field:[@DF_FtSource code:ft index:str indexToken:ftSrc
      indexId:@SUBENTRY_ID]
    $field:[@DF_ChrsNo code:ft index:str indexToken:chrsNo 
      indexId:@SUBENTRY_ID]
    $field:[@DF_FtMap code:ft index:str indexToken:map
      indexId:@SUBENTRY_ID]
  }
]


# Syntax objects.
# EMBL_SYNTAX loads its production rules from SRSDB:embl.is; the second,
# unlabelled object (name:ftseq) loads SRSDB:ftseq.is.
# NOTE(review): ignore: presumably lists characters skipped between tokens
# (blank and tab here, plus newline for ftseq) - confirm against the SRS docs.
EMBL_SYNTAX:$syntax:[file:"SRSDB:embl.is" ignore:" \t"]
$syntax:[name:ftseq file:"SRSDB:ftseq.is" ignore:" \t\n"]

#$link:[@EMBL_DB to:@?GENBANK_DB   toField:@DF_ACCNO]
#$link:[@EMBL_DB to:@?PIR_DB       toField:@DF_ACCNO]
#$link:[@EMBL_DB to:@?REBASE_DB    toField:@DF_ID]
#$link:[@EMBL_DB to:@?OMIM_DB      toField:@DF_ID]
#$link:[@EMBL_DB to:@?MEDLINE_DB   toField:@DF_ID]


# Feature subentry libraries; both share @EmblFeature_Format.
# EmblFeatures_DB is the subentries library of EMBL_DB. EmblnewFeatures_DB is
# only referenced by the EMBLNEW_DB definition, which is commented out.
EmblFeatures_DB:$library:[EMBL_features format:@EmblFeature_Format]
EmblnewFeatures_DB:$library:[EMBLNEW_features format:@EmblFeature_Format]

# Format of the feature subentry libraries (EmblFeatures_DB and
# EmblnewFeatures_DB). It reuses @EMBL_SYNTAX, the same syntax as the parent
# entry format; the idType attribute is commented out.
# The id comes from token ftId, the location from token ftLocat, and all
# other fields from token ft. Most tableToken values have the form
# 't_ft|<name>'; DF_FtSource uses the plain token ftSrc instead.
# NOTE(review): the '|<name>' suffix presumably selects one named output of
# the t_ft rule (key, qual, or a qualifier name) - confirm.
EmblFeature_Format:$libformat:[syntax:@EMBL_SYNTAX  #idType:@SUBENTRY_ID
  tableFormat:left
  fields:{
    $field:[@DF_ID token:ftId]
#    $field:[@DF_Accession code:acc fromParent:y]
#    $field:[@DF_Description code:des fromParent:y]

    $field:[@DF_FtKey token:ft tableToken:'t_ft|key']
    $field:[@DF_FtQualifier token:ft tableToken:'t_ft|qual']
    $field:[@DF_PID token:ft tableToken:'t_ft|db_xref']
    $field:[@DF_FtDescription token:ft tableToken:'t_ft|note']
    $field:[@DF_FtGene token:ft tableToken:'t_ft|gene']
    $field:[@DF_FtProduct token:ft tableToken:'t_ft|product']
    $field:[@DF_FtPartial token:ft tableToken:'t_ft|partial']
    $field:[@DF_FtPseudo token:ft tableToken:'t_ft|pseudo']
    $field:[@DF_FtNumber token:ft tableToken:'t_ft|number']
    $field:[@DF_FtSource token:ft tableToken:ftSrc]
    $field:[@DF_ChrsNo token:ft tableToken:'t_ft|chromosome'] 
    $field:[@DF_FtMap token:ft tableToken:'t_ft|map']
    $field:[@DF_DNALocation token:ftLocat tableFormat:listing]
  }
]

# Field objects for the feature qualifiers used by EmblFeature_Format.
# Each one gives the field a name and a three-letter short: abbreviation.
DF_FtGene:$srsfield:[Gene short:gen]
DF_FtProduct:$srsfield:[Product short:pro]
DF_FtPartial:$srsfield:[Partial short:par]
DF_FtPseudo:$srsfield:[Pseudo short:pse]
DF_FtNumber:$srsfield:[Number short:num]
-------------- next part --------------
#!/bin/env icarus
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#    $RCSfile: embl.is,v $
#    $Revision: 1.10 $
#    $Date: 1996/12/06 22:17:38 $
#    $Author: etzold $
#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# Production rules for parsing EMBL flat-file entries.
#
# Layout of this rule set:
#   entry              - cuts one entry out of the text file stream
#   fields and helpers - split an entry into tokens per line type (id, acc, ...)
#   gcgseq, sequence   - build the sequence object from the seq file stream
#   i_*, des, org, key, ft*, chrsNo, map, pid - productions used for indexing
#   ftClean .. qualval, ftCleanIter, ftLocat  - feature display and tables
#   t_*, h_*           - table printing and HTML link rewriting
#   word .. dbName     - shared lexical helpers
#
# entry: the leading skip loop negates its match with $Not, spelled exactly
# like every other negated match in this rule set, so that lines which do
# not start with ID are skipped before the entry itself begins.
$rules={
  entry:     ~ {$In:[file:text] $Out  pre $Skip:0}
               ('ID  ' {$Not} ln)*
               ('ID   '{$entryFip=$Fip $Wrt} ln {$App} 
                (/(  |ID|>>)/ {$Not} ln {$App})+ )? ~


  # data fields
  # fields reads the entry token and lists the expected order of line groups;
  # each helper below writes its lines under the code given to $Wrt.
  fields:    ~ {$In:entry $Out $Skip:1}
               id sep acc sep nid sep date sep f_des sep f_key sep 
               (src tax? sep f_org? sep)+ (rn rc? rp? rx? ra rt? rl? sep)* 
               link? sep (cmnt sep)* (fh f_ft*)? sep sq  
             ~
  sep:       ~ {$Wrt:sep} ('XX' ln)* ~ 
  # id also stores the entry name (first capture group) in $entryName
  id:        ~ {$Wrt:id}       'ID' / *([A-Z0-9_]+)/
               {$entryName=$1} ln ~
  acc:       ~ {$Wrt:acc}      ('AC' ln)+ ~
  nid:       ~ {$Wrt:nid}      ('NI' ln)* ~
  date:      ~ {$Wrt:date}     ('DT' ln)+ ~
  f_des:     ~ {$Wrt:des}      ('DE' ln)+ ~
  # src, tax and f_org all write under the same code (org)
  src:       ~ {$Wrt:org}      ('OS' ln)+ ~  
  tax:       ~ {$Wrt:org}      ('OC' ln)+ ~  
  f_org:     ~ {$Wrt:org}      ('OG' ln)+ ~  
  rn:        ~ {$Wrt:rn}       ('RN' ln)+ ~
  rc:        ~ {$Wrt:rc}       ('RC' ln)+ ~
  rp:        ~ {$Wrt:rp}       ('RP' ln)+ ~
  rx:        ~ {$Wrt:rx}       ('RX' ln)+ ~
  ra:        ~ {$Wrt:ra}       ('RA' ln)+ ~
  rt:        ~ {$Wrt:rt}       ('RT' ln)+ ~
  rl:        ~ {$Wrt:rl}       ('RL' ln)+ ~
  cmnt:      ~ {$Wrt:cmnt}     ('CC' ln)+ ~
  link:      ~ {$Wrt:link}     ('DR' ln)+ ~
  f_key:     ~ {$Wrt:key}      ('KW' ln)* ~
  fh:        ~ {$Wrt:fh}       ('FH' ln)+ ~
  # one f_ft match is one feature: an FT line plus its continuation lines
  f_ft:      ~ {$Wrt:ft}       'FT' ln ('FT     ' ln)* ~
  sq:        ~ {$Wrt:sq}       'SQ' ln ~


  # parsing the sequence part from separate stream
  # gcgseq handles the GCG layout (2BIT or ASCII blocks behind a >>>> header);
  # sequence handles the original layout and shares its stream with text.
  gcgseq:    ~ { $In:[file:seq] $Out pre $s=$SeqNew
                 $SeqMake:$s $Wrt:[s:$entryName]
               }
               '>>>>' {$seqFip=$Fip} (/[A-Z0-9]+/ seq |  
               /[A-Z0-9]+_0/ seq (/>>>>[A-Z0-9]+_[1-9]+/ 
               {$SeqTrunc:[$s len:10000]} seq)+) ~ 
  seq:       ~ (/.*2BIT *Len:/ /[0-9]+/ {$len=$Ct} ln ln 
	       {$SeqGet2Bit:[$s file:$File len:$len]} ('>>>>'{$Not} ln)*|
               /.*ASCII/ ln ln ('>>>>' {$Not} /.*/ {$SeqApp:[$s s:$Ct]})+) ~

  sequence:  ~ {$In:[file:seq share:text] $Out pre {$s=$SeqNew $seqFip=$Fip} 
                $Wrt:[s:$entryName] $SeqMake:$s} 
               ('ID' {$Not} ln {$SeqApp:[$s s:$Ct]})* ~


  # for indexing
  i_id:      ~ {$In:[entry] $Out:id} /ID +/ name {$Wrt} ~
  i_div:     ~ {$In:[fields c:id] $Out:div}
                /ID +/ name  /[^;]+/ ';' /[^;]+/ ';' name {$Wrt} ~
  i_mol:     ~ {$In:[fields c:id] $Out:mol} 
               /ID +/ name  /[^;]+/ ';' /[^;]+/  {$Wrt} ~
  i_acc:     ~ {$In:[fields c:acc] $Out:acc} ('AC' (name {$Wrt} ';')+)+  ~
  i_nid:     ~ {$In:[fields c:nid] $Out:nid} 'NI' name {$Wrt}  ~
  # i_dates folds day, month and year of the matched DT line into one integer:
  # day + month*100 + year*10000, with the month looked up in $month
  i_dates:   ~ { $In:[fields c:date] $Out:date 
                 init $month={JAN:1 FEB:2 MAR:3 APR:4 MAY:5 JUN:6 JUL:7 AUG:8 
                              SEP:9 OCT:10 NOV:11 DEC:12}
               }
               /.* ([0-9]+)-([A-Z]+)-([0-9]+)[^\n]+Cre/ 
               {$Wrt:[credate s:($1 + $month.$2*100 + $3*10000)]} ~
  # des writes each word of the DE lines; an EC number after an opening
  # parenthesis is written as a single token
  des:       ~ {$In:[fields c:des] $Out} 
               ('DE' (/\\(EC *([0-9]+\\.[0-9]+\\.[0-9]+\\.[0-9]+)/
               {$Wrt:[s:$1]}| word{$Wrt} | sp)+ ln)+ ~ 
  org:       ~ {$In:[fields c:org] $Out} 
               ('OC' (/[^;.\n]+/ {$Wrt} | /[^\n]/)*)+ |
                'OS' /[^(\n]+/ {$Wrt:[s:$Trim:$Ct]} 
                ('(' (/[^ \n)]+/ | /[^)]/)+)? |
                'OG'  /[^\n;.]+/ {$Wrt} ~
  key:       ~ {$In:[fields c:key] $Out} 
               ('KW' (/[^\n;.]+/ {$Wrt} | /[^\n]/)+)+ ~ 
  i_authors: ~ {$In:[fields c:ra] $Out:authors}
               (/RA/ (/([^.,\n]+) +([^,;\n]+)/ {$Uniq:[s:"$1,$2"]} /[,;]/)*
               ln)* ~
  i_accKey:  ~ {$In:[fields c:acc] $Out:accKey} 'AC' /[A-Z]+/ {$Wrt} ~
  # i_dbOri maps the letter prefix of the accession number to a database name
  # through the two lookup tables $dbOriN and $dbName
  i_dbOri:   ~ {$In:[fields c:acc] $Out:dbOri
                 init { $dbOriN={
                         A:1 F:1 V:1 X:1 Y:1 Z:1
                         AA:2 AC:2 AD:2 B:2 G:2 H:2 I:2 J:2 K:2 
                         L:2 M:2 N:2 R:2 S:2 T:2 U:2 W:2 
                         AB:3 C:3 D:3 E:3}
                       $dbName={1:EMBL 2:GenBank 3:DDBJ}
                 }
               } 
               'AC' /[A-Z]+/ {$Wrt:[s:$dbName.$dbOriN.$Ct]} ~
  i_seqLen:  ~ {$In:[fields c:sq] $Out:seqLen}
               'SQ   Sequence' /[0-9]+/ {$Wrt} ~

  # indexing features
  # these productions read the ft tokens of fields one feature at a time,
  # with the feature counter kept in $ftN
  ftWord:    ~ /[^" ,;:()\n.-]+/ ~ #"
  ftSep:     ~ /[ ,;.:()-]+/ ~
  ftKey:     ~ {$In:[fields c:ft count:ft var:$ftN select:$subEntryN] $Out}
               'FT' /[^ ]+/ {$Wrt:[n:$ftN]} ~
  ftQual:    ~ {$In:[fields c:ft count:ft var:$ftN select:$subEntryN] $Out}
               /[^\/]+/ 
               (/\/([a-zA-Z0-9_]+)/ {$Wrt:[s:$1 n:$ftN]} (/=[a-zA-Z0-9_]+/ |
               /="[^"]+"/)? /[^\/]+/)* ~ #"
  chrsNo:    ~ {$In:[fields c:ft count:ft var:$ftN select:$subEntryN] $Out}
               /.+\/chromosome="?/ 
               (/[^\n\" ]+/ {$Uniq:[n:$ftN n:0]} | '\nFT' | ' ')+ ~ #"
  ftSrc:     ~ {$In:[fields c:ft count:ft var:$ftN] $Out}
   (/[^\/]+\/(tissue_type|cell_line|organism|strain|dev_stage|sex|clone_lib)="/ 
                ('\nFT' | ftWord {$Uniq:[n:$ftN]} | ftSep)+ | '/')* ~
  ftDes:     ~ {$In:[fields c:ft count:ft var:$ftN select:$subEntryN] $Out}
               (/[^\/]+\/(product|note|gene)="/ 
                ('\nFT' | /NCBI gi: *[0-9]+/ | 
                 ftWord {$Uniq:[n:$ftN]} | ftSep)+ | '/')* ~
  map:       ~ {$In:[fields c:ft count:ft var:$ftN] $Out}
               /.+\/map="/ (/[^a-zA-Z0-9"]+/|/[a-zA-Z0-9]+/{$Wrt:[n:$ftN]})+ #"
             ~
  pid:       ~ {$In:[fields c:ft count:ft var:$ftN select:$subEntryN] $Out}
               (/db_xref="?PID:([0-9a-zA-Z]+)/ {$Wrt:[n:$ftN s:$1]} |
                /[^\/]+\// )+ ~

  # displaying features
  ftClean:   ~ {$In:[fields c:ft count:ft var:$ftN select:$subEntryN] $Out } 
               'FT' ln {$Wrt} ('FT' ln {$App})* ~
  ftLoc:     ~ {$In:ftClean $Out} /[^ ]+/ /[^\/]+/ {$Wrt} ~
  ft:        ~ {$In:[fields c:ft count:ft var:$ftN select:$subEntryN] $Out}
               /.*/ {$Wrt} ~
  # ftId builds the subentry id line from the parent entry name and the
  # subentry number $subEntryN
  ftId:      ~ {$In:[fields c:id] $Out} 
               /.. *([A-Z0-9]+)/ {$Wrt:[s:"ID   $1\_$subEntryN;  parent: $1"]}~
  h_ftId:    ~ {$In:[ftId t:html]} /.*parent: */ 
               /[A-Z0-9]+/ {$Rep:{$ParStr:emblIdR $Ct $Ct}} ~
  # t_ft writes the feature key, then t_qual writes one token per qualifier,
  # named after the qualifier; a qualifier without a value gets the value +
  t_ft:      ~ {$In:ftClean $Out} /(<A HREF)?[^ ]+/ {$Wrt:key} 
               /[^\/]+/ t_qual* ~
  t_qual:    ~ /[ \n]*\/([^ \n=]+)/ {$qn=$1 $Uniq:[qual s:$1]} 
               qualval? {$Wrt:[$qn s:$qv]} ~
  qualval:   ~ {pre $qv=""} /=([a-zA-Z0-9_]+)/ {$qv=$1} | 
               /="([^"]+)"/ {$qv=$1} | x{$qv='+'} ~

  # extracting feature sequences"

  ftCleanIter: ~ {$In:[fields c:ft] $Out } 
               'FT' ln {$Wrt} ('FT' ln {$App})* ~
  ftLocat:    ~ {$In:ftClean $Out init {$SdbFunctions}}
               /(<A HREF)?[^ ]+/ /[^\/]+/ {$Wrt} ~
               
  # printing tables
  # NOTE(review): t_authors reads code refaut, but the fields rule writes the
  # RA lines under code ra (and i_authors reads c:ra) - confirm that refaut
  # is intended.
  t_authors: ~ {$In:[fields c:refaut] $Out} 'RA' ln {$Wrt} ('RA' ln {$App})* ~
  t_des:     ~ {$In:[fields c:des] $Out} 'DE' ln {$Wrt} ('DE' ln {$App})* ~ 

  # printing in HTML format
  # the h_ rules replace matched text ($Rep) with the expansion of a named
  # parameter string ($ParStr)
  h_ft:      ~ {$In:[fields c:ft t:html count:ft var:$ftN]} 
               'FT' /[^ ]+/ {$Rep:{$ParStr:emblFeatR $entryName $ftN $Ct}}  
#               /[^\/]+/ 
#               (/\/([a-zA-Z0-9_]+)/ {$Rep:{$ParStr:emblFtQualR $1 $Ct}}
#               (/=[a-zA-Z0-9_]+/ | /="[^"]+"/)? /[^\/]+/)* 
             ~
  h_id:      ~ {$In:[fields c:id t:html]} /../ name {$entryName=$Ct} ~
  h_rx:      ~ {$In:[fields c:rx t:html]}
               /../ 'MEDLINE;' /[0-9]+/ {$Rep:{$ParStr:medlineR $Ct $Ct}} ~
  htmlTag:   ~ /<[^<>]*>/ ~
  # h_links looks up the database name of each DR line in $hl to pick the
  # parameter string used for the accession number
  h_links:   ~ {$In:[fields c:link t:html] 
                init{$hl={
                       'SWISS-PROT':                    swissR
                       DICTYDB:   dictydbR   GCRDB:     gcrdbR
                       MAIZEDB:   maizedbR   WORMPEP:   wormpepR
                       LISTA:     listaR  
                       PIR:       pirR       YEPD:      yepdR        
                       SGD:        sgdR       
                       STYGENE:   stygeneR     
                       HIV:       hivR         
                       ECOGENE:   ecogeneR   ECO2DBASE: eco2dbaseR   
                       MIM:       mimR       SUBTILIST: subtilistR   
                       FLYBASE:   flybaseR     
                       TRANSFAC:  transfacR  REBASE:    rebaseR
                     } 
                    } 
               }
               (/DR/ dbName {$db=$Ct} ';' 
                accno {$Rep:{$ParStr:$hl.$db $Ct $Ct}} ln)*
             ~
  # other stuff

  word:      ~ /[^" ,;:()\/=\n.-]+/ ~
  sp:        ~ /[ ,;.:()\/=-]+/ ~
  lnCode:    ~ /\n[A-Z][A-Z]/ ~
  ln:        ~ /[^\n]*\n/ ~
  accno:     ~ /[A-Z0-9]+/ ~
  num:       ~ /(<|>)?[0-9?]+/ ~
  name:      ~ /[a-zA-Z0-9_-]+/ ~ 
  dbName:    ~ /[^;]+/ ~ 
}

# Stand-alone test driver; runs only when $TestMode is set.
# Creates a job over the $rules productions on a fixed data file and, for
# every input entry, requests the acc tokens (print:0) and the t_ft tokens
# (print:1) before moving on to the next entry.
# NOTE(review): the file name is hard-coded to /data/embl_dna/mam.dat -
# adjust it to a file that exists locally before using the test mode.
if:$TestMode {
  $job = $JobNew:[prod:$rules skip:" " fileName:'/data/embl_dna/mam.dat']
  while:$JobHasInput:$job {
    $JobTokens:[$job name:acc print:0] 
    $JobTokens:[$job name:t_ft print:1]
    $JobNext:$job
    #$print:"-------->entry\n"
  }
}




More information about the Bio-srs mailing list