Format converter for TFD database to GCG

David Mathog mathog at seqvax.caltech.edu
Fri Apr 23 17:24:00 EST 1993


Appended to this file is a short program that converts David Ghosh's SITES.DAT
file to a GCG TFSITES.DAT file.  You can get SITES.DAT via FTP from
NCBI.NLM.NIH.GOV.  The program is written in Fortran and works under VMS 5.5-2.
I'm not making any guarantees (or even wagers) that it will work with Fortran
compilers on any other system.  When you run the program it will prompt for
input/output files and for comments to put at the front of the file. 

The record descriptions were valid for TFD 6.3 and GCG 7.2.

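Thanks to Dr. Ghosh for describing to me the relationship between ";" and
N_PROB.  (In the program below, a site whose N_PROB value is greater than
5.0e-4 is treated as a frequent motif and gets a ";" put in front of its
name in the GCG file.)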

David Mathog
mathog at seqvax.bio.caltech.edu
Manager, sequence analysis facility, biology division, Caltech


+-+-+-+CUT HERE+-+-+-+-+-+-+-+-+-+CUT HERE+-+-+-+-+-+-+-+-+-+CUT HERE+-+-+-
C	TFDTOGCG.FOR
C	9-APR-1993 David Mathog, Division of Biology, Caltech
C
C	This little program takes one of David Ghosh's SITES.DAT
C	files and reformats it for GCG usage (TFSITES.DAT layout).
C
C	Flow: prompt for the input and output file names, copy the
C	comment lines typed by the user to the head of the output,
C	write a title record and the ".." divider, then convert the
C	input one record at a time until end of file.
C
C	There is very little error checking, so watch out!  The one
C	check made is that a failed read of the input file stops the
C	conversion with a message instead of being treated as data.
C
C	Copyright 1993, Caltech
C
C	No warranties or guarantees.
C
	implicit none
	character*2048 inline,infile,outfile
	integer*4 inlen,istat
	logical   ok
c
c  D. Ghosh lays out site records like this
c  (fixed-width character fields, 363 bytes in all)
c
	structure /ghosh_record/
	  character SITE_ID*6
	  character FAC_NAME*25
	  character SEQ_NAME*30
	  character NA_SEQ*45
	  character SEQ_TYPE*1
	  character SYSTEM*10
	  character GENOME*1
	  character TRN_UNIT*20
	  character COMMENTS*80
	  character MAIN_REF*60
	  character FAC_SOURCE*16
	  character LOCAT_REF*20
	  character LOCATION*20
	  character METHOD*11
	  character N_PROB*8
	  character REF_N*8
	  character STRAND*1
	  character BINDING*1
	end structure
c
c  GCG lays out TFSITES.DAT records like this
c  (169 bytes in all; the SPACERn fields hold constant text)
c
	structure /GCG_RECORD/
	  character SEQ_NAME*31
	  character SPACER1*2
	  character NA_SEQ*45
	  character SPACER2*5
	  character FAC_NAME*25
	  character SPACER3*1
	  character MAIN_REF*60
	end structure
c
	record /ghosh_record/ ghosh
	record /gcg_record/   gcg
c
c	Init the spacers for the GCG record.  They are set once and
c	written unchanged with every record.
c	NOTE(review): the '0' columns are presumably GCG's offset and
c	overhang fields - confirm against the GCG documentation.
c
	gcg.spacer1 = '0 '
	gcg.spacer2 = ' 0 ! '
	gcg.spacer3 = ' '
c
	write(6,*)'TFDtoGCG'
	write(6,*)'This program converts one of David Ghosh''s site'
	write(6,*)'  files to GCG''s format'
c
c	The Q edit descriptor returns the number of characters typed,
c	which then bounds the substring read by the A descriptor.
c
	write(6,*)'Input the name of the file to process'
	read(5,'(q,a)')inlen,infile(1:inlen)
c
	open(unit=10,file=infile(1:inlen)
	1 ,form='UNFORMATTED',organization='SEQUENTIAL',status='OLD'
	1 ,recordtype='VARIABLE', READONLY)
c
	write(6,*)'Input the name of the output file'
	read(5,'(q,a)')inlen,outfile(1:inlen)
c
	open(unit=11,file=outfile(1:inlen)
	1 ,form='UNFORMATTED',status='NEW',organization='SEQUENTIAL'
	2 ,recordtype='STREAM_LF',recl = 255)
c
c	get the comments; an empty line ends them.  Each comment is
c	written with one leading blank.
c
	write(6,*)'Enter as many lines of comments as you would like'
	write(6,*)'  End each line with a <return>'
	write(6,*)'  End the last line with <return><return>'
	ok = .true.
	do while (ok)
	   read(5,'(q,a)')inlen,inline(1:inlen)
	   if(inlen.eq.0)then
	       ok = .false.
	   else
	       write(11)' '//inline(1:inlen)
	   end if
	end do
	write(6,*)'Working ...'
c
c	write a title line, this one is *easy*
c
	GCG.SEQ_NAME = 'NAME'
	GCG.FAC_NAME = 'FACTOR'
	GCG.NA_SEQ   = 'SEQUENCE'
	GCG.MAIN_REF = 'REFERENCE'
	write(11)gcg
c
c	Now write the divider
c
	write(11)'..'
c
c	Convert the records.  IOSTAT is 0 for a good read, negative
c	at end of file and positive on an error, so only a value of
c	exactly 0 means that GHOSH holds a fresh record.
c
	istat=0
	do while(istat.eq.0)
	    read(10,iostat=istat)ghosh
	    if(istat.eq.0)then
	       GCG.SEQ_NAME = GHOSH.SEQ_NAME//' '
	       GCG.FAC_NAME = GHOSH.FAC_NAME
	       GCG.NA_SEQ   = GHOSH.NA_SEQ
	       GCG.MAIN_REF = GHOSH.MAIN_REF
c	       add the ';' flag and the underscores to the name
	       call fixseqname(GCG.SEQ_NAME,GHOSH.N_PROB)
	       write(11)gcg
	    end if
	end do
c
	close(unit=10)
	close(unit=11)
c
c	A positive status means the loop ended on a read error, not
c	on end of file; say so rather than claim success.
c
	if(istat.gt.0)then
	   write(6,*)'TFDtoGCG: input read error, IOSTAT =',istat
	   stop 'TFDtoGCG: abnormal completion'
	end if
	stop 'TFDtoGCG: normal completion'
	end

	subroutine fixseqname(NAME,N_PROB)
c
c	Turn a TFD sequence name into a GCG style name, in place.
c
c	NAME    (in/out) the name field.  It is rewritten like this:
c	          - if N_PROB reads as a number greater than LIMIT
c	            (5.0e-4) a ';' is put in front to mark a frequent
c	            motif; the text moves right by one character and
c	            the final character is dropped
c	          - blanks up to the last nonblank character become
c	            '_'; trailing blanks stay blank and underscores
c	            already in the name are kept
c	          - a name with no nonblank character at all becomes
c	            'UNKNOWN'
c	N_PROB  (in) text of the probability field.  It is read with
c	        an Fw.2 edit descriptor (w = length of the field), so
c	        digits typed without a decimal point are scaled by
c	        0.01.
c
c	A field that cannot be read as a number is treated as 0 (no
c	';' is added) and a warning is written to unit 6, so that one
c	bad record does not abort the whole conversion.
c
	implicit none
	character name*(*)
	character N_prob*(*)
	integer i,last,inlen,nlen,istat
	real limit,value
	character fmt*16
	parameter (limit = 5.0e-4)
c
c	Do the length this way so that it will still work if the
c	length of NAME, N_PROB change.
c
	inlen=len(NAME)
	nlen =len(N_PROB)
c
c	Build the input format at run time, e.g. '(F     8.2)'.
c	Blanks inside a format are ignored, and this avoids the DEC
c	only F<NLEN>.2 variable format expression.
c
	write(fmt,1000)nlen
1000	format('(F',i6,'.2)')
c
c	first put in a ";", if needed to indicate a frequent motif
c
	read(N_prob,fmt,iostat=istat)value
	if(istat.ne.0)then
	   write(6,*)'fixseqname: bad N_PROB field: ',N_prob
	   value = 0.0
	end if
	if(value.gt.limit)NAME = ';'//NAME(1:inlen-1)
c
c	find the last nonblank character; LAST stays 0 if the name
c	is entirely blank
c
	last = 0
	i = inlen
	do while(last.eq.0 .and. i.gt.0)
	  if(name(i:i).ne.' ')last = i
	  i = i-1
	end do
c
c	now convert any internal spaces to underscores
c	if there are no nonspaces at all, the name becomes "UNKNOWN"
c
	if(last.eq.0)then
	  name = 'UNKNOWN'
	else
	  do i = 1, last
	    if(name(i:i).eq.' ')name(i:i) = '_'
	  end do
	end if
c
	return
	end




More information about the Bio-soft mailing list