large DNA sequences --> smaller overlapping sequences

Kay Hofmann khofmann at isrec-sun1.unil.ch
Tue Jun 16 08:44:51 EST 1998


> Does anybody know of a program to split up a large DNA-sequence file
> (4 Mb) into smaller files/sequences of 200 kb with 10 kb overlap?

You can use the attached perl script. Calling syntax is

split_seq [-nFragsize -oOverlap] infile 

the result is written to StdOut. Default are chunks of 5000 bp
with a 1000 bp overlap

As input formats, FASTA, GCG and EMBL/Swissprot are recognized

Best regards,

Kay


===========================================================
Kay Hofmann, PhD                    Tel: +49 (221) 950 4814
Bioinformatics Group                FAX: +49 (221) 950 4848
MEMOREC Stoffel GmbH
Stoeckheimer Weg 1
D50829 Koeln/Germany        E-mail: Kay.Hofmann at memorec.com
-------------- next part --------------
#!/usr/local/bin/perl
#
#
# This program splits a big sequence file into smaller parts
# fragment size and overlap can be given on the command line
#
#
# The call syntax is:
#        split_seq [-nFragsize -oOverlap] big_sequence
#
# 
#


$pipemode='';
$outformat='FASTA';
$split=5000;
$overlap=1000;

##################################################################
# get sequence in any format and put it into Seq, SeqID and SeqDE
##################################################################

sub getseq {
    if    ($format eq "FASTA") {
      &getfasta;
      $Seq   = $fastabuffer;
      $SeqID = $FastaID;
      $SeqDE = $FastaDE;
    }
    elsif ($format eq "EMBL")  {
      &getembl;
      $Seq   = $emblbuffer;
      $SeqID = $EmblID;
      $SeqDE = $EmblDE;
    }
    else                       {
      &getgcg;
      $Seq   = $gcgbuffer;
      $SeqID = $gcgid;
      $SeqDE = $gcgde;
    }
  }
  
###################################################################
# read Pearson/FASTA format sequence (not to be called externally) 
###################################################################
  
  sub getfasta {
    $fastabuffer="";
    $FastaID="";
    $FastaDE="";
    $line="";
    until (($fastaline =~ /^\>/) || eof(SEQFILE)) {$fastaline=<SEQFILE>};
    if ($fastaline=~/^\>(\S+)\s(.*)$/) {
      $FastaID=$1;
      $FastaDE=$2;
    }
    while ($FastaID=~/^(.*\|)(.+)$/) { $FastaID=$2}
    until (($line =~ /^\>/) || eof(SEQFILE)) {
      $line=<SEQFILE>;
      if (!($line =~ /^\>/)) {$fastabuffer .= $line}
    }
    if ($line =~ /^\>/) {$fastaline=$line}
    else {$fastaline=""};
    $fastabuffer =~ tr/a-z/A-Z/;
    $fastabuffer =~ s/[^A-Z]//g;
  }

###################################################################
# read EMBL/Swissprot format sequence (not to be called externally) 
###################################################################
    
  sub getembl {
    $emblbuffer="";
    $EmblID="";
    $EmblDE="";
    $line="";
    until (($line =~ /^ID\s/) || eof(SEQFILE)) {$line=<SEQFILE>};
    if ($line=~/^ID\s+(\w+).*$/)          {$EmblID=$1;}
    until (($line =~ /^SQ\s/) || eof(SEQFILE)) {
      $line=<SEQFILE>;
      if ($line =~ /^DE\s+(.*)/) {
        if($EmblDE) {$EmblDE.=" "};
	$EmblDE .= $1
      }
    }
    if ($line =~ /^SQ\s/) {
      until (($line =~ /^\/\//) || eof(SEQFILE)) {
        $line=<SEQFILE>;
        if   (!($line =~ /^\/\//)) {$emblbuffer .= $line}
      }
    }
    $emblbuffer =~ tr/a-z/A-Z/;
    $emblbuffer =~ s/[^A-Z]//g;
  }
    
###################################################################
# read GCG format sequence (not to be called externally) 
###################################################################
  
  sub getgcg {
    $gcgbuffer="";
    $gcgid="";
    $line="";
    until (($line =~ /\.\./) || eof(SEQFILE)) {$line=<SEQFILE>};
    if ($line=~/^(\w*).*\.\./)                {$gcgid=$1;}
    until (eof(SEQFILE)) {
      $line=<SEQFILE>;
      $gcgbuffer .= $line
    }
    $gcgbuffer =~ tr/a-z/A-Z/;
    $gcgbuffer =~ s/[^A-Z]//g;
  }

 
###################################################################
# test command line arguments and open files:
###################################################################

  while ($ARGV[0]=~/^-/) {
    $agm = shift(@ARGV);
    if($agm=~/n(\d+)/) {
      $split=$1
    };
    if($agm=~/o(\d+)/) {
      $overlap=$1
    };
  }

    if($#ARGV!=0)        {
      print "The call syntax is:\n\n";
      print "split_seq [-nFragsize -oOverlap] infile\n\n";
      die "\n";
    }
    unless (-T $ARGV[0]) {die "$ARGV[0] no valid input file\n"};  
    open(SEQFILE,"$ARGV[0]")   || die "can't open $ARGV[1]: $!\n";


###################################################################
# determine sequence file format
###################################################################

    $line=<SEQFILE>;
    if    ($line =~ /^\>/)        {$format="FASTA"}
    elsif ($line =~ /^ID\s/)      {$format="EMBL"}
    else  {
      while (!($line =~ /\.\./) && ($line=<SEQFILE>)) {}
      if ($line=~/\.\./)      {$format="GCG"}
      else {die "Sequence format not recognized\n"}
   }
    close(SEQFILE);
    open(SEQFILE,"$ARGV[0]")   || die "can't reopen $ARGV[0]: $!\n";
  
###################################################################
# the main program
###################################################################

do {  
  &getseq;
  $seqlen = length($Seq);
  $count=1;
  $pos1=1;
  
    while (length($Seq)>$split)  {
      $pos2=$pos1+$split-1;
      print ">$SeqID","_$count   $pos1 to $pos2 $SeqDE\n";
      $part  = substr($Seq,0,$split);
      $Seq   = substr($Seq,$split-$overlap);
      while (length($part) > 60) {
         $line  = substr($part,0,60);
         $part  = substr($part,60);
         print "$line\n";
      }
      print "$part\n";
      $count=$count+1;
      $pos1=$pos1+$split-$overlap;
    }
    $part  = $Seq;
    $pos2=$pos1+length($part)-1;
    print ">$SeqID","_$count   $pos1 to $pos2 $SeqDE\n";
    while (length($part) > 60) {
       $line  = substr($part,0,60);
       $part  = substr($part,60);
       print "$line\n";
    }
    print "$part\n";
}
until eof(SEQFILE);  


More information about the Bio-soft mailing list