Microsatellite (repeat) searching

Mary Pat Reeve mpreeve at genome.wi.mit.edu
Thu Mar 10 14:06:23 EST 1994


Here is a basic piece of C code that will find all permutations of
microsatellite repeats.  The basic idea is that it moves a window over
the sequence and checks if the next window agrees with the previous
one.  REPEAT_THRESHOLD is the number of windows that must be the same
before the repeat is marked; i.e., CACACACACACACACACACA would qualify
as 10.  Once the repeat is marked, that region is marked so that it is
not searched for larger base unit repeats.  MAX_WINDOW is the max number
of units in the base repeat.  CA=2, GAC=3, etc.  The code starts at 2
and works up to MAX_WINDOW.  This algorithm also takes into account the
fact that microsatellites can be imperfect (random extra bases thrown in)
and can be compound.  Let me know if you have any questions.

/* Copyright Whitehead Institute for Biomedical Research.  All rights
	reserved
 */
#include <stdio.h>
#include <string.h>
#include <math.h>
#include <time.h>

#define FALSE 0
#define TRUE  1



#define MAX_IMPERFECT_GAP 4    /* The default maximum number of imperfect
				* bases allowed in a microsatellite */
#define NUM_MS            100   /* Maximum number of microsatellites in a seq */
#define MAX_WINDOW        6   /* The maximum base repeat unit length */
#define REPEAT_THRESHOLD  10

#define MAX_LINE 500000
#define MARGIN 250

#define MAX_SUBSEQ    500000


typedef struct {
    int length;        /* Length of the microsatellite feature */
    int start;         /* Start of the feature in the whole sequence */
    char type[MAX_WINDOW+1];     /* Base repeat unit -- CA, TCA, GA, etc */
} REPEAT_INFO;

extern REPEAT_INFO *repeats;
int target_start, target_end;
int find_microsatellite();
void find_best_target();

REPEAT_INFO *repeats;
main(argc, argv)
int argc;
char **argv;
{
    int       length, current_num, genbank_start = 0, i, start, end, num_ms;
    int       start_o_target, length_o_target;
    char      type_o_target[MAX_WINDOW+1];
    char      seq[MAX_LINE], name[1000], match_seq[100], new_seq[MAX_LINE];
    char      description[1000], origin[1000], journal[1000];
    FILE      *ifp; 
    
    
    
    repeats = (REPEAT_INFO *) malloc ((NUM_MS+1) * sizeof(REPEAT_INFO));
    
    /* Check that the correct arguments are given */
    if (argc < 3) {
	printf("Usage: %s seq_file start_num\n", argv[0]);
	exit(1);}
    
    /* Open the files and set the target sequence*/
    if((ifp = fopen(argv[1], "r")) == NULL){
	printf("*** error: can't open %s\n", argv[1]); exit(1);}
    
    current_num = atoi(argv[2]);

    printf("1 set_organism %% |mus| %%\n");

    while(fgets(name, MAX_LINE, ifp) != NULL){
	name[strlen(name)-1] = '\0';
	fgets(description, MAX_LINE, ifp);
	description[strlen(description)-1] = '\0';
	fgets(origin, MAX_LINE, ifp);
	origin[strlen(origin)-1] = '\0';
	if(strcmp(origin, " ") == 0) strcpy(origin, "");
	fgets(journal, MAX_LINE, ifp);
	journal[strlen(journal)-1] = '\0';
	fgets(seq, MAX_LINE, ifp);
	seq[strlen(seq)-1] = '\0';
	length = (int)strlen(seq);
	num_ms = find_microsatellite(seq, length, MAX_WINDOW);
	if(num_ms == 1){
	    start = repeats[0].start;
	    end = start + repeats[0].length -1;
	    start_o_target = start;
	    length_o_target = end - start + 1;
	    strcpy(type_o_target,repeats[0].type);
	    if((start - MARGIN) <= 0)
	      start = 0;
	    else(start -= MARGIN);
   	    genbank_start = start;
	    start_o_target -= start;
	    if((end + MARGIN) > length)
	      end = length;
	    else end += MARGIN;
	    for(i = 0; i< (end-start+1); i++){
		new_seq[i] = seq[start+i];}
	    new_seq[i] ='\0';
	    strcpy(seq, new_seq);
	    
	}
	if(num_ms > 1){
	    find_best_target(num_ms); 
	    start = target_start;
	    end = target_end;
	    start_o_target = start;
	    length_o_target = end - start + 1;
	    strcpy(type_o_target,repeats[0].type); /* Not really the correct target
						      type in this case */
	    if((start - MARGIN) <= 0)
	      start = 0;
	    else(start -= MARGIN);
	    genbank_start = start;
	    start_o_target -= start;
	    if((end + MARGIN) > length)
	      end = length;
	    else end += MARGIN;
	    for(i = 0; i< (end-start+1); i++){
		new_seq[i] = seq[start+i];}
	    new_seq[i] ='\0';
	    strcpy(seq, new_seq);
	}
	
	if(num_ms > 0){
	    /* Print out the microsatellite with certain margins around it */
	    printf("1 put_marker %% |D%d| %%\n", current_num);
	    printf("2 put_marker_name %% |D%d| |%s| %%\n", current_num, name);
	    printf("3 put_genbank_info %% |D%d| || |%d| |%s| |%s| |%s| %%\n", 
		   current_num, genbank_start, origin, description, name);
	    printf("4 put_sequence %% |D%d| |%s| || |GenBank| |%s| %%\n",
		   current_num, seq, journal);
	    printf("5 put_insert %% |D%d| |0| |%d| %%\n", 
		   current_num, (int)strlen(seq));
	    printf("6 put_target %% |D%d| |%d| |%d| |%s| %%\n", 
		   current_num, start_o_target, length_o_target, type_o_target);
	    printf("5 put_FASTN %% |D%d| |0| || || %%\n", current_num);
	    printf("6 put_BLAST %% |D%d| |0| || || %%\n", current_num);
	    current_num++;
	    fflush(stdout);
	}

	
	
    }
fclose(ifp);
}

int find_microsatellite(seq, length, max_window)
char *seq;
int length, max_window;
{
    
    int window_length = 2, unit_matches, gap, rep_count=0  , i=0, start, k, j;
    char test_seq[MAX_SUBSEQ], repeat_type[MAX_WINDOW], double_repeat[MAX_WINDOW*2], gap_sequence[2*MAX_IMPERFECT_GAP+1];
    char *following_window, *leading_window, *next2, *p_start;
    
    /* Initialize the repeat structure */
    for(j = 0; j<NUM_MS; j++){
	repeats[j].start = 0;
	repeats[j].length = 0;
	strcpy(repeats[j].type, "");}
    
    /* Read in the target string to see if we are looking for
     * a number of a sequence of particular repeats
     * Check to make sure the window_length >= 2 <=MAX_WINDOW
     * Get max_window either from the specific repeat or
     * the number in parentheses
     */
    
    strncpy(test_seq,seq,length);  /* Make a copy to use */
    
    
    while(window_length <= max_window){ /* Loop over the sequence once for each window size */
	if(length > (window_length*2)){
	    
	    /* Initialize all the pointers for the next pass over the sequence */
	    following_window = test_seq;    /* The window following behind is set to start
					       at the beginning of the sequence */
	    leading_window = following_window + window_length;
	    /* The leading window is just ahead */
	    strncpy(repeat_type, following_window, window_length);
	    repeat_type[window_length] = '\0';
	    unit_matches = 0;
	    start = 0;
	    i = 0;
	    
	    while(*(leading_window + window_length -1) != 0){     /* Loop over the sequence */
		
		
		/* Check to see if we already found a repeat here */
		if(*following_window == 'X'){
		    following_window += window_length; 
		    leading_window += window_length ;
		    i += window_length;
		    strncpy(repeat_type, following_window, window_length);
		    repeat_type[window_length] = '\0';}
		else {
		    /* Check to see if the two windows match -- */
		    if(ms_match(repeat_type, leading_window, window_length)){
			if(unit_matches == 0){  /* we're possibly looking at a new repeat */
			    start = i;
			    strncpy(repeat_type, following_window, window_length);
			    repeat_type[window_length] = '\0';
			}
			unit_matches++;
			
			
			/* we'd rather have a repeat_type without an N */
			if(strchr(repeat_type, 'N') != NULL){
			    strncpy(repeat_type, following_window, window_length);
			    repeat_type[window_length] = '\0';}
			if(strchr(repeat_type, 'N') != NULL){
			    strncpy(repeat_type, leading_window, window_length);
			    repeat_type[window_length] = '\0';}
			
			
			following_window = leading_window = test_seq;
			following_window += i + window_length;  /* Advance the windows */
			leading_window += i + (window_length*2);
			i += window_length;
			
		    }
		    else{ 
			
			
			/* if we have a decent start look ahead to see if 
			   there's more after a short gap */
			strcpy(double_repeat, repeat_type);
			strcat(double_repeat, repeat_type);
			strncpy(gap_sequence, (test_seq + i + window_length), (2*MAX_IMPERFECT_GAP));
			gap_sequence[strlen(gap_sequence)-1] = '\0';
			next2 = strstr(gap_sequence, double_repeat);
			if((next2 != 0 ) && (unit_matches >=1)) {
			    next2 = strstr((test_seq + i + window_length), double_repeat);
			    following_window = next2;
			    i += (next2 - leading_window);
			    leading_window =  following_window + window_length;
			    unit_matches++; 
			}
			
			else{
			    /* no match, move ahead by one */
			    if(unit_matches >= (REPEAT_THRESHOLD-1)){
				repeats[rep_count].start = start;
				p_start = test_seq + start;
				repeats[rep_count].length =  leading_window - p_start;
				strcpy(repeats[rep_count].type,repeat_type);
				
				/* Mark the sequence as having a repeat so we don't go over it again */
				for(k = 0; k < (repeats[rep_count].length); k++)
				  test_seq[(repeats[rep_count].start) + k] = 'X';
				
				rep_count++;}
			    unit_matches = 0;
			    leading_window++;
			    following_window++;
			    i++;
			    start = i;
			    strncpy(repeat_type, following_window, window_length); 
			    /* Null terminate the string */
			    repeat_type[window_length] = '\0';
			}}}}
	    
	    /* Deal with a repeat that goes right to the end */
	    if(unit_matches >= (REPEAT_THRESHOLD-1)){
		repeats[rep_count].start = start;
		p_start = test_seq + start;
		repeats[rep_count].length =  leading_window - p_start;
		strcpy(repeats[rep_count].type,repeat_type);
		
		
		/* Mark the sequence as having a repeat so we don't go over it again */
		for(k = 0; k < (repeats[rep_count].length); k++)
		  test_seq[(repeats[rep_count].start) + k] = 'X';
		
		rep_count++;}
	    
	    /* Move up the window size */
	    window_length++;
	}
	else { window_length++; }

    }
    return(rep_count);
    
}




int ms_match(repeat_type, leading_seq, window_length)
char *leading_seq, *repeat_type;
int window_length;
{
    
    int Ns, i, num_same = 0;
    
    /* Keep a count of the number of Ns ... we will reject the match if
     * there are too many */
    Ns = 0;
    
    for(i = 0; i < window_length; i++){
	if (!((repeat_type[i] == *(leading_seq+i)) || (*(leading_seq+i) == 'N'))) 
	  return(FALSE);
	/* Check that we don't have something of the form poly-A or poly-T */
	if (repeat_type[0] == repeat_type[i]) num_same++;
	if(*(leading_seq+i) == 'N') Ns++;
    }
    
    /* If we made it here we have a match unless the window has too many Ns */
    if(num_same == window_length) return(FALSE);
    return(Ns <= (window_length/2)); 
    
}


void find_best_target(num_repeats)
int num_repeats;
{
    int i, j, k;
    REPEAT_INFO temp;
    
    target_start = 0; target_end = 0;
    /* Sort the repeats according to start site, see if we can fit
     * all the repeats into one product */  
    for(i = 0; i < num_repeats; ++i)
      for(j = num_repeats - 1; j > i; --j)
	if(repeats[j-1].start > repeats[j].start){
	    temp = repeats[j-1];
	    repeats[j-1] = repeats[j];
	    repeats[j] = temp;}
    
    if((repeats[num_repeats-1].start + repeats[num_repeats-1].length - repeats[0].start) < 1200){
	target_start = repeats[0].start;
	target_end = repeats[num_repeats-1].start + repeats[num_repeats-1].length - 1;}
    
    else{
	for(i = 0; i < num_repeats; ++i)
	  for(j = num_repeats - 1; j > i; --j)
	    if(repeats[j-1].length < repeats[j].length){
		temp = repeats[j-1];
		repeats[j-1] = repeats[j];
		repeats[j] = temp;}
	target_start = repeats[0].start;
	target_end = repeats[0].start + repeats[0].length -1;
    } 
}


int same_repeat_type(found_type, specified_type)
char *found_type, *specified_type;
{
    
    int found_length, specified_length;
    
    found_length = strlen(found_type);
    specified_length = strlen(specified_type);
    
    if(found_length == specified_length){ 
	if((strcmp(found_type, specified_type) == 0)) return(TRUE);}
    return(FALSE);
    
}




More information about the Bio-soft mailing list