/* EGAD: sequence_CHROMOSOME_stuff.cpp

   Navin Pokala and Tracy Handel
   Dept. of Molecular and Cell Biology
   University of California, Berkeley

   Copyright (C) 2003 Regents of the University of California
   GNU Public License
   Aug 12 2003

   Absolutely no warranties are made or are implied with the use of this
   program or its parts.

   This file contains functions converting sequences into CHROMOSOME and
   vice versa.  Useful for output and possibly useful for sequence space
   searching. */

#include "sequence_CHROMOSOME_stuff.h"

/* Writes into sequence the one-letter code of the current choice of every
   gene of chr that is not fixed (fixed_flag == 0) and offers more than one
   residue choice.  Genes are visited from chr->firstgene until the gene
   whose seq_position is ENDFLAG.

   sequence must be allocated by the calling function; it is '\0'-terminated
   here.  chr->genes is used as the walking cursor and is reset to
   chr->firstgene before returning. */
void CHROMOSOME_to_variable_sequence(CHROMOSOME *chr, char *sequence)
{
    int len = 0;

    for (chr->genes = chr->firstgene;
         chr->genes->seq_position != ENDFLAG;
         chr->genes = chr->genes->nextgene)
    {
        if (chr->genes->varpos_ptr->fixed_flag != 0)
            continue;                       /* fixed position: skipped */
        if (chr->genes->varpos_ptr->number_of_choices <= 1)
            continue;                       /* only one choice: skipped */

        sequence[len] = chr->genes->choice_ptr->resparam_ptr->one_letter_code[0];
        ++len;
    }

    chr->genes = chr->firstgene;
    sequence[len] = '\0';
}

/* Writes into sequence the full amino acid sequence for chr: the one-letter
   code of each gene's current choice, interleaved with the restype of the
   entries of invar_pos (per the original author, used to include pro,gly in
   the sequence).

   invar_pos is read starting at index 1 and up to the entry whose
   seq_position is ENDFLAG.  Genes whose seq_position is smaller than the
   next invariable position are emitted first, then that invariable
   position; invariable positions left over after the last gene are appended
   at the end.

   A one-time warning (guarded by a function-level static flag) is printed
   to stderr when two consecutive genes are more than one seq_position apart
   and carry the same seqpos_text; the original author's comment reads this
   as "same chain; we must have missing atoms".

   sequence must be allocated by the calling function; it is '\0'-terminated
   here.  chr->genes is used as the walking cursor and is reset to
   chr->firstgene before returning. */
void CHROMOSOME_to_sequence(CHROMOSOME *chr, INVARIABLE_POSITIONS *invar_pos, char *sequence)
{
    static int missing_atoms_warning = 0;
    int len = 0;        /* number of characters written so far */
    int inv = 1;        /* index of the next unconsumed invar_pos entry */
    int have_invar;

    chr->genes = chr->firstgene;

    while (chr->genes->seq_position != ENDFLAG)
    {
        have_invar = (invar_pos[inv].seq_position != ENDFLAG);

        /* With an invariable position pending, emit only the genes that
           precede it; otherwise emit every remaining gene. */
        while (chr->genes->seq_position != ENDFLAG
               && (!have_invar
                   || chr->genes->seq_position < invar_pos[inv].seq_position))
        {
            sequence[len] = chr->genes->choice_ptr->resparam_ptr->one_letter_code[0];
            ++len;

            if (missing_atoms_warning == 0
                && chr->genes->nextgene != NULL
                /* missing atoms or new chain */
                && chr->genes->nextgene->seq_position - chr->genes->seq_position > 1
                && strcmp(chr->genes->varpos_ptr->seqpos_text_map_ptr->seqpos_text,
                          chr->genes->nextgene->varpos_ptr->seqpos_text_map_ptr->seqpos_text) == 0)
            {
                /* same chain; we must have missing atoms */
                fprintf(stderr,"WARNING CHROMOSOME_to_sequence is being used for a structure with missing atoms.\n");
                fprintf(stderr,"\tThe returned sequence will not reflect these gaps.\n");
                missing_atoms_warning = 1;
            }

            chr->genes = chr->genes->nextgene;
        }

        if (have_invar)
        {
            sequence[len] = invar_pos[inv].restype;
            ++len;
            ++inv;
        }
    }

    /* invariable positions that come after the last gene */
    for (; invar_pos[inv].seq_position != ENDFLAG; ++inv)
    {
        sequence[len] = invar_pos[inv].restype;
        ++len;
    }

    chr->genes = chr->firstgene;
    sequence[len] = '\0';
}
/* Generates a chromosome with the appropriate pointers etc corresponding to a protein sequence chr must be pre-intialized/allocated with a GENE linked list (using inoculate sidechains) sequence must end with '\0' */ void sequence_to_CHROMOSOME(CHROMOSOME *chr, INVARIABLE_POSITIONS *invar_pos, char *sequence) { int seq_ctr,invar_ctr,res; char *errorline=NULL; static int missing_atoms_warning=0; chr->genes = chr->firstgene; seq_ctr=0; invar_ctr=1; while(sequence[seq_ctr]!='\0') { if(invar_pos[invar_ctr].seq_position!=ENDFLAG) /* have invariable positions (ie: pro/gly) */ { while(chr->genes->seq_position < invar_pos[invar_ctr].seq_position && chr->genes->seq_position!=ENDFLAG) { res=1; while(sequence[seq_ctr] != chr->genes->varpos_ptr->choice[res].resparam_ptr->one_letter_code[0] && res <= chr->genes->varpos_ptr->number_of_choices) ++res; if(res>chr->genes->varpos_ptr->number_of_choices) { errorline = (char *)calloc(MAXLINE,sizeof(char)); sprintf(errorline,"ERROR (1) residue %c for position %s is not an available option\n",sequence[seq_ctr], chr->genes->varpos_ptr->seqpos_text_map_ptr->seqpos_text); failure_report(errorline,"exit"); } CHOICE_to_GENE(chr->genes, chr->genes->varpos_ptr->choice[res], res); if(missing_atoms_warning==0) { if(chr->genes->nextgene!=NULL) if(chr->genes->nextgene->seq_position - chr->genes->seq_position > 1) /* missing atoms or new chain */ if(strcmp(chr->genes->varpos_ptr->seqpos_text_map_ptr->seqpos_text, chr->genes->nextgene->varpos_ptr->seqpos_text_map_ptr->seqpos_text)==0) { /* same chain; we must have missing atoms */ failure_report("WARNING sequence_to_CHROMOSOME is being used for a structure with missing atoms.\n\tMake sure that the sequence argument does not include the missing positions.","warn"); missing_atoms_warning=1; } } ++seq_ctr; chr->genes = chr->genes->nextgene; } ++seq_ctr; ++invar_ctr; } else while(chr->genes->seq_position!=ENDFLAG) /* finished w/ invariable positions */ { res=1; while(sequence[seq_ctr] != 
chr->genes->varpos_ptr->choice[res].resparam_ptr->one_letter_code[0] && res <= chr->genes->varpos_ptr->number_of_choices) ++res; if(res>chr->genes->varpos_ptr->number_of_choices) { errorline = (char *)calloc(MAXLINE,sizeof(char)); sprintf(errorline,"ERROR (2) residue %c for position %s is not an available option\n",sequence[seq_ctr], chr->genes->varpos_ptr->seqpos_text_map_ptr->seqpos_text); failure_report(errorline,"exit"); } CHOICE_to_GENE(chr->genes, chr->genes->varpos_ptr->choice[res], res); if(missing_atoms_warning==0) { if(chr->genes->nextgene->seq_position - chr->genes->seq_position > 1) /* missing atoms or new chain */ if(chr->genes->nextgene!=NULL) if(strcmp(chr->genes->varpos_ptr->seqpos_text_map_ptr->seqpos_text, chr->genes->nextgene->varpos_ptr->seqpos_text_map_ptr->seqpos_text)==0) { /* same chain; we must have missing atoms */ failure_report("WARNING sequence_to_CHROMOSOME is being used for a structure with missing atoms.\n\tMake sure that the sequence argument does not include the missing positions.","warn"); missing_atoms_warning=1; } } ++seq_ctr; chr->genes = chr->genes->nextgene; } } chr->genes = chr->firstgene; } /* Generates a chromosome with the appropriate pointers etc corresponding to a sequence of variable positions */ void variable_sequence_to_CHROMOSOME(CHROMOSOME *chr, char *sequence) { int seq_ctr,res; chr->genes = chr->firstgene; seq_ctr=0; while(sequence[seq_ctr]!='\0' && chr->genes->seq_position!=ENDFLAG) { if(chr->genes->varpos_ptr->fixed_flag==0) if(chr->genes->varpos_ptr->number_of_choices>1) { res=1; while(sequence[seq_ctr] != chr->genes->varpos_ptr->choice[res].resparam_ptr->one_letter_code[0] && res <= chr->genes->varpos_ptr->number_of_choices) ++res; if(res>chr->genes->varpos_ptr->number_of_choices) { fprintf(stderr,"ERROR (1) residue %c for position %s is not an available option\n",sequence[seq_ctr], chr->genes->varpos_ptr->seqpos_text_map_ptr->seqpos_text); exit(1); } CHOICE_to_GENE(chr->genes, 
chr->genes->varpos_ptr->choice[res], res); ++seq_ctr; } /* advance to next moving position */ chr->genes = chr->genes->nextgene; } chr->genes = chr->firstgene; } /* this function extracts the sequence composition for all 26 ACDEFGHIKLMNPQRSTVWYcdehky residuetypes in sequence, and places the number of each residue into composition; for example sequence = "AAFGHLMN" composition[1] = 2; (A) composition[5] = 1; (F) composition[6] = 1; (G) composition[7] = 1; (H) composition[11] = 1; (M) composition[12] = 1; (N) */ void extract_composition(int *composition, char *sequence) { char restypes[30]; int i,j; strcpy(restypes,"_ACDEFGHIKLMNPQRSTVWYcdehky"); for(i=0;i<=30;++i) composition[i]=0; i=0; while(sequence[i]!='\0') { j=1; while(restypes[j]!=sequence[i]) ++j; ++composition[j]; ++i; } }