MultiSource/Applications/SIBsim4/sim4.init.c - llvm-test-suite - Git at Google

 /* $Id$
  *
  * Christian Iseli, LICR ITO, Christian.Iseli@licr.org
  *
  * Copyright (c) 2001-2006 Swiss Institute of Bioinformatics.
  * Copyright (C) 1998-2001  Liliana Florea.
  */

 /*
  * TODO
  *  - See why tst23 is split
  *  - See why tst50 has strange alignment
  */

 /*
 * sim4 - Align a cDNA sequence with a genomic sequence for it.
 *
 * The basic command syntax is
 *
 *       sim4 [options] dna.seq rna.seq
 *
 * where dna.seq names a file containing a DNA sequence and rna.seq
 * names a file containing one or more RNA sequences.
 * The files are to be in FASTA format.  Thus a typical sequence file
 * might begin:
 *
 *       >BOVHBPP3I  Bovine beta-globin psi-3 pseudogene, 5' end.
 *       GGAGAATAAAGTTTCTGAGTCTAGACACACTGGATCAGCCAATCACAGATGAAGGGCACT
 *       GAGGAACAGGAGTGCATCTTACATTCCCCCAAACCAATGAACTTGTATTATGCCCTGGGC
 */

 #ifdef __sun
 #define _XOPEN_SOURCE /* tell sun we want getopt, etc.  */
 #define _XPG5 /* and we want snprintf  */
 #endif
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <unistd.h>
 #include <string.h>
 #include <ctype.h>
 #include <errno.h>
 #include <locale.h>
 #include <signal.h>
 #include "sim4.h"
 #include "align.h"
 #include "misc.h"
 #include "sim4b1.h"
 #if defined(DEBUG) && (DEBUG > 1)
 #include <mcheck.h>
 #endif


 static void init_seq(const char *, seq_p_t);
 static int get_next_seq(seq_p_t, unsigned int, int);
 static void seq_revcomp_inplace(seq_p_t);
 static void print_align_lat(uchar *, uchar *, result_p_t);
 static void print_polyA_info(seq_p_t, seq_p_t, collec_p_t, sim4_stats_p_t);
 static void print_res(result_p_t, int, seq_p_t, seq_p_t);
 static void init_splice_junctions(void);
 static void bug_handler(int);
 #ifdef DEBUG
 static void free_seq(seq_p_t);
 #endif

 static const char Usage[] =
 "%s [options] dna est_db\n\n"
 "This is SIBsim4 version 0.14.\n\n"
 #ifdef DEBUG
 "Debug version\n\n"
 #endif
 "Available options (default value in braces[]):\n"
 "  -A <int>  output format\n"
 "             0: exon endpoints only\n"
 "             1: alignment text\n"
 "             3: both exon endpoints and alignment text\n"
 "             4: both exon endpoints and alignment text with polyA info\n"
 "            Note that 2 is unimplemented [%d]\n"
 "  -C <int>  MSP score threshold for the second pass [%d]\n"
 "  -c <int>  minimum score cutoff [%d]\n"
 "  -E <int>  cutoff value [%d]\n"
 "  -f <int>  score filter in percent (0 to disable filtering) [%d]\n"
 "  -g <int>  join exons when gap on genomic and RNA have lengths which\n"
 "            differ at most by this percentage [%d]\n"
 "  -I <int>  window width in which to search for intron splicing [%d]\n"
 "  -K <int>  MSP score threshold for the first pass [%d]\n"
 "  -L <str>  a comma separated list of forward splice-types [%s]\n"
 "  -M <int>  scoring splice sites, evaluate match within M nucleotides [%d]\n"
 "  -o <int>  offset nt positions in dna sequence by this amount [%u]\n"
 "  -q <int>  penalty for a nucleotide mismatch [%d]\n"
 "  -R <int>  direction of search\n"
 "             0: search the '+' (direct) strand only\n"
 "             1: search the '-' strand only\n"
 "             2: search both strands and report the best match\n"
 "            [%d]\n"
 "  -r <int>  reward for a nucleotide match [%d]\n"
 "  -W <int>  word size [%d]\n"
 "  -X <int>  value for terminating word extensions [%d]\n";

 options_t options;
 char *argv0;
 char dna_seq_head[256];
 char rna_seq_head[256];

 int
 main(int argc, char *argv[])
 {
   int count;
   seq_t seq1, seq2;
   hash_env_t he;
   collec_t res, rev_res;
 #if defined(DEBUG) && (DEBUG > 1)
   mcheck(NULL);
   mtrace();
 #endif
   argv0 = argv[0];
   if (setlocale(LC_ALL, "POSIX") == NULL)
     fprintf(stderr, "%s: Warning: could not set locale to POSIX\n", argv[0]);
   signal(SIGSEGV, bug_handler);
 #ifndef __MINGW32__
   signal(SIGBUS, bug_handler);
 #endif
   /* Default options.  */
   options.C = DEFAULT_C;
   options.cutoff = DIST_CUTOFF;
   options.gapPct = DEFAULT_GAPPCT;
   options.intron_window = 6;
   options.K = DEFAULT_K;
   options.splice_type_list = "GTAG,GCAG,GTAC,ATAC";
   options.nbSplice = 4;
   options.scoreSplice_window = 10;
   options.mismatchScore = MISMATCH;
   options.reverse = 2;
   options.matchScore = MATCH;
   options.W = DEFAULT_W;
   options.X = DEFAULT_X;
   options.filterPct = DEFAULT_FILTER;
   options.minScore_cutoff = MATCH_CUTOFF;
   while (1) {
     int c = getopt(argc, argv, "A:C:c:E:f:g:I:K:L:M:o:q:R:r:W:X:");
     if (c == -1)
       break;
     switch (c) {
     case 'A':
       options.ali_flag = atoi(optarg);
       if (options.ali_flag < 0 || options.ali_flag > 4)
 	fatal("A must be one of 0, 1, 2, 3, or 4.\n");
       break;
     case 'C': {
       int val = atoi(optarg);
       if (val < 0)
 	fatal("Value for option C must be non-negative.\n");
       options.C = val;
       break;
     }
     case 'c': {
       int val = atoi(optarg);
       if (val < 0)
 	fatal("Value for option c must be non-negative.\n");
       options.minScore_cutoff = val;
       break;
     }
     case 'E':
       options.cutoff = atoi(optarg);
       if (options.cutoff < 3 || options.cutoff > 10)
 	fatal("Cutoff (E) must be within [3,10].\n");
       break;
     case 'f':
       options.filterPct = atoi(optarg);
       if (options.filterPct > 100)
 	fatal("Filter in percent (f) must be within [0,100].\n");
       break;
     case 'g':
       options.gapPct = atoi(optarg);
       break;
     case 'I':
       options.intron_window = atoi(optarg);
       break;
     case 'K': {
       int val = atoi(optarg);
       if (val < 0)
 	fatal("Value for option K must be non-negative.\n");
       options.K = val;
       break;
     }
     case 'L': {
       size_t i;
       size_t len = strlen(optarg);
       options.splice_type_list = optarg;
       options.nbSplice = 1;
       if (len % 5 != 4)
 	fatal("Splice types list has illegal length (%zu)\n", len);
       for (i = 0; i < len; i++)
 	if (i % 5 == 4) {
 	  if (options.splice_type_list[i] != ',')
 	    fatal("Comma expected instead of %c at position %zu"
 		  "in splice types list.\n",
 		  options.splice_type_list[i], i);
 	  options.nbSplice += 1;
 	} else {
 	  if (options.splice_type_list[i] != 'A'
 	      && options.splice_type_list[i] != 'C'
 	      && options.splice_type_list[i] != 'G'
 	      && options.splice_type_list[i] != 'T')
 	    fatal("Expected 'A', 'C', 'G' or 'T' instead of '%c' at"
 		  "position %zu in splice types list.\n",
 		  options.splice_type_list[i], i);
 	}
       break;
     }
     case 'M': {
       int val = atoi(optarg);
       if (val < 0)
 	fatal("Value for option M must be non-negative.\n");
       options.scoreSplice_window = val;
       break;
     }
     case 'o':
       options.dnaOffset = atoi(optarg);
       break;
     case 'q':
       options.mismatchScore = atoi(optarg);
       break;
     case 'R':
       options.reverse = atoi(optarg);
       if (options.reverse < 0 || options.reverse > 2)
 	fatal("R must be one of 0, 1, or 2.\n");
       break;
     case 'r':
       options.matchScore = atoi(optarg);
       break;
     case 'W':
       options.W = atoi(optarg);
       if (options.W < 1 || options.W > 15)
 	fatal("W must be within [1,15].\n");
       break;
     case 'X':
       options.X = atoi(optarg);
       if (options.X < 1)
 	fatal("X must be positive.\n");
       break;
     case '?':
       break;
     default:
       fprintf(stderr, "?? getopt returned character code 0%o ??\n", c);
     }
   }
   if (optind + 2 != argc) {
     fprintf(stderr, Usage, argv[0], options.ali_flag, options.C,
 	    options.minScore_cutoff, options.cutoff,
 	    options.filterPct, options.gapPct, options.intron_window,
 	    options.K, options.splice_type_list, options.scoreSplice_window,
 	    options.dnaOffset, options.mismatchScore, options.reverse,
 	    options.matchScore, options.W, options.X);
     return 1;
   }

   /* read seq1 */
   init_seq(argv[optind], &seq1);
   if (get_next_seq(&seq1, options.dnaOffset, 1) != 0)
     fatal("Cannot read sequence from %s.\n", argv[optind]);
   strncpy(dna_seq_head, seq1.header, 256);

   /* read seq2 */
   init_seq(argv[optind + 1], &seq2);
   if (get_next_seq(&seq2, 0, 0) != 0)
     fatal("Cannot read sequence from %s.\n", argv[optind + 1]);

   init_encoding();
   init_hash_env(&he, options.W, seq1.seq, seq1.len);
   init_col(&res, 1);
   init_col(&rev_res, 1);
   bld_table(&he);
   init_splice_junctions();

   count = 0;
   while (!count || get_next_seq(&seq2, 0, 0) == 0) {
     unsigned int curRes;
     strncpy(rna_seq_head, seq2.header, 256);
     ++count;

     switch (options.reverse) {
     case  0:
       SIM4(&he, &seq2, &res);
       break;
     case  2:
       SIM4(&he, &seq2, &res);
     case  1:
       seq_revcomp_inplace(&seq2);
       SIM4(&he, &seq2, &rev_res);
       break;
     default:
       fatal ("Unrecognized request for EST orientation.\n");
     }
     /* Keep only the best matches, according to filterPct.  */
     if (options.filterPct > 0) {
       unsigned int max_nmatches = 0;
       for (curRes = 0; curRes < rev_res.nb; curRes++) {
 	result_p_t r = rev_res.e.result[curRes];
 	if (r->st.nmatches > max_nmatches)
 	  max_nmatches = r->st.nmatches;
       }
       for (curRes = 0; curRes < res.nb; curRes++) {
 	result_p_t r = res.e.result[curRes];
 	if (r->st.nmatches > max_nmatches)
 	  max_nmatches = r->st.nmatches;
       }
       max_nmatches = (max_nmatches * options.filterPct) / 100;
       for (curRes = 0; curRes < rev_res.nb; curRes++) {
 	result_p_t r = rev_res.e.result[curRes];
 	if (r->st.nmatches < max_nmatches)
 	  r->st.nmatches = 0;
       }
       for (curRes = 0; curRes < res.nb; curRes++) {
 	result_p_t r = res.e.result[curRes];
 	if (r->st.nmatches < max_nmatches)
 	  r->st.nmatches = 0;
       }
     }
     /* Now, print results.  */
     for (curRes = 0; curRes < rev_res.nb; curRes++)
       print_res(rev_res.e.result[curRes], 1, &seq1, &seq2);
     rev_res.nb = 0;
     if (options.reverse && options.ali_flag)
       /* reverse-complement back seq2 for alignment */
       seq_revcomp_inplace(&seq2);
     for (curRes = 0; curRes < res.nb; curRes++)
       print_res(res.e.result[curRes], 0, &seq1, &seq2);
     res.nb = 0;
   }
 #ifdef DEBUG
   fprintf(stderr, "DEBUG mode: freeing all memory...\n");
   fflush(stdout);
   fflush(stderr);
   free_hash_env(&he);
   free_seq(&seq1);
   free_seq(&seq2);
   free(options.splice);
   free(res.e.elt);
   free(rev_res.e.elt);
 #endif
   return 0;
 }

 static const unsigned char dna_complement[256] =
   "                                                                "
   " TVGH  CD  M KN   YSA BWXR       tvgh  cd  m kn   ysa bwxr      "
   "                                                                "
   "                                                                ";
 /* ................................................................ */
 /* @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~. */
 /* ................................................................ */
 /* ................................................................ */

 static void
 init_splice_junctions(void)
 {
   unsigned int i;
   options.splice = (junction_p_t) xmalloc(options.nbSplice
 			  		  * sizeof(junction_t));
   for (i = 0; i < options.nbSplice; i++) {
     unsigned int j;
     for (j = 0; j < 4; j++) {
       uchar c = options.splice_type_list[i * 5 + j];
       options.splice[i].fwd[j] = c;
       options.splice[i].rev[3 - j] = dna_complement[c];
     }
   }
 #ifdef DEBUG
   for (i = 0; i < options.nbSplice; i++)
     fprintf(stderr, "Splice[%u]: %.4s %.4s\n",
 	    i, options.splice[i].fwd, options.splice[i].rev);
 #endif
 }

 static void
 print_res(result_p_t res, int rev, seq_p_t seq1, seq_p_t seq2)
 {
   unsigned int i;
   if (res->st.nmatches >= options.minScore_cutoff) {
     printf("\n%s%s\n", seq1->header, seq2->header);
     if (rev)
       printf("(complement)\n\n");
     switch (options.ali_flag) {
     case 0:
       print_exons(&res->eCol, res->direction);
       break;
     case 1:
       print_align_lat(seq1->seq, seq2->seq, res);
       break;
     case 3:
       print_exons(&res->eCol, res->direction);
       print_align_lat(seq1->seq, seq2->seq, res);
       break;
     case 4:
       print_exons(&res->eCol, res->direction);
       print_polyA_info(seq1, seq2, &res->eCol, &res->st);
       print_align_lat(seq1->seq, seq2->seq, res);
       break;
     default:
       fatal("Unrecognized option for alignment output.\n");
     }
     printf("\n");
   }
   for (i = 0; i < res->eCol.nb; i++)
     free(res->eCol.e.elt[i]);
   free(res->eCol.e.elt);
   if (res->sList)
     free_align(res->sList);
   free(res);
 }

 static void
 print_polyA_info(seq_p_t s1, seq_p_t s2, collec_p_t eCol,
 		 sim4_stats_p_t st)
 {
   if (st->polyA_cut) {
     unsigned int cnt = 0, cntDna = 0, pos, i, scanLen = 50;
     char *pSig, buf[51];
     exon_p_t e = eCol->e.exon[eCol->nb - 1];
     for (pos = 0; pos < 10 && e->to2 + pos < s2->len; pos++)
       if (s2->seq[e->to2 + pos] == 'A')
 	cnt += 1;
     while (e->to2 + pos < s2->len && s2->seq[e->to2 + pos] == 'A') {
       pos += 1;
       cnt += 1;
     }
     for (i = 0; i < s1->len && i < pos; i++)
       if (s1->seq[e->to1 + i] == 'A')
 	cntDna += 1;
     printf("\nPolyA site %u nt, %u/%u A's %u\n R %.*s %u\n D %*.*s %u\n",
 	   pos, cnt, cntDna, e->to1 + 1 + options.dnaOffset,
 	   pos, s2->seq + e->to2, e->to2 + 1,
 	   pos, i, s1->seq + e->to1, e->to1 + 1 + options.dnaOffset);
     if (e->to1 < scanLen)
       scanLen = e->to1;
     strncpy(buf, (char *) s1->seq + e->to1 - scanLen, scanLen);
     buf[scanLen] = 0;
     pSig = strstr(buf, "AATAAA");
     if (pSig == NULL)
       pSig = strstr(buf, "ATTAAA");
     if (pSig != NULL)
       printf("PolyA signal %u\n",
 	     (unsigned int) (pSig - buf + e->to1 - scanLen + 1
 			     + options.dnaOffset));
   }
   if (st->polyT_cut) {
     unsigned int cnt = 0, cntDna = 0, pos, i;
     char *pSig, buf[51];
     exon_p_t e = eCol->e.exon[0];
     for (pos = 0; pos < 10 && pos < e->from2 - 1; pos++)
       if (s2->seq[e->from2 - 2 - pos] == 'T')
 	cnt += 1;
     while (pos < e->from2 - 1 && s2->seq[e->from2 - 2 - pos] == 'T') {
       pos += 1;
       cnt += 1;
     }
     for (i = 0; i < e->from1 - 1 && i < pos; i++)
       if (s1->seq[e->from1 - 2 - i] == 'T')
 	cntDna += 1;
     printf("\nPolyA site %u nt, %u/%u A's %u minus strand\n R %.*s %u\n D %*.*s %u\n",
 	   pos, cnt, cntDna, e->from1 - 1 + options.dnaOffset,
 	   pos, s2->seq + (e->from2 - 1 - pos), e->from2 - 1,
 	   pos, i, s1->seq + (e->from1 - 1 - i),
 	   e->from1 - 1 + options.dnaOffset);
     strncpy(buf, (char *) s1->seq + e->from1 - 1, 50);
     buf[50] = 0;
     pSig = strstr(buf, "TTTATT");
     if (pSig == NULL)
       pSig = strstr(buf, "TTTAAT");
     if (pSig != NULL)
       printf("PolyA signal %u minus strand\n",
 	     (unsigned int) (pSig - buf + e->from1 + 5 + options.dnaOffset));
   }
 }

 static void
 print_align_lat(uchar *seq1, uchar *seq2, result_p_t r)
 {
   int *S;
   edit_script_list_p_t head, aligns;

   if (r->sList == NULL)
     return;
   aligns = r->sList;
   while (aligns != NULL) {
     head = aligns;
     aligns = aligns->next_script;
     S = (int *) xmalloc((2 * head->len2 + 1 + 1) * sizeof(int));
     S++;
     S2A(head->script, S, 0);
     Free_script(head->script);
     IDISPLAY(seq1 + head->offset1 - 1 - 1, seq2 + head->offset2 - 1 - 1,
 	     head->len1, head->len2, S,
 	     head->offset1, head->offset2, &r->eCol, r->direction);
     free(S - 1);
     free(head);
   }
   r->sList = NULL;
 }

 #ifdef DEBUG
 static void
 free_buf(read_buf_p_t b)
 {
   free(b->line);
 }

 static void
 free_seq(seq_p_t sp)
 {
   free(sp->seq);
   free(sp->header);
   free_buf(&sp->rb);
   if (sp->fName != NULL)
     close(sp->fd);
 }
 #endif

 static void
 grow_read_buf(read_buf_p_t b)
 {
   b->lmax += BUF_SIZE;
   b->line = xrealloc(b->line, b->lmax * sizeof(char));
 }

 static char *
 shuffle_line(read_buf_p_t b, size_t *cur)
 {
   if (b->ic == 0 || *cur >= b->ic)
     return NULL;
   /* Make sure we have enough room in line.  */
   if (b->lmax <= b->lc + (b->ic - *cur))
     grow_read_buf(b);
   while (*cur < b->ic && b->in[*cur] != '\n')
     b->line[b->lc++] = b->in[(*cur)++];
   if (*cur < b->ic) {
     /* Ok, we have our string.  */
     /* Copy the newline.  */
     b->line[b->lc++] = b->in[(*cur)++];
     /* We should be fine, since we read BUF_SIZE -1 at most...  */
     b->line[b->lc] = 0;
     /* Adjust the input buffer.  */
     if (*cur < b->ic) {
       memmove(b->in, b->in + *cur, (b->ic - *cur) * sizeof(char));
       b->ic -= *cur;
     } else
       b->ic = 0;
     *cur = 0;
     return b->line;
   }
   /* Go read some more.  */
   b->ic = 0, *cur = 0;
   return NULL;
 }

 static char *
 read_line_buf(read_buf_p_t b, int fd)
 {
   char *s = NULL;
   ssize_t rc;
   size_t cur = 0;
   b->lc = 0;
   if ((s = shuffle_line(b, &cur)) != NULL)
     return s;
   do {
     if ((rc = read(fd, b->in + b->ic, BUF_SIZE - b->ic - 1)) == -1) {
       if (errno != EINTR)
 	fatal("Could not read from %d: %s(%d)\n",
 	      fd, strerror(errno), errno);
     } else
       b->ic += rc;
     s = shuffle_line(b, &cur);
     if (s == NULL && rc == 0) {
       /* Got to the EOF...  */
       b->line[b->lc] = 0;
       s = b->line;
     }
   } while (s == NULL);
   return s;
 }

 static void
 init_buf(read_buf_p_t b)
 {
   b->line = xmalloc(BUF_SIZE * sizeof(char));
   b->lmax = BUF_SIZE;
   b->lc = 0;
   b->ic = 0;
 }

 static void
 init_seq(const char *fName, seq_p_t sp)
 {
   sp->fName = fName;
   sp->header = NULL;
   sp->seq = NULL;
   init_buf(&sp->rb);
   if (fName != NULL) {
     sp->fd = open(fName, O_RDONLY);
     if (sp->fd == -1)
       fatal("Could not open file %s: %s(%d)\n",
 	    fName, strerror(errno), errno);
   } else
     sp->fd = 0;
   sp->len = 0;
   sp->maxHead = 0;
   sp->max = 0;
   read_line_buf(&sp->rb, sp->fd);
 }

 static int
 get_next_seq(seq_p_t sp, unsigned int offset, int warnMultiSeq)
 {
   const int lenStr = 24;
   unsigned int headerLen;
   char *buf = sp->rb.line;
   int res;
   while (sp->rb.lc > 0 && buf[0] != '>')
     buf = read_line_buf(&sp->rb, sp->fd);
   if (sp->rb.lc == 0)
     return -1;
   /* We have the FASTA header.  */
   if (sp->rb.lc + lenStr + 1 > sp->maxHead) {
     sp->maxHead = sp->rb.lc + lenStr + 1;
     sp->header = (char *) xrealloc(sp->header, sp->maxHead * sizeof(char));
   }
   headerLen = sp->rb.lc;
   memcpy(sp->header, buf, (sp->rb.lc + 1) * sizeof(char));
   sp->len = 0;
   buf = read_line_buf(&sp->rb, sp->fd);
   while (sp->rb.lc > 0 && buf[0] != '>') {
     unsigned char c;
     /* Make sure we have enough room for this additional line.  */
     if (sp->len + sp->rb.lc + 1 > sp->max) {
       sp->max = max(sp->len + sp->rb.lc + 1,
 		    sp->max + 0x40000);
       sp->seq = (unsigned char *)
 	xrealloc(sp->seq, sp->max * sizeof(unsigned char));
     }
     while ((c = *buf++) != 0) {
       if (isupper(c)) {
 	sp->seq[sp->len++] = c;
       } else if (islower(c)) {
 	sp->seq[sp->len++] = toupper(c);
       }
     }
     buf = read_line_buf(&sp->rb, sp->fd);
   }
   if (warnMultiSeq && sp->rb.lc > 0)
     fprintf(stderr, "\n"
 	    "***  WARNING                                           ***\n"
 	    "***  there appears to be several sequences in the DNA  ***\n"
 	    "***  sequence file.  Only the first one will be used,  ***\n"
 	    "***  which might not be what was intended.             ***\n"
 	    "\n");
   sp->seq[sp->len] = 0;
   buf = strstr(sp->header, "; LEN=");
   if (buf) {
     char *s = buf + 6;
     headerLen -= 6;
     while (isdigit(*s)) {
       s += 1;
       headerLen -= 1;
     }
     while (*s)
       *buf++ = *s++;
   }
   buf = sp->header + headerLen - 1;
   while (iscntrl(*buf) || isspace(*buf))
     buf -= 1;
   res = snprintf(buf + 1, lenStr, "; LEN=%u\n", sp->len + offset);
   if (res < 0 || res >= lenStr)
     fatal("Sequence too long: %u\n", sp->len);
   return 0;
 }

 static void
 seq_revcomp_inplace(seq_p_t seq)
 {
   unsigned char *s = seq->seq;
   unsigned char *t = seq->seq + seq->len;
   unsigned char c;
   while (s < t) {
     c = dna_complement[*--t];
     *t = dna_complement[*s];
     *s++ = c;
   }
 }

 static void
 bug_handler(int signum)
 {
   fflush(stdout);
   fflush(stderr);
   fprintf(stderr, "\nCaught signal %d while processing:\n%.256s\n%.256s\n",
 	  signum, dna_seq_head, rna_seq_head);
   abort();
 }
	/* $Id$
	*
	* Christian Iseli, LICR ITO, Christian.Iseli@licr.org
	*
	* Copyright (c) 2001-2006 Swiss Institute of Bioinformatics.
	* Copyright (C) 1998-2001 Liliana Florea.
	*/

	/*
	* TODO
	* - See why tst23 is split
	* - See why tst50 has strange alignment
	*/

	/*
	* sim4 - Align a cDNA sequence with a genomic sequence for it.
	*
	* The basic command syntax is
	*
	* sim4 [options] dna.seq rna.seq
	*
	* where dna.seq names a file containing a DNA sequence and rna.seq
	* names a file containing one or more RNA sequences.
	* The files are to be in FASTA format. Thus a typical sequence file
	* might begin:
	*
	* >BOVHBPP3I Bovine beta-globin psi-3 pseudogene, 5' end.
	* GGAGAATAAAGTTTCTGAGTCTAGACACACTGGATCAGCCAATCACAGATGAAGGGCACT
	* GAGGAACAGGAGTGCATCTTACATTCCCCCAAACCAATGAACTTGTATTATGCCCTGGGC
	*/

	#ifdef __sun
	#define _XOPEN_SOURCE /* tell sun we want getopt, etc. */
	#define _XPG5 /* and we want snprintf */
	#endif
	#include <sys/types.h>
	#include <sys/stat.h>
	#include <fcntl.h>
	#include <stdlib.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <string.h>
	#include <ctype.h>
	#include <errno.h>
	#include <locale.h>
	#include <signal.h>
	#include "sim4.h"
	#include "align.h"
	#include "misc.h"
	#include "sim4b1.h"
	#if defined(DEBUG) && (DEBUG > 1)
	#include <mcheck.h>
	#endif


	static void init_seq(const char *, seq_p_t);
	static int get_next_seq(seq_p_t, unsigned int, int);
	static void seq_revcomp_inplace(seq_p_t);
	static void print_align_lat(uchar , uchar , result_p_t);
	static void print_polyA_info(seq_p_t, seq_p_t, collec_p_t, sim4_stats_p_t);
	static void print_res(result_p_t, int, seq_p_t, seq_p_t);
	static void init_splice_junctions(void);
	static void bug_handler(int);
	#ifdef DEBUG
	static void free_seq(seq_p_t);
	#endif

	static const char Usage[] =
	"%s [options] dna est_db\n\n"
	"This is SIBsim4 version 0.14.\n\n"
	#ifdef DEBUG
	"Debug version\n\n"
	#endif
	"Available options (default value in braces[]):\n"
	" -A <int> output format\n"
	" 0: exon endpoints only\n"
	" 1: alignment text\n"
	" 3: both exon endpoints and alignment text\n"
	" 4: both exon endpoints and alignment text with polyA info\n"
	" Note that 2 is unimplemented [%d]\n"
	" -C <int> MSP score threshold for the second pass [%d]\n"
	" -c <int> minimum score cutoff [%d]\n"
	" -E <int> cutoff value [%d]\n"
	" -f <int> score filter in percent (0 to disable filtering) [%d]\n"
	" -g <int> join exons when gap on genomic and RNA have lengths which\n"
	" differ at most by this percentage [%d]\n"
	" -I <int> window width in which to search for intron splicing [%d]\n"
	" -K <int> MSP score threshold for the first pass [%d]\n"
	" -L <str> a comma separated list of forward splice-types [%s]\n"
	" -M <int> scoring splice sites, evaluate match within M nucleotides [%d]\n"
	" -o <int> offset nt positions in dna sequence by this amount [%u]\n"
	" -q <int> penalty for a nucleotide mismatch [%d]\n"
	" -R <int> direction of search\n"
	" 0: search the '+' (direct) strand only\n"
	" 1: search the '-' strand only\n"
	" 2: search both strands and report the best match\n"
	" [%d]\n"
	" -r <int> reward for a nucleotide match [%d]\n"
	" -W <int> word size [%d]\n"
	" -X <int> value for terminating word extensions [%d]\n";

	options_t options;
	char *argv0;
	char dna_seq_head[256];
	char rna_seq_head[256];

	int
	main(int argc, char *argv[])
	{
	int count;
	seq_t seq1, seq2;
	hash_env_t he;
	collec_t res, rev_res;
	#if defined(DEBUG) && (DEBUG > 1)
	mcheck(NULL);
	mtrace();
	#endif
	argv0 = argv[0];
	if (setlocale(LC_ALL, "POSIX") == NULL)
	fprintf(stderr, "%s: Warning: could not set locale to POSIX\n", argv[0]);
	signal(SIGSEGV, bug_handler);
	#ifndef __MINGW32__
	signal(SIGBUS, bug_handler);
	#endif
	/* Default options. */
	options.C = DEFAULT_C;
	options.cutoff = DIST_CUTOFF;
	options.gapPct = DEFAULT_GAPPCT;
	options.intron_window = 6;
	options.K = DEFAULT_K;
	options.splice_type_list = "GTAG,GCAG,GTAC,ATAC";
	options.nbSplice = 4;
	options.scoreSplice_window = 10;
	options.mismatchScore = MISMATCH;
	options.reverse = 2;
	options.matchScore = MATCH;
	options.W = DEFAULT_W;
	options.X = DEFAULT_X;
	options.filterPct = DEFAULT_FILTER;
	options.minScore_cutoff = MATCH_CUTOFF;
	while (1) {
	int c = getopt(argc, argv, "A:C:c:E:f:g:I:K:L:M:o:q:R:r:W:X:");
	if (c == -1)
	break;
	switch (c) {
	case 'A':
	options.ali_flag = atoi(optarg);
	if (options.ali_flag < 0 \|\| options.ali_flag > 4)
	fatal("A must be one of 0, 1, 2, 3, or 4.\n");
	break;
	case 'C': {
	int val = atoi(optarg);
	if (val < 0)
	fatal("Value for option C must be non-negative.\n");
	options.C = val;
	break;
	}
	case 'c': {
	int val = atoi(optarg);
	if (val < 0)
	fatal("Value for option c must be non-negative.\n");
	options.minScore_cutoff = val;
	break;
	}
	case 'E':
	options.cutoff = atoi(optarg);
	if (options.cutoff < 3 \|\| options.cutoff > 10)
	fatal("Cutoff (E) must be within [3,10].\n");
	break;
	case 'f':
	options.filterPct = atoi(optarg);
	if (options.filterPct > 100)
	fatal("Filter in percent (f) must be within [0,100].\n");
	break;
	case 'g':
	options.gapPct = atoi(optarg);
	break;
	case 'I':
	options.intron_window = atoi(optarg);
	break;
	case 'K': {
	int val = atoi(optarg);
	if (val < 0)
	fatal("Value for option K must be non-negative.\n");
	options.K = val;
	break;
	}
	case 'L': {
	size_t i;
	size_t len = strlen(optarg);
	options.splice_type_list = optarg;
	options.nbSplice = 1;
	if (len % 5 != 4)
	fatal("Splice types list has illegal length (%zu)\n", len);
	for (i = 0; i < len; i++)
	if (i % 5 == 4) {
	if (options.splice_type_list[i] != ',')
	fatal("Comma expected instead of %c at position %zu"
	"in splice types list.\n",
	options.splice_type_list[i], i);
	options.nbSplice += 1;
	} else {
	if (options.splice_type_list[i] != 'A'
	&& options.splice_type_list[i] != 'C'
	&& options.splice_type_list[i] != 'G'
	&& options.splice_type_list[i] != 'T')
	fatal("Expected 'A', 'C', 'G' or 'T' instead of '%c' at"
	"position %zu in splice types list.\n",
	options.splice_type_list[i], i);
	}
	break;
	}
	case 'M': {
	int val = atoi(optarg);
	if (val < 0)
	fatal("Value for option M must be non-negative.\n");
	options.scoreSplice_window = val;
	break;
	}
	case 'o':
	options.dnaOffset = atoi(optarg);
	break;
	case 'q':
	options.mismatchScore = atoi(optarg);
	break;
	case 'R':
	options.reverse = atoi(optarg);
	if (options.reverse < 0 \|\| options.reverse > 2)
	fatal("R must be one of 0, 1, or 2.\n");
	break;
	case 'r':
	options.matchScore = atoi(optarg);
	break;
	case 'W':
	options.W = atoi(optarg);
	if (options.W < 1 \|\| options.W > 15)
	fatal("W must be within [1,15].\n");
	break;
	case 'X':
	options.X = atoi(optarg);
	if (options.X < 1)
	fatal("X must be positive.\n");
	break;
	case '?':
	break;
	default:
	fprintf(stderr, "?? getopt returned character code 0%o ??\n", c);
	}
	}
	if (optind + 2 != argc) {
	fprintf(stderr, Usage, argv[0], options.ali_flag, options.C,
	options.minScore_cutoff, options.cutoff,
	options.filterPct, options.gapPct, options.intron_window,
	options.K, options.splice_type_list, options.scoreSplice_window,
	options.dnaOffset, options.mismatchScore, options.reverse,
	options.matchScore, options.W, options.X);
	return 1;
	}

	/* read seq1 */
	init_seq(argv[optind], &seq1);
	if (get_next_seq(&seq1, options.dnaOffset, 1) != 0)
	fatal("Cannot read sequence from %s.\n", argv[optind]);
	strncpy(dna_seq_head, seq1.header, 256);

	/* read seq2 */
	init_seq(argv[optind + 1], &seq2);
	if (get_next_seq(&seq2, 0, 0) != 0)
	fatal("Cannot read sequence from %s.\n", argv[optind + 1]);

	init_encoding();
	init_hash_env(&he, options.W, seq1.seq, seq1.len);
	init_col(&res, 1);
	init_col(&rev_res, 1);
	bld_table(&he);
	init_splice_junctions();

	count = 0;
	while (!count \|\| get_next_seq(&seq2, 0, 0) == 0) {
	unsigned int curRes;
	strncpy(rna_seq_head, seq2.header, 256);
	++count;

	switch (options.reverse) {
	case 0:
	SIM4(&he, &seq2, &res);
	break;
	case 2:
	SIM4(&he, &seq2, &res);
	case 1:
	seq_revcomp_inplace(&seq2);
	SIM4(&he, &seq2, &rev_res);
	break;
	default:
	fatal ("Unrecognized request for EST orientation.\n");
	}
	/* Keep only the best matches, according to filterPct. */
	if (options.filterPct > 0) {
	unsigned int max_nmatches = 0;
	for (curRes = 0; curRes < rev_res.nb; curRes++) {
	result_p_t r = rev_res.e.result[curRes];
	if (r->st.nmatches > max_nmatches)
	max_nmatches = r->st.nmatches;
	}
	for (curRes = 0; curRes < res.nb; curRes++) {
	result_p_t r = res.e.result[curRes];
	if (r->st.nmatches > max_nmatches)
	max_nmatches = r->st.nmatches;
	}
	max_nmatches = (max_nmatches * options.filterPct) / 100;
	for (curRes = 0; curRes < rev_res.nb; curRes++) {
	result_p_t r = rev_res.e.result[curRes];
	if (r->st.nmatches < max_nmatches)
	r->st.nmatches = 0;
	}
	for (curRes = 0; curRes < res.nb; curRes++) {
	result_p_t r = res.e.result[curRes];
	if (r->st.nmatches < max_nmatches)
	r->st.nmatches = 0;
	}
	}
	/* Now, print results. */
	for (curRes = 0; curRes < rev_res.nb; curRes++)
	print_res(rev_res.e.result[curRes], 1, &seq1, &seq2);
	rev_res.nb = 0;
	if (options.reverse && options.ali_flag)
	/* reverse-complement back seq2 for alignment */
	seq_revcomp_inplace(&seq2);
	for (curRes = 0; curRes < res.nb; curRes++)
	print_res(res.e.result[curRes], 0, &seq1, &seq2);
	res.nb = 0;
	}
	#ifdef DEBUG
	fprintf(stderr, "DEBUG mode: freeing all memory...\n");
	fflush(stdout);
	fflush(stderr);
	free_hash_env(&he);
	free_seq(&seq1);
	free_seq(&seq2);
	free(options.splice);
	free(res.e.elt);
	free(rev_res.e.elt);
	#endif
	return 0;
	}

	static const unsigned char dna_complement[256] =
	" "
	" TVGH CD M KN YSA BWXR tvgh cd m kn ysa bwxr "
	" "
	" ";
	/* ................................................................ */
	/* @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{\|}~. */
	/* ................................................................ */
	/* ................................................................ */

	static void
	init_splice_junctions(void)
	{
	unsigned int i;
	options.splice = (junction_p_t) xmalloc(options.nbSplice
	* sizeof(junction_t));
	for (i = 0; i < options.nbSplice; i++) {
	unsigned int j;
	for (j = 0; j < 4; j++) {
	uchar c = options.splice_type_list[i * 5 + j];
	options.splice[i].fwd[j] = c;
	options.splice[i].rev[3 - j] = dna_complement[c];
	}
	}
	#ifdef DEBUG
	for (i = 0; i < options.nbSplice; i++)
	fprintf(stderr, "Splice[%u]: %.4s %.4s\n",
	i, options.splice[i].fwd, options.splice[i].rev);
	#endif
	}

	static void
	print_res(result_p_t res, int rev, seq_p_t seq1, seq_p_t seq2)
	{
	unsigned int i;
	if (res->st.nmatches >= options.minScore_cutoff) {
	printf("\n%s%s\n", seq1->header, seq2->header);
	if (rev)
	printf("(complement)\n\n");
	switch (options.ali_flag) {
	case 0:
	print_exons(&res->eCol, res->direction);
	break;
	case 1:
	print_align_lat(seq1->seq, seq2->seq, res);
	break;
	case 3:
	print_exons(&res->eCol, res->direction);
	print_align_lat(seq1->seq, seq2->seq, res);
	break;
	case 4:
	print_exons(&res->eCol, res->direction);
	print_polyA_info(seq1, seq2, &res->eCol, &res->st);
	print_align_lat(seq1->seq, seq2->seq, res);
	break;
	default:
	fatal("Unrecognized option for alignment output.\n");
	}
	printf("\n");
	}
	for (i = 0; i < res->eCol.nb; i++)
	free(res->eCol.e.elt[i]);
	free(res->eCol.e.elt);
	if (res->sList)
	free_align(res->sList);
	free(res);
	}

	static void
	print_polyA_info(seq_p_t s1, seq_p_t s2, collec_p_t eCol,
	sim4_stats_p_t st)
	{
	if (st->polyA_cut) {
	unsigned int cnt = 0, cntDna = 0, pos, i, scanLen = 50;
	char *pSig, buf[51];
	exon_p_t e = eCol->e.exon[eCol->nb - 1];
	for (pos = 0; pos < 10 && e->to2 + pos < s2->len; pos++)
	if (s2->seq[e->to2 + pos] == 'A')
	cnt += 1;
	while (e->to2 + pos < s2->len && s2->seq[e->to2 + pos] == 'A') {
	pos += 1;
	cnt += 1;
	}
	for (i = 0; i < s1->len && i < pos; i++)
	if (s1->seq[e->to1 + i] == 'A')
	cntDna += 1;
	printf("\nPolyA site %u nt, %u/%u A's %u\n R %.s %u\n D %.*s %u\n",
	pos, cnt, cntDna, e->to1 + 1 + options.dnaOffset,
	pos, s2->seq + e->to2, e->to2 + 1,
	pos, i, s1->seq + e->to1, e->to1 + 1 + options.dnaOffset);
	if (e->to1 < scanLen)
	scanLen = e->to1;
	strncpy(buf, (char *) s1->seq + e->to1 - scanLen, scanLen);
	buf[scanLen] = 0;
	pSig = strstr(buf, "AATAAA");
	if (pSig == NULL)
	pSig = strstr(buf, "ATTAAA");
	if (pSig != NULL)
	printf("PolyA signal %u\n",
	(unsigned int) (pSig - buf + e->to1 - scanLen + 1
	+ options.dnaOffset));
	}
	if (st->polyT_cut) {
	unsigned int cnt = 0, cntDna = 0, pos, i;
	char *pSig, buf[51];
	exon_p_t e = eCol->e.exon[0];
	for (pos = 0; pos < 10 && pos < e->from2 - 1; pos++)
	if (s2->seq[e->from2 - 2 - pos] == 'T')
	cnt += 1;
	while (pos < e->from2 - 1 && s2->seq[e->from2 - 2 - pos] == 'T') {
	pos += 1;
	cnt += 1;
	}
	for (i = 0; i < e->from1 - 1 && i < pos; i++)
	if (s1->seq[e->from1 - 2 - i] == 'T')
	cntDna += 1;
	printf("\nPolyA site %u nt, %u/%u A's %u minus strand\n R %.s %u\n D %.*s %u\n",
	pos, cnt, cntDna, e->from1 - 1 + options.dnaOffset,
	pos, s2->seq + (e->from2 - 1 - pos), e->from2 - 1,
	pos, i, s1->seq + (e->from1 - 1 - i),
	e->from1 - 1 + options.dnaOffset);
	strncpy(buf, (char *) s1->seq + e->from1 - 1, 50);
	buf[50] = 0;
	pSig = strstr(buf, "TTTATT");
	if (pSig == NULL)
	pSig = strstr(buf, "TTTAAT");
	if (pSig != NULL)
	printf("PolyA signal %u minus strand\n",
	(unsigned int) (pSig - buf + e->from1 + 5 + options.dnaOffset));
	}
	}

	static void
	print_align_lat(uchar seq1, uchar seq2, result_p_t r)
	{
	int *S;
	edit_script_list_p_t head, aligns;

	if (r->sList == NULL)
	return;
	aligns = r->sList;
	while (aligns != NULL) {
	head = aligns;
	aligns = aligns->next_script;
	S = (int ) xmalloc((2 head->len2 + 1 + 1) * sizeof(int));
	S++;
	S2A(head->script, S, 0);
	Free_script(head->script);
	IDISPLAY(seq1 + head->offset1 - 1 - 1, seq2 + head->offset2 - 1 - 1,
	head->len1, head->len2, S,
	head->offset1, head->offset2, &r->eCol, r->direction);
	free(S - 1);
	free(head);
	}
	r->sList = NULL;
	}

	#ifdef DEBUG
	static void
	free_buf(read_buf_p_t b)
	{
	free(b->line);
	}

	static void
	free_seq(seq_p_t sp)
	{
	free(sp->seq);
	free(sp->header);
	free_buf(&sp->rb);
	if (sp->fName != NULL)
	close(sp->fd);
	}
	#endif

	static void
	grow_read_buf(read_buf_p_t b)
	{
	b->lmax += BUF_SIZE;
	b->line = xrealloc(b->line, b->lmax * sizeof(char));
	}

	static char *
	shuffle_line(read_buf_p_t b, size_t *cur)
	{
	if (b->ic == 0 \|\| *cur >= b->ic)
	return NULL;
	/* Make sure we have enough room in line. */
	if (b->lmax <= b->lc + (b->ic - *cur))
	grow_read_buf(b);
	while (cur < b->ic && b->in[cur] != '\n')
	b->line[b->lc++] = b->in[(*cur)++];
	if (*cur < b->ic) {
	/* Ok, we have our string. */
	/* Copy the newline. */
	b->line[b->lc++] = b->in[(*cur)++];
	/* We should be fine, since we read BUF_SIZE -1 at most... */
	b->line[b->lc] = 0;
	/* Adjust the input buffer. */
	if (*cur < b->ic) {
	memmove(b->in, b->in + cur, (b->ic - cur) * sizeof(char));
	b->ic -= *cur;
	} else
	b->ic = 0;
	*cur = 0;
	return b->line;
	}
	/* Go read some more. */
	b->ic = 0, *cur = 0;
	return NULL;
	}

	static char *
	read_line_buf(read_buf_p_t b, int fd)
	{
	char *s = NULL;
	ssize_t rc;
	size_t cur = 0;
	b->lc = 0;
	if ((s = shuffle_line(b, &cur)) != NULL)
	return s;
	do {
	if ((rc = read(fd, b->in + b->ic, BUF_SIZE - b->ic - 1)) == -1) {
	if (errno != EINTR)
	fatal("Could not read from %d: %s(%d)\n",
	fd, strerror(errno), errno);
	} else
	b->ic += rc;
	s = shuffle_line(b, &cur);
	if (s == NULL && rc == 0) {
	/* Got to the EOF... */
	b->line[b->lc] = 0;
	s = b->line;
	}
	} while (s == NULL);
	return s;
	}

	static void
	init_buf(read_buf_p_t b)
	{
	b->line = xmalloc(BUF_SIZE * sizeof(char));
	b->lmax = BUF_SIZE;
	b->lc = 0;
	b->ic = 0;
	}

	static void
	init_seq(const char *fName, seq_p_t sp)
	{
	sp->fName = fName;
	sp->header = NULL;
	sp->seq = NULL;
	init_buf(&sp->rb);
	if (fName != NULL) {
	sp->fd = open(fName, O_RDONLY);
	if (sp->fd == -1)
	fatal("Could not open file %s: %s(%d)\n",
	fName, strerror(errno), errno);
	} else
	sp->fd = 0;
	sp->len = 0;
	sp->maxHead = 0;
	sp->max = 0;
	read_line_buf(&sp->rb, sp->fd);
	}

	static int
	get_next_seq(seq_p_t sp, unsigned int offset, int warnMultiSeq)
	{
	const int lenStr = 24;
	unsigned int headerLen;
	char *buf = sp->rb.line;
	int res;
	while (sp->rb.lc > 0 && buf[0] != '>')
	buf = read_line_buf(&sp->rb, sp->fd);
	if (sp->rb.lc == 0)
	return -1;
	/* We have the FASTA header. */
	if (sp->rb.lc + lenStr + 1 > sp->maxHead) {
	sp->maxHead = sp->rb.lc + lenStr + 1;
	sp->header = (char ) xrealloc(sp->header, sp->maxHead sizeof(char));
	}
	headerLen = sp->rb.lc;
	memcpy(sp->header, buf, (sp->rb.lc + 1) * sizeof(char));
	sp->len = 0;
	buf = read_line_buf(&sp->rb, sp->fd);
	while (sp->rb.lc > 0 && buf[0] != '>') {
	unsigned char c;
	/* Make sure we have enough room for this additional line. */
	if (sp->len + sp->rb.lc + 1 > sp->max) {
	sp->max = max(sp->len + sp->rb.lc + 1,
	sp->max + 0x40000);
	sp->seq = (unsigned char *)
	xrealloc(sp->seq, sp->max * sizeof(unsigned char));
	}
	while ((c = *buf++) != 0) {
	if (isupper(c)) {
	sp->seq[sp->len++] = c;
	} else if (islower(c)) {
	sp->seq[sp->len++] = toupper(c);
	}
	}
	buf = read_line_buf(&sp->rb, sp->fd);
	}
	if (warnMultiSeq && sp->rb.lc > 0)
	fprintf(stderr, "\n"
	"* WARNING *\n"
	"* there appears to be several sequences in the DNA *\n"
	"* sequence file. Only the first one will be used, *\n"
	"* which might not be what was intended. *\n"
	"\n");
	sp->seq[sp->len] = 0;
	buf = strstr(sp->header, "; LEN=");
	if (buf) {
	char *s = buf + 6;
	headerLen -= 6;
	while (isdigit(*s)) {
	s += 1;
	headerLen -= 1;
	}
	while (*s)
	buf++ = s++;
	}
	buf = sp->header + headerLen - 1;
	while (iscntrl(buf) \|\| isspace(buf))
	buf -= 1;
	res = snprintf(buf + 1, lenStr, "; LEN=%u\n", sp->len + offset);
	if (res < 0 \|\| res >= lenStr)
	fatal("Sequence too long: %u\n", sp->len);
	return 0;
	}

	static void
	seq_revcomp_inplace(seq_p_t seq)
	{
	unsigned char *s = seq->seq;
	unsigned char *t = seq->seq + seq->len;
	unsigned char c;
	while (s < t) {
	c = dna_complement[*--t];
	t = dna_complement[s];
	*s++ = c;
	}
	}

	static void
	bug_handler(int signum)
	{
	fflush(stdout);
	fflush(stderr);
	fprintf(stderr, "\nCaught signal %d while processing:\n%.256s\n%.256s\n",
	signum, dna_seq_head, rna_seq_head);
	abort();
	}