Skip to content

Commit cdb3245

Browse files
pd3 authored and whitwham committed
Automatically recognize BED vs TSV files and add the option -C, --coords
Previously the program worked with 1-based coordinates only. This commit makes it automatically recognize BED files by their .bed or .bed.gz suffix (case-insensitive) and treat their start coordinate as 0-based. The new option -C, --coords overrides the automatic detection.
1 parent 386de25 commit cdb3245

File tree

6 files changed

+51
-4
lines changed

6 files changed

+51
-4
lines changed

annot-tsv.c

+41-3
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
Copyright (C) 2018-2024 Genome Research Ltd.
2+
Copyright (C) 2018-2025 Genome Research Ltd.
33
44
Author: Petr Danecek <[email protected]>
55
@@ -72,6 +72,7 @@ typedef struct
7272
cols_t *core, *match, *transfer, *annots;
7373
int *core_idx, *match_idx, *transfer_idx, *annots_idx;
7474
int *nannots_added; // for --max-annots: the number of annotations added
75+
int coor_base[2]; // 0 or 1-indexed beg,end?
7576
char delim;
7677
int grow_n;
7778
kstring_t line; // one buffered line, a byproduct of reading the header
@@ -102,7 +103,7 @@ typedef struct
102103
{
103104
nbp_t *nbp;
104105
dat_t dst, src;
105-
char *core_str, *match_str, *transfer_str, *annots_str, *headers_str, *delim_str;
106+
char *core_str, *coords_str, *match_str, *transfer_str, *annots_str, *headers_str, *delim_str;
106107
char *temp_dir, *out_fname;
107108
BGZF *out_fp;
108109
int allow_dups, max_annots, mode, no_write_hdr, overlap_either;
@@ -301,6 +302,13 @@ int parse_tab_with_payload(const char *line, char **chr_beg, char **chr_end, hts
301302
*end = strtod(ptr, &tmp);
302303
if ( tmp==ptr ) error("Expected numeric value, found \"%s\": %s\n",ptr,line);
303304

305+
// NB: for indexing, 1-based coordinates are kept as they are and 0-based coordinates are shifted up by one.
306+
// Shifting 1-based coordinates down instead would make a 0 coordinate underflow; the cost is that the biggest hts
307+
// coordinate will overflow, which is less common than the 0 underflow. This affects only indexing, not the output.
308+
// The following code will make beg+=1 for BED file (-C 01)
309+
(*beg) -= dat->coor_base[0] - 1;
310+
(*end) -= dat->coor_base[1] - 1;
311+
304312
if ( *end < *beg )
305313
{
306314
if ( !beg_end_warned )
@@ -484,6 +492,26 @@ void sanity_check_columns(char *fname, hdr_t *hdr, cols_t *cols, int **col2idx,
484492
(*col2idx)[i] = idx;
485493
}
486494
}
495+
// Set dat->coor_base[] (the base of the beg and end coordinate) for one input file.
//
//  args .. used only for error reporting: the complete --coords string is printed
//  str  .. this file's part of the -C, --coords argument. Either exactly two
//          characters, each '0' or '1' (base of beg, base of end), or an empty
//          string to decide from the file name
//  dat  .. dat->fname is read, dat->coor_base[0] (beg) and dat->coor_base[1] (end) are set
//
// With an empty str, a file name ending in .bed or .bed.gz (case-insensitive)
// gives beg=0,end=1; any other name gives beg=1,end=1.
void parse_coor_base(args_t *args, char *str, dat_t *dat)
{
    int beg = 1, end = 1;
    if ( *str )
    {
        // Exactly two digits are accepted. The && chain stops at the first mismatch,
        // so str[2] is only read when str[0] and str[1] are both non-NUL. Trailing
        // characters (e.g. "011") used to be silently ignored and are now rejected.
        int ok = (str[0]=='0' || str[0]=='1') && (str[1]=='0' || str[1]=='1') && !str[2];
        if ( !ok ) error("Could not parse: --coords %s\n",args->coords_str);
        else
        {
            beg = str[0] - '0';
            end = str[1] - '0';
        }
    }
    else if ( dat->fname )
    {
        // The file name is inspected only when no explicit value was given
        size_t len = strlen(dat->fname);
        if ( len>=4 && !strcasecmp(".bed",dat->fname+len-4) ) beg = 0;
        else if ( len>=7 && !strcasecmp(".bed.gz",dat->fname+len-7) ) beg = 0;
    }
    dat->coor_base[0] = beg;
    dat->coor_base[1] = end;
}
514+
487515
void init_data(args_t *args)
488516
{
489517
if ( !args->delim_str )
@@ -521,6 +549,13 @@ void init_data(args_t *args)
521549
if ( args->src.core->n!=3 || args->dst.core->n!=3 ) error("Expected three columns: %s\n", args->core_str);
522550
cols_destroy(tmp);
523551

552+
// -C, --coords, are the coordinates 0 or 1-based
553+
if ( !args->coords_str ) args->coords_str = ":";
554+
tmp = cols_split(args->coords_str, NULL, ':');
555+
parse_coor_base(args, tmp->off[0], &args->src);
556+
parse_coor_base(args, tmp->n==2 ? tmp->off[1] : tmp->off[0], &args->dst);
557+
cols_destroy(tmp);
558+
524559
// -m, match columns
525560
if ( args->match_str )
526561
{
@@ -881,6 +916,7 @@ static const char *usage_text(void)
881916
" frac .. fraction of the target region with an\n"
882917
" overlap\n"
883918
" nbp .. number of source base pairs in the overlap\n"
919+
" -C, --coords SRC:TGT Are coordinates 0 or 1-based, BED=01, TSV=11 [11]\n"
884920
" -d, --delim SRC:TGT Column delimiter in SRC and TGT file\n"
885921
" -h, --headers SRC:TGT Header row line number, 0:0 is equivalent to -H, negative\n"
886922
" value counts from the end of comment line block [1:1]\n"
@@ -923,6 +959,7 @@ int main(int argc, char **argv)
923959
static struct option loptions[] =
924960
{
925961
{"core",required_argument,NULL,'c'},
962+
{"coords",required_argument,NULL,'C'},
926963
{"transfer",required_argument,NULL,'f'},
927964
{"match",required_argument,NULL,'m'},
928965
{"output",required_argument,NULL,'o'},
@@ -945,7 +982,7 @@ int main(int argc, char **argv)
945982
char *tmp = NULL;
946983
int c;
947984
int reciprocal = 0;
948-
while ((c = getopt_long(argc, argv, "c:f:m:o:s:t:a:HO:rxh:Id:",loptions,NULL)) >= 0)
985+
while ((c = getopt_long(argc, argv, "c:C:f:m:o:s:t:a:HO:rxh:Id:",loptions,NULL)) >= 0)
949986
{
950987
switch (c)
951988
{
@@ -966,6 +1003,7 @@ int main(int argc, char **argv)
9661003
case 'H': args->headers_str = "0:0"; break;
9671004
case 'r': reciprocal = 1; break;
9681005
case 'c': args->core_str = optarg; break;
1006+
case 'C': args->coords_str = optarg; break;
9691007
case 't': args->dst.fname = optarg; break;
9701008
case 'm': args->match_str = optarg; break;
9711009
case 'a': args->annots_str = optarg; break;

test/annot-tsv/dst.14.txt

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
1 1 2
2+
1 6 7

test/annot-tsv/out.14.1.txt

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
1 1 2
2+
1 6 7

test/annot-tsv/out.14.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
1 1 2

test/annot-tsv/src.14.txt

+2
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
1 1 2
2+
1 5 6

test/test.pl

+3-1
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#!/usr/bin/env perl
22
#
3-
# Copyright (C) 2012-2024 Genome Research Ltd.
3+
# Copyright (C) 2012-2025 Genome Research Ltd.
44
#
55
# Author: Petr Danecek <[email protected]>
66
#
@@ -1515,4 +1515,6 @@ sub test_annot_tsv
15151515
run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.2.txt',args=>q[-c 1,2,3 -f 4:5 -O 0.5,0.5]);
15161516
run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.3.txt',args=>q[-c 1,2,3 -f 4:5 -O 0,1]);
15171517
run_annot_tsv($opts,src=>'src.13.txt',dst=>'src.13.txt',out=>'out.13.4.txt',args=>q[-c 1,2,3 -f 4:5 -O 1,0]);
1518+
run_annot_tsv($opts,src=>'src.14.txt',dst=>'dst.14.txt',out=>'out.14.1.txt',args=>q[-c 1,2,3 -C 11:11]); # 1-based coordinates
1519+
run_annot_tsv($opts,src=>'src.14.txt',dst=>'dst.14.txt',out=>'out.14.2.txt',args=>q[-c 1,2,3 -C 01:01]); # interpret as bed coordinates
15181520
}

0 commit comments

Comments
 (0)