1
1
/*
2
- Copyright (C) 2018-2024 Genome Research Ltd.
2
+ Copyright (C) 2018-2025 Genome Research Ltd.
3
3
4
4
Author: Petr Danecek <[email protected] >
5
5
@@ -72,6 +72,7 @@ typedef struct
72
72
cols_t * core , * match , * transfer , * annots ;
73
73
int * core_idx , * match_idx , * transfer_idx , * annots_idx ;
74
74
int * nannots_added ; // for --max-annots: the number of annotations added
75
+ int coor_base [2 ]; // 0 or 1-indexed beg,end?
75
76
char delim ;
76
77
int grow_n ;
77
78
kstring_t line ; // one buffered line, a byproduct of reading the header
@@ -102,7 +103,7 @@ typedef struct
102
103
{
103
104
nbp_t * nbp ;
104
105
dat_t dst , src ;
105
- char * core_str , * match_str , * transfer_str , * annots_str , * headers_str , * delim_str ;
106
+ char * core_str , * coords_str , * match_str , * transfer_str , * annots_str , * headers_str , * delim_str ;
106
107
char * temp_dir , * out_fname ;
107
108
BGZF * out_fp ;
108
109
int allow_dups , max_annots , mode , no_write_hdr , overlap_either ;
@@ -301,6 +302,13 @@ int parse_tab_with_payload(const char *line, char **chr_beg, char **chr_end, hts
301
302
* end = strtod (ptr , & tmp );
302
303
if ( tmp == ptr ) error ("Expected numeric value, found \"%s\": %s\n" ,ptr ,line );
303
304
305
+ // NB: for indexing we leave the coordinates 1-based when 1-based, otherwise 0-coordinate would underflow. This
306
+ // means the biggest hts coordinate will overflow, which is less common than the 0 underflow. This does not affect the output,
307
+ // only indexing.
308
+ // The following code will make beg+=1 for BED file (-C 01)
309
+ (* beg ) -= dat -> coor_base [0 ] - 1 ;
310
+ (* end ) -= dat -> coor_base [1 ] - 1 ;
311
+
304
312
if ( * end < * beg )
305
313
{
306
314
if ( !beg_end_warned )
@@ -484,6 +492,26 @@ void sanity_check_columns(char *fname, hdr_t *hdr, cols_t *cols, int **col2idx,
484
492
(* col2idx )[i ] = idx ;
485
493
}
486
494
}
495
+ void parse_coor_base (args_t * args , char * str , dat_t * dat )
496
+ {
497
+ int len = strlen (dat -> fname );
498
+ int beg = 1 , end = 1 ;
499
+ if ( * str )
500
+ {
501
+ if ( str [0 ]== '0' ) beg = 0 ;
502
+ else if ( str [0 ]== '1' ) beg = 1 ;
503
+ else error ("Could not parse: --coords %s\n" ,args -> coords_str );
504
+
505
+ if ( str [1 ]== '0' ) end = 0 ;
506
+ else if ( str [1 ]== '1' ) end = 1 ;
507
+ else error ("Could not parse: --coords %s\n" ,args -> coords_str );
508
+ }
509
+ else if ( len >=4 && !strcasecmp (".bed" ,dat -> fname + len - 4 ) ) beg = 0 ;
510
+ else if ( len >=7 && !strcasecmp (".bed.gz" ,dat -> fname + len - 7 ) ) beg = 0 ;
511
+ dat -> coor_base [0 ] = beg ;
512
+ dat -> coor_base [1 ] = end ;
513
+ }
514
+
487
515
void init_data (args_t * args )
488
516
{
489
517
if ( !args -> delim_str )
@@ -521,6 +549,13 @@ void init_data(args_t *args)
521
549
if ( args -> src .core -> n != 3 || args -> dst .core -> n != 3 ) error ("Expected three columns: %s\n" , args -> core_str );
522
550
cols_destroy (tmp );
523
551
552
+ // -C, --coordinates, 0 or 1-based
553
+ if ( !args -> coords_str ) args -> coords_str = ":" ;
554
+ tmp = cols_split (args -> coords_str , NULL , ':' );
555
+ parse_coor_base (args , tmp -> off [0 ], & args -> src );
556
+ parse_coor_base (args , tmp -> n == 2 ? tmp -> off [1 ] : tmp -> off [0 ], & args -> dst );
557
+ cols_destroy (tmp );
558
+
524
559
// -m, match columns
525
560
if ( args -> match_str )
526
561
{
@@ -881,6 +916,7 @@ static const char *usage_text(void)
881
916
" frac .. fraction of the target region with an\n"
882
917
" overlap\n"
883
918
" nbp .. number of source base pairs in the overlap\n"
919
+ " -C, --coords SRC:TGT Are coordinates 0 or 1-based, BED=01, TSV=11 [11]\n"
884
920
" -d, --delim SRC:TGT Column delimiter in SRC and TGT file\n"
885
921
" -h, --headers SRC:TGT Header row line number, 0:0 is equivalent to -H, negative\n"
886
922
" value counts from the end of comment line block [1:1]\n"
@@ -923,6 +959,7 @@ int main(int argc, char **argv)
923
959
static struct option loptions [] =
924
960
{
925
961
{"core" ,required_argument ,NULL ,'c' },
962
+ {"coords" ,required_argument ,NULL ,'C' },
926
963
{"transfer" ,required_argument ,NULL ,'f' },
927
964
{"match" ,required_argument ,NULL ,'m' },
928
965
{"output" ,required_argument ,NULL ,'o' },
@@ -945,7 +982,7 @@ int main(int argc, char **argv)
945
982
char * tmp = NULL ;
946
983
int c ;
947
984
int reciprocal = 0 ;
948
- while ((c = getopt_long (argc , argv , "c:f:m:o:s:t:a:HO:rxh:Id:" ,loptions ,NULL )) >= 0 )
985
+ while ((c = getopt_long (argc , argv , "c:C: f:m:o:s:t:a:HO:rxh:Id:" ,loptions ,NULL )) >= 0 )
949
986
{
950
987
switch (c )
951
988
{
@@ -966,6 +1003,7 @@ int main(int argc, char **argv)
966
1003
case 'H' : args -> headers_str = "0:0" ; break ;
967
1004
case 'r' : reciprocal = 1 ; break ;
968
1005
case 'c' : args -> core_str = optarg ; break ;
1006
+ case 'C' : args -> coords_str = optarg ; break ;
969
1007
case 't' : args -> dst .fname = optarg ; break ;
970
1008
case 'm' : args -> match_str = optarg ; break ;
971
1009
case 'a' : args -> annots_str = optarg ; break ;
0 commit comments