Skip to content

Commit 99954c1

Browse files
committed
Add bam_aux_first()/bam_aux_next() tagged aux field iterator API
Add new API functions for iterating through a BAM record's aux fields, inline accessor methods for field tag and type (or code can continue to use s-2 and *s), and a variant of bam_aux_del() that returns the (updated) iterator to the following field (for use in iterator-based loops that delete fields). Add test cases for the new API functions.
1 parent 9bcb2d2 commit 99954c1

File tree

3 files changed

+137
-33
lines changed

3 files changed

+137
-33
lines changed

htslib/sam.h

+52-1
Original file line numberDiff line numberDiff line change
@@ -1437,7 +1437,6 @@ int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b,
14371437

14381438
/// Converts a BAM aux tag to SAM format
14391439
/*
1440-
* @param b Pointer to the bam record
14411440
* @param key Two letter tag key
14421441
* @param type Single letter type code: ACcSsIifHZB.
14431442
* @param tag Tag data pointer, in BAM format
@@ -1627,6 +1626,29 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key,
16271626
return NULL;
16281627
}
16291628

1629+
/// Return a pointer to a BAM record's first aux field
1630+
/** @param b Pointer to the BAM record
1631+
@return Aux field pointer, or NULL if the record has none
1632+
1633+
When NULL is returned, errno will also be set to ENOENT. ("Aux field pointers"
1634+
point to the TYPE byte within the auxiliary data for that field; but in general
1635+
it is unnecessary for user code to be aware of this.)
1636+
*/
1637+
HTSLIB_EXPORT
1638+
uint8_t *bam_aux_first(const bam1_t *b);
1639+
1640+
/// Return a pointer to a BAM record's next aux field
1641+
/** @param b Pointer to the BAM record
1642+
@param s Aux field pointer, as returned by bam_aux_first()/_next()/_get()
1643+
@return Pointer to the next aux field, or NULL if no next field or error
1644+
1645+
Whenever NULL is returned, errno will also be set: ENOENT if @p s was the
1646+
record's last aux field; otherwise EINVAL, indicating that the BAM record's
1647+
aux data is corrupt.
1648+
*/
1649+
HTSLIB_EXPORT
1650+
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s);
1651+
16301652
/// Return a pointer to an aux record
16311653
/** @param b Pointer to the bam record
16321654
@param tag Desired aux tag
@@ -1639,6 +1661,19 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key,
16391661
HTSLIB_EXPORT
16401662
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
16411663

1664+
/// Return the aux field's 2-character tag
/** @param s Aux field pointer, as returned by bam_aux_first()/_next()/_get()
    @return  Pointer to the tag characters, NOT NUL-terminated

The aux field pointer addresses the field's type byte; the two tag
characters are the two bytes immediately before it.
*/
static inline const char *bam_aux_tag(const uint8_t *s)
{
    const uint8_t *tag_start = s - 2;
    return (const char *) tag_start;
}
1670+
1671+
/// Return the aux field's type character
/** @param s Aux field pointer, as returned by bam_aux_first()/_next()/_get()
    @return  The type character: one of cCsSiI/fd/A/Z/H/B

The type character is the byte that the aux field pointer itself addresses.
*/
static inline char bam_aux_type(const uint8_t *s)
{
    return (char) s[0];
}
1676+
16421677
/// Return a SAM formatting string containing a BAM tag
16431678
/** @param b Pointer to the bam record
16441679
@param tag Desired aux tag
@@ -1750,6 +1785,22 @@ int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8
17501785
HTSLIB_EXPORT
17511786
int bam_aux_del(bam1_t *b, uint8_t *s);
17521787

1788+
/// Delete an aux field from a BAM record
1789+
/** @param b The BAM record to update
1790+
@param s Pointer to the aux field to delete, as returned by
1791+
bam_aux_first()/_next()/_get()
1792+
@return Pointer to the following aux field, or NULL if none or on error
1793+
1794+
Identical to @c bam_aux_del() apart from the return value, which is an
1795+
aux iterator suitable for use with @c bam_aux_next()/etc.
1796+
1797+
Whenever NULL is returned, errno will also be set: ENOENT if the aux field
1798+
deleted was the record's last one; otherwise EINVAL, indicating that the
1799+
BAM record's aux data is corrupt.
1800+
*/
1801+
HTSLIB_EXPORT
1802+
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s);
1803+
17531804
/// Update or add a string-type tag
17541805
/* @param b The bam record to update
17551806
@param tag Tag identifier

sam.c

+46-32
Original file line numberDiff line numberDiff line change
@@ -4581,31 +4581,42 @@ static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
45814581
}
45824582
}
45834583

4584+
uint8_t *bam_aux_first(const bam1_t *b)
4585+
{
4586+
uint8_t *s = bam_get_aux(b);
4587+
uint8_t *end = b->data + b->l_data;
4588+
if (s >= end) { errno = ENOENT; return NULL; }
4589+
return s+2;
4590+
}
4591+
4592+
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
4593+
{
4594+
uint8_t *end = b->data + b->l_data;
4595+
uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
4596+
if (next == NULL) goto bad_aux;
4597+
if (next >= end) { errno = ENOENT; return NULL; }
4598+
return next+2;
4599+
4600+
bad_aux:
4601+
hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
4602+
errno = EINVAL;
4603+
return NULL;
4604+
}
4605+
45844606
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
45854607
{
4586-
uint8_t *s, *end, *t = (uint8_t *) tag;
4587-
uint16_t y = (uint16_t) t[0]<<8 | t[1];
4588-
s = bam_get_aux(b);
4589-
end = b->data + b->l_data;
4590-
while (s != NULL && end - s >= 3) {
4591-
uint16_t x = (uint16_t) s[0]<<8 | s[1];
4592-
s += 2;
4593-
if (x == y) {
4608+
uint8_t *s;
4609+
for (s = bam_aux_first(b); s; s = bam_aux_next(b, s))
4610+
if (s[-2] == tag[0] && s[-1] == tag[1]) {
45944611
// Check the tag value is valid and complete
4595-
uint8_t *e = skip_aux(s, end);
4596-
if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') {
4597-
goto bad_aux; // Unterminated string
4598-
}
4599-
if (e != NULL) {
4600-
return s;
4601-
} else {
4602-
goto bad_aux;
4603-
}
4612+
uint8_t *e = skip_aux(s, b->data + b->l_data);
4613+
if (e == NULL) goto bad_aux;
4614+
if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux;
4615+
4616+
return s;
46044617
}
4605-
s = skip_aux(s, end);
4606-
}
4607-
if (s == NULL) goto bad_aux;
4608-
errno = ENOENT;
4618+
4619+
// errno has already been set by bam_aux_first()/bam_aux_next()
46094620
return NULL;
46104621

46114622
bad_aux:
@@ -4614,23 +4625,26 @@ uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
46144625
return NULL;
46154626
}
46164627

4617-
// s MUST BE returned by bam_aux_get()
46184628
int bam_aux_del(bam1_t *b, uint8_t *s)
46194629
{
4620-
uint8_t *p, *aux;
4621-
int l_aux = bam_get_l_aux(b);
4622-
aux = bam_get_aux(b);
4623-
p = s - 2;
4624-
s = skip_aux(s, aux + l_aux);
4625-
if (s == NULL) goto bad_aux;
4626-
memmove(p, s, l_aux - (s - aux));
4627-
b->l_data -= s - p;
4628-
return 0;
4630+
s = bam_aux_remove(b, s);
4631+
return (s || errno == ENOENT)? 0 : -1;
4632+
}
4633+
4634+
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s)
4635+
{
4636+
uint8_t *end = b->data + b->l_data;
4637+
uint8_t *next = skip_aux(s, end);
4638+
if (next == NULL) goto bad_aux;
4639+
memmove(s-2, next, end - next);
4640+
b->l_data -= next - (s-2);
4641+
if (next >= end) { errno = ENOENT; return NULL; }
4642+
else return s;
46294643

46304644
bad_aux:
46314645
hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
46324646
errno = EINVAL;
4633-
return -1;
4647+
return NULL;
46344648
}
46354649

46364650
int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)

test/sam.c

+39
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,15 @@ uint8_t *check_bam_aux_get(const bam1_t *aln, const char *tag, char type)
8787
return NULL;
8888
}
8989

90+
static void check_aux_count(const bam1_t *aln, int expected, const char *what)
91+
{
92+
const uint8_t *itr;
93+
int n = 0;
94+
for (itr = bam_aux_first(aln); itr; itr = bam_aux_next(aln, itr)) n++;
95+
if (n != expected)
96+
fail("%s has %d aux fields, expected %d", what, n, expected);
97+
}
98+
9099
static void check_int_B_array(bam1_t *aln, char *tag,
91100
uint32_t nvals, int64_t *vals) {
92101
uint8_t *p;
@@ -285,10 +294,30 @@ static int aux_fields1(void)
285294
if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k')
286295
fail("XA field is '%c', expected 'k'", bam_aux2A(p));
287296

297+
check_aux_count(aln, 24, "Original record");
298+
288299
bam_aux_del(aln,p);
289300
if (bam_aux_get(aln,"XA"))
290301
fail("XA field was not deleted");
291302

303+
check_aux_count(aln, 23, "Record post-XA-deletion");
304+
305+
p = bam_aux_get(aln, "Y2");
306+
if (p == NULL || strncmp(bam_aux_tag(p), "Y2", 2) != 0 || bam_aux_type(p) != 'i')
307+
fail("bam_aux_get() missed Y2 field");
308+
309+
p = bam_aux_next(aln, p);
310+
if (p == NULL || strncmp(bam_aux_tag(p), "Y3", 2) != 0 || bam_aux_type(p) != 'c')
311+
fail("bam_aux_next() missed Y3 field");
312+
313+
p = bam_aux_get(aln, "Y8");
314+
if (p == NULL || strncmp(bam_aux_tag(p), "Y8", 2) != 0 || bam_aux_type(p) != 'I')
315+
fail("bam_aux_get() missed Y8 field");
316+
317+
p = bam_aux_next(aln, p);
318+
if (p != NULL || errno != ENOENT)
319+
fail("bam_aux_next missed the end of fields");
320+
292321
if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37)
293322
fail("Xi field is %"PRId64", expected 37", bam_aux2i(p));
294323

@@ -492,6 +521,16 @@ static int aux_fields1(void)
492521

493522
if (strcmp(ks.s, r1) != 0)
494523
fail("record formatted incorrectly: \"%s\"", ks.s);
524+
525+
// Test field removal APIs -- after the strcmp(..., r1) check so that
526+
// we can also check the formatting of the to-be-removed fields.
527+
528+
p = bam_aux_remove(aln, check_bam_aux_get(aln, "XH", 'H'));
529+
if (bam_aux_get(aln, "XH"))
530+
fail("XH field was not removed");
531+
check_aux_count(aln, 31, "Record post-XH-removal");
532+
if (strncmp(bam_aux_tag(p), "XB", 2) != 0 || bam_aux_type(p) != 'B')
533+
fail("bam_aux_remove() missed XB field");
495534
}
496535
else fail("can't read record");
497536

0 commit comments

Comments
 (0)