Skip to content

Commit ac3beaa

Browse files
committed
Add bam_aux_first()/bam_aux_next() tagged aux field iterator API
Add new API functions for iterating through a BAM record's aux fields, inline accessor methods for field tag and type (or code can continue to use s-2 and *s), and a variant of bam_aux_del() that returns the (updated) iterator to the following field (for use in iterator-based loops that delete fields). Add test cases for the new API functions.
1 parent 203f5bb commit ac3beaa

File tree

3 files changed

+137
-33
lines changed

3 files changed

+137
-33
lines changed

htslib/sam.h

+52-1
Original file line numberDiff line numberDiff line change
@@ -1438,7 +1438,6 @@ int sam_passes_filter(const sam_hdr_t *h, const bam1_t *b,
14381438

14391439
/// Converts a BAM aux tag to SAM format
14401440
/*
1441-
* @param b Pointer to the bam record
14421441
* @param key Two letter tag key
14431442
* @param type Single letter type code: ACcSsIifHZB.
14441443
* @param tag Tag data pointer, in BAM format
@@ -1628,6 +1627,29 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key,
16281627
return NULL;
16291628
}
16301629

1630+
/// Return a pointer to a BAM record's first aux field
1631+
/** @param b Pointer to the BAM record
1632+
@return Aux field pointer, or NULL if the record has none
1633+
1634+
When NULL is returned, errno will also be set to ENOENT. ("Aux field pointers"
1635+
point to the TYPE byte within the auxiliary data for that field; but in general
1636+
it is unnecessary for user code to be aware of this.)
1637+
*/
1638+
HTSLIB_EXPORT
1639+
uint8_t *bam_aux_first(const bam1_t *b);
1640+
1641+
/// Return a pointer to a BAM record's next aux field
1642+
/** @param b Pointer to the BAM record
1643+
@param s Aux field pointer, as returned by bam_aux_first()/_next()/_get()
1644+
@return Pointer to the next aux field, or NULL if no next field or error
1645+
1646+
Whenever NULL is returned, errno will also be set: ENOENT if @p s was the
1647+
record's last aux field; otherwise EINVAL, indicating that the BAM record's
1648+
aux data is corrupt.
1649+
*/
1650+
HTSLIB_EXPORT
1651+
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s);
1652+
16311653
/// Return a pointer to an aux record
16321654
/** @param b Pointer to the bam record
16331655
@param tag Desired aux tag
@@ -1640,6 +1662,19 @@ static inline const uint8_t *sam_format_aux1(const uint8_t *key,
16401662
HTSLIB_EXPORT
16411663
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2]);
16421664

1665+
/// Return the aux field's 2-character tag
1666+
/** @param s Aux field pointer, as returned by bam_aux_first()/_next()/_get()
1667+
@return Pointer to the tag characters, NOT NUL-terminated
1668+
*/
1669+
static inline const char *bam_aux_tag(const uint8_t *s)
{
    // The two tag characters sit immediately before the field's type byte.
    const uint8_t *tag_start = s - 2;
    return (const char *) tag_start;
}
1671+
1672+
/// Return the aux field's type character
1673+
/** @param s Aux field pointer, as returned by bam_aux_first()/_next()/_get()
1674+
@return The type character: one of cCsSiI/fd/A/Z/H/B
1675+
*/
1676+
static inline char bam_aux_type(const uint8_t *s)
{
    // An aux field pointer addresses the type byte itself.
    return (char) s[0];
}
1677+
16431678
/// Return a SAM formatting string containing a BAM tag
16441679
/** @param b Pointer to the bam record
16451680
@param tag Desired aux tag
@@ -1751,6 +1786,22 @@ int bam_aux_append(bam1_t *b, const char tag[2], char type, int len, const uint8
17511786
HTSLIB_EXPORT
17521787
int bam_aux_del(bam1_t *b, uint8_t *s);
17531788

1789+
/// Delete an aux field from a BAM record
1790+
/** @param b The BAM record to update
1791+
@param s Pointer to the aux field to delete, as returned by
1792+
bam_aux_first()/_next()/_get()
1793+
@return Pointer to the following aux field, or NULL if none or on error
1794+
1795+
Identical to @c bam_aux_del() apart from the return value, which is an
1796+
aux iterator suitable for use with @c bam_aux_next()/etc.
1797+
1798+
Whenever NULL is returned, errno will also be set: ENOENT if the aux field
1799+
deleted was the record's last one; otherwise EINVAL, indicating that the
1800+
BAM record's aux data is corrupt.
1801+
*/
1802+
HTSLIB_EXPORT
1803+
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s);
1804+
17541805
/// Update or add a string-type tag
17551806
/* @param b The bam record to update
17561807
@param tag Tag identifier

sam.c

+46-32
Original file line numberDiff line numberDiff line change
@@ -4614,31 +4614,42 @@ static inline uint8_t *skip_aux(uint8_t *s, uint8_t *end)
46144614
}
46154615
}
46164616

4617+
uint8_t *bam_aux_first(const bam1_t *b)
4618+
{
4619+
uint8_t *s = bam_get_aux(b);
4620+
uint8_t *end = b->data + b->l_data;
4621+
if (s >= end) { errno = ENOENT; return NULL; }
4622+
return s+2;
4623+
}
4624+
4625+
uint8_t *bam_aux_next(const bam1_t *b, const uint8_t *s)
4626+
{
4627+
uint8_t *end = b->data + b->l_data;
4628+
uint8_t *next = s? skip_aux((uint8_t *) s, end) : end;
4629+
if (next == NULL) goto bad_aux;
4630+
if (next >= end) { errno = ENOENT; return NULL; }
4631+
return next+2;
4632+
4633+
bad_aux:
4634+
hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
4635+
errno = EINVAL;
4636+
return NULL;
4637+
}
4638+
46174639
uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
46184640
{
4619-
uint8_t *s, *end, *t = (uint8_t *) tag;
4620-
uint16_t y = (uint16_t) t[0]<<8 | t[1];
4621-
s = bam_get_aux(b);
4622-
end = b->data + b->l_data;
4623-
while (s != NULL && end - s >= 3) {
4624-
uint16_t x = (uint16_t) s[0]<<8 | s[1];
4625-
s += 2;
4626-
if (x == y) {
4641+
uint8_t *s;
4642+
for (s = bam_aux_first(b); s; s = bam_aux_next(b, s))
4643+
if (s[-2] == tag[0] && s[-1] == tag[1]) {
46274644
// Check the tag value is valid and complete
4628-
uint8_t *e = skip_aux(s, end);
4629-
if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') {
4630-
goto bad_aux; // Unterminated string
4631-
}
4632-
if (e != NULL) {
4633-
return s;
4634-
} else {
4635-
goto bad_aux;
4636-
}
4645+
uint8_t *e = skip_aux(s, b->data + b->l_data);
4646+
if (e == NULL) goto bad_aux;
4647+
if ((*s == 'Z' || *s == 'H') && *(e - 1) != '\0') goto bad_aux;
4648+
4649+
return s;
46374650
}
4638-
s = skip_aux(s, end);
4639-
}
4640-
if (s == NULL) goto bad_aux;
4641-
errno = ENOENT;
4651+
4652+
// errno now as set by bam_aux_first()/bam_aux_next()
46424653
return NULL;
46434654

46444655
bad_aux:
@@ -4647,23 +4658,26 @@ uint8_t *bam_aux_get(const bam1_t *b, const char tag[2])
46474658
return NULL;
46484659
}
46494660

4650-
// s MUST BE returned by bam_aux_get()
46514661
int bam_aux_del(bam1_t *b, uint8_t *s)
46524662
{
4653-
uint8_t *p, *aux;
4654-
int l_aux = bam_get_l_aux(b);
4655-
aux = bam_get_aux(b);
4656-
p = s - 2;
4657-
s = skip_aux(s, aux + l_aux);
4658-
if (s == NULL) goto bad_aux;
4659-
memmove(p, s, l_aux - (s - aux));
4660-
b->l_data -= s - p;
4661-
return 0;
4663+
s = bam_aux_remove(b, s);
4664+
return (s || errno == ENOENT)? 0 : -1;
4665+
}
4666+
4667+
uint8_t *bam_aux_remove(bam1_t *b, uint8_t *s)
4668+
{
4669+
uint8_t *end = b->data + b->l_data;
4670+
uint8_t *next = skip_aux(s, end);
4671+
if (next == NULL) goto bad_aux;
4672+
memmove(s-2, next, end - next);
4673+
b->l_data -= next - (s-2);
4674+
if (next >= end) { errno = ENOENT; return NULL; }
4675+
else return s;
46624676

46634677
bad_aux:
46644678
hts_log_error("Corrupted aux data for read %s", bam_get_qname(b));
46654679
errno = EINVAL;
4666-
return -1;
4680+
return NULL;
46674681
}
46684682

46694683
int bam_aux_update_str(bam1_t *b, const char tag[2], int len, const char *data)

test/sam.c

+39
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,15 @@ uint8_t *check_bam_aux_get(const bam1_t *aln, const char *tag, char type)
8787
return NULL;
8888
}
8989

90+
static void check_aux_count(const bam1_t *aln, int expected, const char *what)
91+
{
92+
const uint8_t *itr;
93+
int n = 0;
94+
for (itr = bam_aux_first(aln); itr; itr = bam_aux_next(aln, itr)) n++;
95+
if (n != expected)
96+
fail("%s has %d aux fields, expected %d", what, n, expected);
97+
}
98+
9099
static void check_int_B_array(bam1_t *aln, char *tag,
91100
uint32_t nvals, int64_t *vals) {
92101
uint8_t *p;
@@ -285,10 +294,30 @@ static int aux_fields1(void)
285294
if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k')
286295
fail("XA field is '%c', expected 'k'", bam_aux2A(p));
287296

297+
check_aux_count(aln, 24, "Original record");
298+
288299
bam_aux_del(aln,p);
289300
if (bam_aux_get(aln,"XA"))
290301
fail("XA field was not deleted");
291302

303+
check_aux_count(aln, 23, "Record post-XA-deletion");
304+
305+
p = bam_aux_get(aln, "Y2");
306+
if (p == NULL || strncmp(bam_aux_tag(p), "Y2", 2) != 0 || bam_aux_type(p) != 'i')
307+
fail("bam_aux_get() missed Y2 field");
308+
309+
p = bam_aux_next(aln, p);
310+
if (p == NULL || strncmp(bam_aux_tag(p), "Y3", 2) != 0 || bam_aux_type(p) != 'c')
311+
fail("bam_aux_next() missed Y3 field");
312+
313+
p = bam_aux_get(aln, "Y8");
314+
if (p == NULL || strncmp(bam_aux_tag(p), "Y8", 2) != 0 || bam_aux_type(p) != 'I')
315+
fail("bam_aux_get() missed Y8 field");
316+
317+
p = bam_aux_next(aln, p);
318+
if (p != NULL || errno != ENOENT)
319+
fail("bam_aux_next missed the end of fields");
320+
292321
if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37)
293322
fail("Xi field is %"PRId64", expected 37", bam_aux2i(p));
294323

@@ -492,6 +521,16 @@ static int aux_fields1(void)
492521

493522
if (strcmp(ks.s, r1) != 0)
494523
fail("record formatted incorrectly: \"%s\"", ks.s);
524+
525+
// Test field removal APIs -- after the strcmp(..., r1) check so that
526+
// can also check the formatting of the to-be-removed fields.
527+
528+
p = bam_aux_remove(aln, check_bam_aux_get(aln, "XH", 'H'));
529+
if (bam_aux_get(aln, "XH"))
530+
fail("XH field was not removed");
531+
check_aux_count(aln, 31, "Record post-XH-removal");
532+
if (strncmp(bam_aux_tag(p), "XB", 2) != 0 || bam_aux_type(p) != 'B')
533+
fail("bam_aux_remove() missed XB field");
495534
}
496535
else fail("can't read record");
497536

0 commit comments

Comments
 (0)