SeqAn3 3.3.0-rc.1
The Modern C++ library for sequence analysis.
format_bam.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <bit>
16#include <iterator>
17#include <ranges>
18#include <string>
19#include <vector>
20
33
34namespace seqan3
35{
36
50{
51public:
55 // string_buffer is of type std::string and has some problems with pre-C++11 ABI
56 format_bam() = default;
57 format_bam(format_bam const &) = default;
58 format_bam & operator=(format_bam const &) = default;
59 format_bam(format_bam &&) = default;
61 ~format_bam() = default;
62
64
67
68protected:
69 template <typename stream_type, // constraints checked by file
70 typename seq_legal_alph_type,
71 typename ref_seqs_type,
72 typename ref_ids_type,
73 typename stream_pos_type,
74 typename seq_type,
75 typename id_type,
76 typename offset_type,
77 typename ref_seq_type,
78 typename ref_id_type,
79 typename ref_offset_type,
80 typename cigar_type,
81 typename flag_type,
82 typename mapq_type,
83 typename qual_type,
84 typename mate_type,
85 typename tag_dict_type,
86 typename e_value_type,
87 typename bit_score_type>
88 void read_alignment_record(stream_type & stream,
89 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
90 ref_seqs_type & ref_seqs,
92 stream_pos_type & position_buffer,
93 seq_type & seq,
94 qual_type & qual,
95 id_type & id,
96 offset_type & offset,
97 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
98 ref_id_type & ref_id,
99 ref_offset_type & ref_offset,
100 cigar_type & cigar_vector,
101 flag_type & flag,
102 mapq_type & mapq,
103 mate_type & mate,
104 tag_dict_type & tag_dict,
105 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
106 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score));
107
108 template <typename stream_type,
109 typename header_type,
110 typename seq_type,
111 typename id_type,
112 typename ref_seq_type,
113 typename ref_id_type,
114 typename cigar_type,
115 typename qual_type,
116 typename mate_type,
117 typename tag_dict_type>
118 void write_alignment_record([[maybe_unused]] stream_type & stream,
119 [[maybe_unused]] sam_file_output_options const & options,
120 [[maybe_unused]] header_type && header,
121 [[maybe_unused]] seq_type && seq,
122 [[maybe_unused]] qual_type && qual,
123 [[maybe_unused]] id_type && id,
124 [[maybe_unused]] int32_t const offset,
125 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
126 [[maybe_unused]] ref_id_type && ref_id,
127 [[maybe_unused]] std::optional<int32_t> ref_offset,
128 [[maybe_unused]] cigar_type && cigar_vector,
129 [[maybe_unused]] sam_flag const flag,
130 [[maybe_unused]] uint8_t const mapq,
131 [[maybe_unused]] mate_type && mate,
132 [[maybe_unused]] tag_dict_type && tag_dict,
133 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
134 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score));
135
137 template <typename stream_t, typename header_type>
138 void write_header(stream_t & stream, sam_file_output_options const & options, header_type & header);
139
140private:
142 bool header_was_read{false};
143
146
149 { // naming corresponds to official SAM/BAM specifications
150 int32_t block_size;
151 int32_t refID;
152 int32_t pos;
153 uint32_t l_read_name : 8;
154 uint32_t mapq : 8;
155 uint32_t bin : 16;
156 uint32_t n_cigar_op : 16;
158 int32_t l_seq;
159 int32_t next_refID;
160 int32_t next_pos;
161 int32_t tlen;
162 };
163
164 // clang-format off
167 {
168 []() constexpr {
170
171 using index_t = std::make_unsigned_t<char>;
172
173 // ret['M'] = 0; set anyway by initialization
174 ret[static_cast<index_t>('I')] = 1;
175 ret[static_cast<index_t>('D')] = 2;
176 ret[static_cast<index_t>('N')] = 3;
177 ret[static_cast<index_t>('S')] = 4;
178 ret[static_cast<index_t>('H')] = 5;
179 ret[static_cast<index_t>('P')] = 6;
180 ret[static_cast<index_t>('=')] = 7;
181 ret[static_cast<index_t>('X')] = 8;
182
183 return ret;
184 }()
185 };
186 // clang-format on
187
189 static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
190 {
191 --end;
192 if (beg >> 14 == end >> 14)
193 return ((1 << 15) - 1) / 7 + (beg >> 14);
194 if (beg >> 17 == end >> 17)
195 return ((1 << 12) - 1) / 7 + (beg >> 17);
196 if (beg >> 20 == end >> 20)
197 return ((1 << 9) - 1) / 7 + (beg >> 20);
198 if (beg >> 23 == end >> 23)
199 return ((1 << 6) - 1) / 7 + (beg >> 23);
200 if (beg >> 26 == end >> 26)
201 return ((1 << 3) - 1) / 7 + (beg >> 26);
202 return 0;
203 }
204
211 template <typename stream_view_type, std::integral number_type>
212 void read_integral_byte_field(stream_view_type && stream_view, number_type & target)
213 {
214 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(target), reinterpret_cast<char *>(&target));
215 }
216
222 template <typename stream_view_type>
223 void read_float_byte_field(stream_view_type && stream_view, float & target)
224 {
225 std::ranges::copy_n(std::ranges::begin(stream_view), sizeof(int32_t), reinterpret_cast<char *>(&target));
226 }
227
228 template <typename stream_view_type, typename value_type>
230 stream_view_type && stream_view,
231 value_type const & SEQAN3_DOXYGEN_ONLY(value));
232
233 template <typename stream_view_type>
234 void read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target);
235
236 template <typename cigar_input_type>
237 auto parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const;
238
239 static std::string get_tag_dict_str(sam_tag_dictionary const & tag_dict);
240};
241
243template <typename stream_type, // constraints checked by file
244 typename seq_legal_alph_type,
245 typename ref_seqs_type,
246 typename ref_ids_type,
247 typename stream_pos_type,
248 typename seq_type,
249 typename id_type,
250 typename offset_type,
251 typename ref_seq_type,
252 typename ref_id_type,
253 typename ref_offset_type,
254 typename cigar_type,
255 typename flag_type,
256 typename mapq_type,
257 typename qual_type,
258 typename mate_type,
259 typename tag_dict_type,
260 typename e_value_type,
261 typename bit_score_type>
262inline void
264 sam_file_input_options<seq_legal_alph_type> const & SEQAN3_DOXYGEN_ONLY(options),
265 ref_seqs_type & ref_seqs,
267 stream_pos_type & position_buffer,
268 seq_type & seq,
269 qual_type & qual,
270 id_type & id,
271 offset_type & offset,
272 ref_seq_type & SEQAN3_DOXYGEN_ONLY(ref_seq),
273 ref_id_type & ref_id,
274 ref_offset_type & ref_offset,
275 cigar_type & cigar_vector,
276 flag_type & flag,
277 mapq_type & mapq,
278 mate_type & mate,
279 tag_dict_type & tag_dict,
280 e_value_type & SEQAN3_DOXYGEN_ONLY(e_value),
281 bit_score_type & SEQAN3_DOXYGEN_ONLY(bit_score))
282{
283 static_assert(detail::decays_to_ignore_v<ref_offset_type>
284 || detail::is_type_specialisation_of_v<ref_offset_type, std::optional>,
285 "The ref_offset must be a specialisation of std::optional.");
286
287 static_assert(detail::decays_to_ignore_v<mapq_type> || std::same_as<mapq_type, uint8_t>,
288 "The type of field::mapq must be uint8_t.");
289
290 static_assert(detail::decays_to_ignore_v<flag_type> || std::same_as<flag_type, sam_flag>,
291 "The type of field::flag must be seqan3::sam_flag.");
292
293 auto stream_view = seqan3::detail::istreambuf(stream);
294
295 [[maybe_unused]] int32_t offset_tmp{}; // needed in case the cigar string was stored in the tag dictionary
296 [[maybe_unused]] int32_t ref_length{}; // needed in case the cigar string was stored in the tag dictionary
297
298 // Header
299 // -------------------------------------------------------------------------------------------------------------
300 if (!header_was_read)
301 {
302 // magic BAM string
304 throw format_error{"File is not in BAM format."};
305
306 int32_t l_text{}; // length of header text including \0 character
307 int32_t n_ref{}; // number of reference sequences
308 int32_t l_name{}; // 1 + length of reference name including \0 character
309 int32_t l_ref{}; // length of reference sequence
310
311 read_integral_byte_field(stream_view, l_text);
312
313 if (l_text > 0) // header text is present
314 read_header(stream_view | detail::take_exactly_or_throw(l_text), header, ref_seqs);
315
316 read_integral_byte_field(stream_view, n_ref);
317
318 for (int32_t ref_idx = 0; ref_idx < n_ref; ++ref_idx)
319 {
320 read_integral_byte_field(stream_view, l_name);
321
322 string_buffer.resize(l_name - 1);
324 l_name - 1,
325 string_buffer.data()); // copy without \0 character
326 ++std::ranges::begin(stream_view); // skip \0 character
327
328 read_integral_byte_field(stream_view, l_ref);
329
330 if constexpr (detail::decays_to_ignore_v<ref_seqs_type>) // no reference information given
331 {
332 // If there was no header text, we parse reference sequences block as header information
333 if (l_text == 0)
334 {
335 auto & reference_ids = header.ref_ids();
336 // put the length of the reference sequence into ref_id_info
337 header.ref_id_info.emplace_back(l_ref, "");
338 // put the reference name into reference_ids
339 reference_ids.push_back(string_buffer);
340 // assign the reference name an ascending reference id (starts at index 0).
341 header.ref_dict.emplace(reference_ids.back(), reference_ids.size() - 1);
342 continue;
343 }
344 }
345
346 auto id_it = header.ref_dict.find(string_buffer);
347
348 // sanity checks of reference information to existing header object:
349 if (id_it == header.ref_dict.end()) // [unlikely]
350 {
351 throw format_error{detail::to_string("Unknown reference name '" + string_buffer
352 + "' found in BAM file header (header.ref_ids():",
353 header.ref_ids(),
354 ").")};
355 }
356 else if (id_it->second != ref_idx) // [unlikely]
357 {
358 throw format_error{detail::to_string("Reference id '",
360 "' at position ",
361 ref_idx,
362 " does not correspond to the position ",
363 id_it->second,
364 " in the header (header.ref_ids():",
365 header.ref_ids(),
366 ").")};
367 }
368 else if (std::get<0>(header.ref_id_info[id_it->second]) != l_ref) // [unlikely]
369 {
370 throw format_error{"Provided reference has unequal length as specified in the header."};
371 }
372 }
373
374 header_was_read = true;
375
376 if (std::ranges::begin(stream_view) == std::ranges::end(stream_view)) // no records follow
377 return;
378 }
379
380 // read alignment record into buffer
381 // -------------------------------------------------------------------------------------------------------------
382 position_buffer = stream.tellg();
384 std::ranges::copy(stream_view | detail::take_exactly_or_throw(sizeof(core)), reinterpret_cast<char *>(&core));
385
386 if (core.refID >= static_cast<int32_t>(header.ref_ids().size()) || core.refID < -1) // [[unlikely]]
387 {
388 throw format_error{detail::to_string("Reference id index '",
389 core.refID,
390 "' is not in range of ",
391 "header.ref_ids(), which has size ",
392 header.ref_ids().size(),
393 ".")};
394 }
395 else if (core.refID > -1) // not unmapped
396 {
397 ref_id = core.refID; // field::ref_id
398 }
399
400 flag = core.flag; // field::flag
401 mapq = core.mapq; // field::mapq
402
403 if (core.pos > -1) // [[likely]]
404 ref_offset = core.pos; // field::ref_offset
405
406 if constexpr (!detail::decays_to_ignore_v<mate_type>) // field::mate
407 {
408 if (core.next_refID > -1)
409 get<0>(mate) = core.next_refID;
410
411 if (core.next_pos > -1) // [[likely]]
412 get<1>(mate) = core.next_pos;
413
414 get<2>(mate) = core.tlen;
415 }
416
417 // read id
418 // -------------------------------------------------------------------------------------------------------------
419 auto id_view = stream_view | detail::take_exactly_or_throw(core.l_read_name - 1);
420 if constexpr (!detail::decays_to_ignore_v<id_type>)
421 read_forward_range_field(id_view, id); // field::id
422 else
423 detail::consume(id_view);
424 ++std::ranges::begin(stream_view); // skip '\0'
425
426 // read cigar string
427 // -------------------------------------------------------------------------------------------------------------
428 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
429 {
430 int32_t seq_length{};
431 std::tie(cigar_vector, ref_length, seq_length) = parse_binary_cigar(stream_view, core.n_cigar_op);
432 int32_t soft_clipping_end{};
433 transfer_soft_clipping_to(cigar_vector, offset_tmp, soft_clipping_end);
434 }
435 else
436 {
438 }
439
440 offset = offset_tmp;
441
442 // read sequence
443 // -------------------------------------------------------------------------------------------------------------
444 if (core.l_seq > 0) // sequence information is given
445 {
446 auto seq_stream = stream_view | detail::take_exactly_or_throw(core.l_seq / 2) // one too short if uneven
449 {
450 return {dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) >> 4)),
451 dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(c) & 0x0f))};
452 });
453
454 if constexpr (detail::decays_to_ignore_v<seq_type>)
455 {
456 auto skip_sequence_bytes = [&]()
457 {
458 detail::consume(seq_stream);
459 if (core.l_seq & 1)
460 ++std::ranges::begin(stream_view);
461 };
462
463 skip_sequence_bytes();
464 }
465 else
466 {
467 using alph_t = std::ranges::range_value_t<decltype(seq)>;
468 constexpr auto from_dna16 = detail::convert_through_char_representation<dna16sam, alph_t>;
469
470 for (auto [d1, d2] : seq_stream)
471 {
472 seq.push_back(from_dna16[to_rank(d1)]);
473 seq.push_back(from_dna16[to_rank(d2)]);
474 }
475
476 if (core.l_seq & 1)
477 {
478 dna16sam d =
479 dna16sam{}.assign_rank(std::min(15, static_cast<uint8_t>(*std::ranges::begin(stream_view)) >> 4));
480 seq.push_back(from_dna16[to_rank(d)]);
481 ++std::ranges::begin(stream_view);
482 }
483 }
484 }
485
486 // read qual string
487 // -------------------------------------------------------------------------------------------------------------
488 auto qual_view = stream_view | detail::take_exactly_or_throw(core.l_seq)
490 [](char chr)
491 {
492 return static_cast<char>(chr + 33);
493 });
494 if constexpr (!detail::decays_to_ignore_v<qual_type>)
495 read_forward_range_field(qual_view, qual);
496 else
497 detail::consume(qual_view);
498
499 // All remaining optional fields if any: SAM tags dictionary
500 // -------------------------------------------------------------------------------------------------------------
501 int32_t remaining_bytes = core.block_size - (sizeof(alignment_record_core) - 4 /*block_size excluded*/)
502 - core.l_read_name - core.n_cigar_op * 4 - (core.l_seq + 1) / 2 - core.l_seq;
503 assert(remaining_bytes >= 0);
504 auto tags_view = stream_view | detail::take_exactly_or_throw(remaining_bytes);
505
506 while (tags_view.size() > 0)
507 {
508 if constexpr (!detail::decays_to_ignore_v<tag_dict_type>)
509 read_sam_dict_field(tags_view, tag_dict);
510 else
511 detail::consume(tags_view);
512 }
513
514 // DONE READING - wrap up
515 // -------------------------------------------------------------------------------------------------------------
516 if constexpr (!detail::decays_to_ignore_v<cigar_type>)
517 {
518 // Check cigar, if it matches ‘kSmN’, where ‘k’ equals lseq, ‘m’ is the reference sequence length in the
519 // alignment, and ‘S’ and ‘N’ are the soft-clipping and reference-clip, then the cigar string was larger
520 // than 65535 operations and is stored in the sam_tag_dictionary (tag GC).
521 if (core.l_seq != 0 && offset_tmp == core.l_seq)
522 {
523 if constexpr (detail::decays_to_ignore_v<tag_dict_type> | detail::decays_to_ignore_v<seq_type>)
524 { // maybe only throw in debug mode and otherwise return an empty alignment?
525 throw format_error{
526 detail::to_string("The cigar string '",
527 offset_tmp,
528 "S",
529 ref_length,
530 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
531 "stored in the optional field CG. You need to read in the field::tags and "
532 "field::seq in order to access this information.")};
533 }
534 else
535 {
536 auto it = tag_dict.find("CG"_tag);
537
538 if (it == tag_dict.end())
540 "The cigar string '",
541 offset_tmp,
542 "S",
543 ref_length,
544 "N' suggests that the cigar string exceeded 65535 elements and was therefore ",
545 "stored in the optional field CG but this tag is not present in the given ",
546 "record.")};
547
548 auto cigar_view = std::views::all(std::get<std::string>(it->second));
549 int32_t seq_length{};
550 std::tie(cigar_vector, ref_length, seq_length) = detail::parse_cigar(cigar_view);
551 offset_tmp = 0;
552 int32_t soft_clipping_end{};
553 transfer_soft_clipping_to(cigar_vector, offset_tmp, soft_clipping_end);
554 tag_dict.erase(it); // remove redundant information
555 }
556 }
557 }
558}
559
561template <typename stream_type,
562 typename header_type,
563 typename seq_type,
564 typename id_type,
565 typename ref_seq_type,
566 typename ref_id_type,
567 typename cigar_type,
568 typename qual_type,
569 typename mate_type,
570 typename tag_dict_type>
571inline void format_bam::write_alignment_record([[maybe_unused]] stream_type & stream,
572 [[maybe_unused]] sam_file_output_options const & options,
573 [[maybe_unused]] header_type && header,
574 [[maybe_unused]] seq_type && seq,
575 [[maybe_unused]] qual_type && qual,
576 [[maybe_unused]] id_type && id,
577 [[maybe_unused]] int32_t const offset,
578 [[maybe_unused]] ref_seq_type && SEQAN3_DOXYGEN_ONLY(ref_seq),
579 [[maybe_unused]] ref_id_type && ref_id,
580 [[maybe_unused]] std::optional<int32_t> ref_offset,
581 [[maybe_unused]] cigar_type && cigar_vector,
582 [[maybe_unused]] sam_flag const flag,
583 [[maybe_unused]] uint8_t const mapq,
584 [[maybe_unused]] mate_type && mate,
585 [[maybe_unused]] tag_dict_type && tag_dict,
586 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(e_value),
587 [[maybe_unused]] double SEQAN3_DOXYGEN_ONLY(bit_score))
588{
589 // ---------------------------------------------------------------------
590 // Type Requirements (as static asserts for user friendliness)
591 // ---------------------------------------------------------------------
592 static_assert((std::ranges::forward_range<seq_type> && alphabet<std::ranges::range_reference_t<seq_type>>),
593 "The seq object must be a std::ranges::forward_range over "
594 "letters that model seqan3::alphabet.");
595
596 static_assert((std::ranges::forward_range<id_type> && alphabet<std::ranges::range_reference_t<id_type>>),
597 "The id object must be a std::ranges::forward_range over "
598 "letters that model seqan3::alphabet.");
599
600 static_assert((std::ranges::forward_range<ref_seq_type> && alphabet<std::ranges::range_reference_t<ref_seq_type>>),
601 "The ref_seq object must be a std::ranges::forward_range "
602 "over letters that model seqan3::alphabet.");
603
604 if constexpr (!detail::decays_to_ignore_v<ref_id_type>)
605 {
606 static_assert((std::ranges::forward_range<ref_id_type> || std::integral<std::remove_reference_t<ref_id_type>>
607 || detail::is_type_specialisation_of_v<std::remove_cvref_t<ref_id_type>, std::optional>),
608 "The ref_id object must be a std::ranges::forward_range "
609 "over letters that model seqan3::alphabet or an integral or a std::optional<integral>.");
610 }
611
612 static_assert((std::ranges::forward_range<qual_type> && alphabet<std::ranges::range_reference_t<qual_type>>),
613 "The qual object must be a std::ranges::forward_range "
614 "over letters that model seqan3::alphabet.");
615
617 "The mate object must be a std::tuple of size 3 with "
618 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
619 "2) a std::integral or std::optional<std::integral>, and "
620 "3) a std::integral.");
621
622 static_assert(
623 ((std::ranges::forward_range<decltype(std::get<0>(mate))>
624 || std::integral<std::remove_cvref_t<decltype(std::get<0>(mate))>>
625 || detail::is_type_specialisation_of_v<
626 std::remove_cvref_t<decltype(std::get<0>(mate))>,
627 std::optional>)&&(std::integral<std::remove_cvref_t<decltype(std::get<1>(mate))>>
628 || detail::is_type_specialisation_of_v<
629 std::remove_cvref_t<decltype(std::get<1>(mate))>,
630 std::optional>)&&std::integral<std::remove_cvref_t<decltype(std::get<2>(mate))>>),
631 "The mate object must be a std::tuple of size 3 with "
632 "1) a std::ranges::forward_range with a value_type modelling seqan3::alphabet, "
633 "2) a std::integral or std::optional<std::integral>, and "
634 "3) a std::integral.");
635
636 static_assert(std::same_as<std::remove_cvref_t<tag_dict_type>, sam_tag_dictionary>,
637 "The tag_dict object must be of type seqan3::sam_tag_dictionary.");
638
639 if constexpr (detail::decays_to_ignore_v<header_type>)
640 {
641 throw format_error{"BAM can only be written with a header but you did not provide enough information! "
642 "You can either construct the output file with reference names and reference length "
643 "information and the header will be created for you, or you can access the `header` member "
644 "directly."};
645 }
646 else
647 {
648 // ---------------------------------------------------------------------
649 // logical Requirements
650 // ---------------------------------------------------------------------
651
652 if (ref_offset.has_value() && (ref_offset.value() + 1) < 0)
653 throw format_error{detail::to_string("The ref_offset object must be >= -1 but is: ", ref_offset)};
654
655 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
656
657 // ---------------------------------------------------------------------
658 // Writing the BAM Header on first call
659 // ---------------------------------------------------------------------
661 {
662 write_header(stream, options, header);
663 header_was_written = true;
664 }
665
666 // ---------------------------------------------------------------------
667 // Writing the Record
668 // ---------------------------------------------------------------------
669 int32_t ref_length{};
670
671 // Compute the ref_length from given cigar_vector which is needed to fill field `bin`.
672 if (!std::ranges::empty(cigar_vector))
673 {
674 int32_t dummy_seq_length{};
675 for (auto & [count, operation] : cigar_vector)
676 detail::update_alignment_lengths(ref_length, dummy_seq_length, operation.to_char(), count);
677 }
678
679 if (cigar_vector.size() >= (1 << 16)) // must be written into the sam tag CG
680 {
681 tag_dict["CG"_tag] = detail::get_cigar_string(cigar_vector);
682 cigar_vector.resize(2);
683 cigar_vector[0] = cigar{static_cast<uint32_t>(std::ranges::distance(seq)), 'S'_cigar_operation};
684 cigar_vector[1] = cigar{static_cast<uint32_t>(ref_length), 'N'_cigar_operation};
685 }
686
687 std::string tag_dict_binary_str = get_tag_dict_str(tag_dict);
688
689 // Compute the value for the l_read_name field for the bam record.
690 // This value is stored including a trailing `0`, so at most 254 characters of the id can be stored, since
691 // the data type to store the value is uint8_t and 255 is the maximal size.
692 // If the id is empty a '*' is written instead, i.e. the written id is never an empty string and stores at least
693 // 2 bytes.
694 uint8_t read_name_size = std::min<uint8_t>(std::ranges::distance(id), 254) + 1;
695 read_name_size += static_cast<uint8_t>(read_name_size == 1); // need size two since empty id is stored as '*'.
696
697 alignment_record_core core{/* block_size */ 0, // will be initialised right after
698 /* refID */ -1, // will be initialised right after
699 /* pos */ ref_offset.value_or(-1),
700 /* l_read_name */ read_name_size,
701 /* mapq */ mapq,
702 /* bin */ reg2bin(ref_offset.value_or(-1), ref_length),
703 /* n_cigar_op */ static_cast<uint16_t>(cigar_vector.size()),
704 /* flag */ flag,
705 /* l_seq */ static_cast<int32_t>(std::ranges::distance(seq)),
706 /* next_refId */ -1, // will be initialised right after
707 /* next_pos */ get<1>(mate).value_or(-1),
708 /* tlen */ get<2>(mate)};
709
710 auto check_and_assign_id_to = [&header]([[maybe_unused]] auto & id_source, [[maybe_unused]] auto & id_target)
711 {
712 using id_t = std::remove_reference_t<decltype(id_source)>;
713
714 if constexpr (!detail::decays_to_ignore_v<id_t>)
715 {
716 if constexpr (std::integral<id_t>)
717 {
718 id_target = id_source;
719 }
720 else if constexpr (detail::is_type_specialisation_of_v<id_t, std::optional>)
721 {
722 id_target = id_source.value_or(-1);
723 }
724 else
725 {
726 if (!std::ranges::empty(id_source)) // otherwise default will remain (-1)
727 {
728 auto id_it = header.ref_dict.end();
729
730 if constexpr (std::ranges::contiguous_range<decltype(id_source)>
731 && std::ranges::sized_range<decltype(id_source)>
732 && std::ranges::borrowed_range<decltype(id_source)>)
733 {
734 id_it = header.ref_dict.find(
735 std::span{std::ranges::data(id_source), std::ranges::size(id_source)});
736 }
737 else
738 {
739 using header_ref_id_type = std::remove_reference_t<decltype(header.ref_ids()[0])>;
740
741 static_assert(
742 implicitly_convertible_to<decltype(id_source), header_ref_id_type>,
743 "The ref_id type is not convertible to the reference id information stored in the "
744 "reference dictionary of the header object.");
745
746 id_it = header.ref_dict.find(id_source);
747 }
748
749 if (id_it == header.ref_dict.end())
750 {
751 throw format_error{detail::to_string("Unknown reference name '",
752 id_source,
753 "' could "
754 "not be found in BAM header ref_dict: ",
755 header.ref_dict,
756 ".")};
757 }
758
759 id_target = id_it->second;
760 }
761 }
762 }
763 };
764
765 // initialise core.refID
766 check_and_assign_id_to(ref_id, core.refID);
767
768 // initialise core.next_refID
769 check_and_assign_id_to(get<0>(mate), core.next_refID);
770
771 // initialise core.block_size
772 core.block_size = sizeof(core) - 4 /*block_size excluded*/ + core.l_read_name + core.n_cigar_op * 4
773 + // each int32_t has 4 bytes
774 (core.l_seq + 1) / 2 + // bitcompressed seq
775 core.l_seq + // quality string
776 tag_dict_binary_str.size();
777
778 std::ranges::copy_n(reinterpret_cast<char *>(&core), sizeof(core), stream_it); // write core
779
780 if (std::ranges::empty(id)) // empty id is represented as * for backward compatibility
781 stream_it = '*';
782 else
783 std::ranges::copy_n(std::ranges::begin(id), core.l_read_name - 1, stream_it); // write read id
784 stream_it = '\0';
785
786 // write cigar
787 for (auto [cigar_count, op] : cigar_vector)
788 {
789 cigar_count = cigar_count << 4;
790 cigar_count |= static_cast<int32_t>(char_to_sam_rank[op.to_char()]);
791 std::ranges::copy_n(reinterpret_cast<char *>(&cigar_count), 4, stream_it);
792 }
793
794 // write seq (bit-compressed: dna16sam characters go into one byte)
795 using alph_t = std::ranges::range_value_t<seq_type>;
796 constexpr auto to_dna16 = detail::convert_through_char_representation<alph_t, dna16sam>;
797
798 auto sit = std::ranges::begin(seq);
799 for (int32_t sidx = 0; sidx < ((core.l_seq & 1) ? core.l_seq - 1 : core.l_seq); ++sidx, ++sit)
800 {
801 uint8_t compressed_chr = to_rank(to_dna16[to_rank(*sit)]) << 4;
802 ++sidx, ++sit;
803 compressed_chr |= to_rank(to_dna16[to_rank(*sit)]);
804 stream_it = static_cast<char>(compressed_chr);
805 }
806
807 if (core.l_seq & 1) // write one more
808 stream_it = static_cast<char>(to_rank(to_dna16[to_rank(*sit)]) << 4);
809
810 // write qual
811 if (std::ranges::empty(qual))
812 {
813 auto v = views::repeat_n(static_cast<char>(255), core.l_seq);
814 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
815 }
816 else
817 {
818 if (std::ranges::distance(qual) != core.l_seq)
819 throw format_error{detail::to_string("Expected quality of same length as sequence with size ",
820 core.l_seq,
821 ". Got quality with size ",
822 std::ranges::distance(qual),
823 " instead.")};
824
825 auto v = qual
827 [](auto chr)
828 {
829 return static_cast<char>(to_rank(chr));
830 });
831 std::ranges::copy_n(v.begin(), core.l_seq, stream_it);
832 }
833
834 // write optional fields
835 stream << tag_dict_binary_str;
836 } // if constexpr (!detail::decays_to_ignore_v<header_type>)
837}
838
840template <typename stream_t, typename header_type>
841inline void format_bam::write_header(stream_t & stream, sam_file_output_options const & options, header_type & header)
842{
843 if constexpr (detail::decays_to_ignore_v<header_type>)
844 {
845 throw format_error{"BAM can only be written with a header but you did not provide enough information! "
846 "You can either construct the output file with reference names and reference length "
847 "information and the header will be created for you, or you can access the `header` member "
848 "directly."};
849 }
850 else
851 {
852 detail::fast_ostreambuf_iterator stream_it{*stream.rdbuf()};
853
854 std::ranges::copy_n("BAM\1", 4, stream_it); // Do not copy the null terminator
855
856 // write SAM header to temporary stream first to query its size.
858 detail::format_sam_base::write_header(os, options, header);
859#if SEQAN3_COMPILER_IS_GCC && (__GNUC__ == 10)
860 int32_t const l_text{static_cast<int32_t>(os.str().size())};
861#else
862 int32_t const l_text{static_cast<int32_t>(os.view().size())};
863#endif
864 std::ranges::copy_n(reinterpret_cast<char const *>(&l_text), 4, stream_it); // write text length
865
866#if SEQAN3_COMPILER_IS_GCC && (__GNUC__ == 10)
867 auto header_view = os.str();
868#else
869 auto header_view = os.view();
870#endif
871 std::ranges::copy(header_view, stream_it);
872
873 assert(header.ref_ids().size() < (1ull << 32));
874 int32_t const n_ref{static_cast<int32_t>(header.ref_ids().size())};
875 std::ranges::copy_n(reinterpret_cast<char const *>(&n_ref), 4, stream_it); // write number of references
876
877 for (int32_t ridx = 0; ridx < n_ref; ++ridx)
878 {
879 assert(header.ref_ids()[ridx].size() + 1 < (1ull << 32));
880 int32_t const l_name{static_cast<int32_t>(header.ref_ids()[ridx].size()) + 1}; // plus null character
881 std::ranges::copy_n(reinterpret_cast<char const *>(&l_name), 4, stream_it); // write l_name
882 // write reference name:
883 std::ranges::copy(header.ref_ids()[ridx], stream_it);
884 stream_it = '\0'; // ++ is not necessary for ostream_iterator
885 // write reference sequence length:
886 std::ranges::copy_n(reinterpret_cast<char *>(&get<0>(header.ref_id_info[ridx])), 4, stream_it);
887 }
888 }
889}
890
892template <typename stream_view_type, typename value_type>
894 stream_view_type && stream_view,
895 value_type const & SEQAN3_DOXYGEN_ONLY(value))
896{
897 int32_t count;
898 read_integral_byte_field(stream_view, count); // read length of vector
899 std::vector<value_type> tmp_vector;
900 tmp_vector.reserve(count);
901
902 while (count > 0)
903 {
904 value_type tmp{};
905 if constexpr (std::integral<value_type>)
906 {
907 read_integral_byte_field(stream_view, tmp);
908 }
909 else if constexpr (std::same_as<value_type, float>)
910 {
911 read_float_byte_field(stream_view, tmp);
912 }
913 else
914 {
915 constexpr bool always_false = std::is_same_v<value_type, void>;
916 static_assert(always_false, "format_bam::read_sam_dict_vector: unsupported value_type");
917 }
918 tmp_vector.push_back(std::move(tmp));
919 --count;
920 }
921 variant = std::move(tmp_vector);
922}
923
941template <typename stream_view_type>
942inline void format_bam::read_sam_dict_field(stream_view_type && stream_view, sam_tag_dictionary & target)
943{
944 /* Every BA< tag has the format "[TAG][TYPE_ID][VALUE]", where TAG is a two letter
945 name tag which is converted to a unique integer identifier and TYPE_ID is one character in [A,i,Z,H,B,f]
946 describing the type for the upcoming VALUES. If TYPE_ID=='B' it signals an array of
947 VALUE's and the inner value type is identified by the next character, one of [cCsSiIf], followed
948 by the length (int32_t) of the array, followed by the values.
949 */
950 auto it = std::ranges::begin(stream_view);
951 uint16_t tag = static_cast<uint16_t>(*it) << 8;
952 ++it; // skip char read before
953
954 tag += static_cast<uint16_t>(*it);
955 ++it; // skip char read before
956
957 char type_id = *it;
958 ++it; // skip char read before
959
960 switch (type_id)
961 {
962 case 'A': // char
963 {
964 target[tag] = *it;
965 ++it; // skip char that has been read
966 break;
967 }
968 // all integer sizes are possible
969 case 'c': // int8_t
970 {
971 int8_t tmp;
972 read_integral_byte_field(stream_view, tmp);
973 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
974 break;
975 }
976 case 'C': // uint8_t
977 {
978 uint8_t tmp;
979 read_integral_byte_field(stream_view, tmp);
980 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
981 break;
982 }
983 case 's': // int16_t
984 {
985 int16_t tmp;
986 read_integral_byte_field(stream_view, tmp);
987 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
988 break;
989 }
990 case 'S': // uint16_t
991 {
992 uint16_t tmp;
993 read_integral_byte_field(stream_view, tmp);
994 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
995 break;
996 }
997 case 'i': // int32_t
998 {
999 int32_t tmp;
1000 read_integral_byte_field(stream_view, tmp);
1001 target[tag] = std::move(tmp); // readable sam format only allows int32_t
1002 break;
1003 }
1004 case 'I': // uint32_t
1005 {
1006 uint32_t tmp;
1007 read_integral_byte_field(stream_view, tmp);
1008 target[tag] = static_cast<int32_t>(tmp); // readable sam format only allows int32_t
1009 break;
1010 }
1011 case 'f': // float
1012 {
1013 float tmp;
1014 read_float_byte_field(stream_view, tmp);
1015 target[tag] = tmp;
1016 break;
1017 }
1018 case 'Z': // string
1019 {
1021 while (!is_char<'\0'>(*it))
1022 {
1024 ++it;
1025 }
1026 ++it; // skip \0
1027 target[tag] = string_buffer;
1028 break;
1029 }
1030 case 'H': // byte array, represented as null-terminated string; specification requires even number of bytes
1031 {
1032 std::vector<std::byte> byte_array;
1033 std::byte value;
1034 while (!is_char<'\0'>(*it))
1035 {
1038 ++it;
1039
1040 if (*it == '\0')
1041 throw format_error{"Hexadecimal tag has an uneven number of digits!"};
1042
1044 ++it;
1046 byte_array.push_back(value);
1047 }
1048 ++it; // skip \0
1049 target[tag] = byte_array;
1050 break;
1051 }
1052 case 'B': // Array. Value type depends on second char [cCsSiIf]
1053 {
1054 char array_value_type_id = *it;
1055 ++it; // skip char read before
1056
1057 switch (array_value_type_id)
1058 {
1059 case 'c': // int8_t
1060 read_sam_dict_vector(target[tag], stream_view, int8_t{});
1061 break;
1062 case 'C': // uint8_t
1063 read_sam_dict_vector(target[tag], stream_view, uint8_t{});
1064 break;
1065 case 's': // int16_t
1066 read_sam_dict_vector(target[tag], stream_view, int16_t{});
1067 break;
1068 case 'S': // uint16_t
1069 read_sam_dict_vector(target[tag], stream_view, uint16_t{});
1070 break;
1071 case 'i': // int32_t
1072 read_sam_dict_vector(target[tag], stream_view, int32_t{});
1073 break;
1074 case 'I': // uint32_t
1075 read_sam_dict_vector(target[tag], stream_view, uint32_t{});
1076 break;
1077 case 'f': // float
1078 read_sam_dict_vector(target[tag], stream_view, float{});
1079 break;
1080 default:
1081 throw format_error{detail::to_string("The first character in the numerical id of a SAM tag ",
1082 "must be one of [cCsSiIf] but '",
1083 array_value_type_id,
1084 "' was given.")};
1085 }
1086 break;
1087 }
1088 default:
1089 throw format_error{detail::to_string("The second character in the numerical id of a "
1090 "SAM tag must be one of [A,i,Z,H,B,f] but '",
1091 type_id,
1092 "' was given.")};
1093 }
1094}
1095
1110template <typename cigar_input_type>
1111inline auto format_bam::parse_binary_cigar(cigar_input_type && cigar_input, uint16_t n_cigar_op) const
1112{
1113 std::vector<cigar> operations{};
1114 char operation{'\0'};
1115 uint32_t count{};
1116 int32_t ref_length{}, seq_length{};
1117 uint32_t operation_and_count{}; // in BAM operation and count values are stored within one 32 bit integer
1118 constexpr char const * cigar_mapping = "MIDNSHP=X*******";
1119 constexpr uint32_t cigar_mask = 0x0f; // 0000000000001111
1120
1121 if (n_cigar_op == 0) // [[unlikely]]
1122 return std::tuple{operations, ref_length, seq_length};
1123
1124 // parse the rest of the cigar
1125 // -------------------------------------------------------------------------------------------------------------
1126 while (n_cigar_op > 0) // until stream is not empty
1127 {
1129 sizeof(operation_and_count),
1130 reinterpret_cast<char *>(&operation_and_count));
1131 operation = cigar_mapping[operation_and_count & cigar_mask];
1132 count = operation_and_count >> 4;
1133
1134 detail::update_alignment_lengths(ref_length, seq_length, operation, count);
1135 operations.emplace_back(count, cigar::operation{}.assign_char(operation));
1136 --n_cigar_op;
1137 }
1138
1139 return std::tuple{operations, ref_length, seq_length};
1140}
1141
1146{
1147 std::string result{};
1148
1149 auto stream_variant_fn = [&result](auto && arg) // helper to print a std::variant
1150 {
1151 // T is either char, int32_t, float, std::string, or a std::vector<some int>
1152 using T = std::remove_cvref_t<decltype(arg)>;
1153
1154 if constexpr (std::same_as<T, int32_t>)
1155 {
1156 // always choose the smallest possible representation [cCsSiI]
1157 size_t const absolute_arg = std::abs(arg);
1158 auto n = std::countr_zero(std::bit_ceil(absolute_arg + 1u) >> 1u) / 8u;
1159 bool const negative = arg < 0;
1160 n = n * n + 2 * negative; // for switch case order
1161
1162 switch (n)
1163 {
1164 case 0:
1165 {
1166 result[result.size() - 1] = 'C';
1167 result.append(reinterpret_cast<char const *>(&arg), 1);
1168 break;
1169 }
1170 case 1:
1171 {
1172 result[result.size() - 1] = 'S';
1173 result.append(reinterpret_cast<char const *>(&arg), 2);
1174 break;
1175 }
1176 case 2:
1177 {
1178 result[result.size() - 1] = 'c';
1179 int8_t tmp = static_cast<int8_t>(arg);
1180 result.append(reinterpret_cast<char const *>(&tmp), 1);
1181 break;
1182 }
1183 case 3:
1184 {
1185 result[result.size() - 1] = 's';
1186 int16_t tmp = static_cast<int16_t>(arg);
1187 result.append(reinterpret_cast<char const *>(&tmp), 2);
1188 break;
1189 }
1190 default:
1191 {
1192 result.append(reinterpret_cast<char const *>(&arg), 4); // always i
1193 break;
1194 }
1195 }
1196 }
1197 else if constexpr (std::same_as<T, std::string>)
1198 {
1199 result.append(reinterpret_cast<char const *>(arg.data()), arg.size() + 1 /*+ null character*/);
1200 }
1201 else if constexpr (!std::ranges::range<T>) // char, float
1202 {
1203 result.append(reinterpret_cast<char const *>(&arg), sizeof(arg));
1204 }
1205 else // std::vector of some arithmetic_type type
1206 {
1207 int32_t sz{static_cast<int32_t>(arg.size())};
1208 result.append(reinterpret_cast<char *>(&sz), 4);
1209 result.append(reinterpret_cast<char const *>(arg.data()),
1210 arg.size() * sizeof(std::ranges::range_value_t<T>));
1211 }
1212 };
1213
1214 for (auto & [tag, variant] : tag_dict)
1215 {
1216 result.push_back(static_cast<char>(tag / 256));
1217 result.push_back(static_cast<char>(tag % 256));
1218
1219 result.push_back(detail::sam_tag_type_char[variant.index()]);
1220
1221 if (!is_char<'\0'>(detail::sam_tag_type_char_extra[variant.index()]))
1222 result.push_back(detail::sam_tag_type_char_extra[variant.index()]);
1223
1224 std::visit(stream_variant_fn, variant);
1225 }
1226
1227 return result;
1228}
1229
1230} // namespace seqan3
T begin(T... args)
T bit_ceil(T... args)
constexpr derived_type & assign_char(char_type const chr) noexcept
Assign from a character, implicitly converts invalid characters.
Definition: alphabet_base.hpp:163
constexpr derived_type & assign_rank(rank_type const c) noexcept
Assign from a numeric value.
Definition: alphabet_base.hpp:187
The seqan3::cigar semialphabet pairs a counter with a seqan3::cigar::operation letter.
Definition: alphabet/cigar/cigar.hpp:60
Functionally the same as std::ostreambuf_iterator, but offers writing a range more efficiently.
Definition: fast_ostreambuf_iterator.hpp:40
The alignment base format.
Definition: format_sam_base.hpp:45
void write_header(stream_t &stream, sam_file_output_options const &options, header_type &header)
Writes the SAM header.
Definition: format_sam_base.hpp:636
void transfer_soft_clipping_to(std::vector< cigar > const &cigar_vector, int32_t &sc_begin, int32_t &sc_end) const
Transfer soft clipping information from the cigar_vector to sc_begin and sc_end.
Definition: format_sam_base.hpp:150
void read_forward_range_field(stream_view_type &&stream_view, target_range_type &target)
Reads a range by copying from stream_view to target, converting values with seqan3::views::char_to.
Definition: format_sam_base.hpp:231
bool header_was_written
A variable that tracks whether the content of header has been written or not.
Definition: format_sam_base.hpp:66
void read_header(stream_view_type &&stream_view, sam_file_header< ref_ids_type > &hdr, ref_seqs_type &)
Reads the SAM header.
Definition: format_sam_base.hpp:299
void read_byte_field(stream_view_t &&stream_view, std::byte &byte_target)
Reads std::byte fields using std::from_chars.
Definition: format_sam_base.hpp:203
A 16 letter DNA alphabet, containing all IUPAC symbols minus the gap and plus an equality sign ('=')....
Definition: dna16sam.hpp:48
The actual implementation of seqan3::cigar::operation for documentation purposes only....
Definition: cigar_operation.hpp:48
The BAM format.
Definition: format_bam.hpp:50
void read_sam_dict_field(stream_view_type &&stream_view, sam_tag_dictionary &target)
Reads the optional tag fields into the seqan3::sam_tag_dictionary.
Definition: format_bam.hpp:942
format_bam()=default
Defaulted.
void read_alignment_record(stream_type &stream, sam_file_input_options< seq_legal_alph_type > const &options, ref_seqs_type &ref_seqs, sam_file_header< ref_ids_type > &header, stream_pos_type &position_buffer, seq_type &seq, qual_type &qual, id_type &id, offset_type &offset, ref_seq_type &ref_seq, ref_id_type &ref_id, ref_offset_type &ref_offset, cigar_type &cigar_vector, flag_type &flag, mapq_type &mapq, mate_type &mate, tag_dict_type &tag_dict, e_value_type &e_value, bit_score_type &bit_score)
Read from the specified stream and back-insert into the given field buffers.
Definition: format_bam.hpp:263
format_bam & operator=(format_bam &&)=default
Defaulted.
format_bam & operator=(format_bam const &)=default
Defaulted.
void write_header(stream_t &stream, sam_file_output_options const &options, header_type &header)
Writes the SAM header.
Definition: format_bam.hpp:841
format_bam(format_bam &&)=default
Defaulted.
void read_integral_byte_field(stream_view_type &&stream_view, number_type &target)
Reads a arithmetic field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:212
~format_bam()=default
Defaulted.
format_bam(format_bam const &)=default
Defaulted.
auto parse_binary_cigar(cigar_input_type &&cigar_input, uint16_t n_cigar_op) const
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: format_bam.hpp:1111
void write_alignment_record(stream_type &stream, sam_file_output_options const &options, header_type &&header, seq_type &&seq, qual_type &&qual, id_type &&id, int32_t const offset, ref_seq_type &&ref_seq, ref_id_type &&ref_id, std::optional< int32_t > ref_offset, cigar_type &&cigar_vector, sam_flag const flag, uint8_t const mapq, mate_type &&mate, tag_dict_type &&tag_dict, double e_value, double bit_score)
Write the given fields to the specified stream.
Definition: format_bam.hpp:571
void read_float_byte_field(stream_view_type &&stream_view, float &target)
Reads a float field from binary stream by directly reinterpreting the bits.
Definition: format_bam.hpp:223
static std::string get_tag_dict_str(sam_tag_dictionary const &tag_dict)
Writes the optional fields of the seqan3::sam_tag_dictionary.
Definition: format_bam.hpp:1145
std::string string_buffer
Local buffer to read into while avoiding reallocation.
Definition: format_bam.hpp:145
static uint16_t reg2bin(int32_t beg, int32_t end) noexcept
Computes the bin number for a given region [beg, end), copied from the official SAM specifications.
Definition: format_bam.hpp:189
void read_sam_dict_vector(seqan3::detail::sam_tag_variant &variant, stream_view_type &&stream_view, value_type const &value)
Reads a list of values separated by comma as it is the case for SAM tag arrays.
Definition: format_bam.hpp:893
bool header_was_read
A variable that tracks whether the content of header has been read or not.
Definition: format_bam.hpp:142
static constexpr std::array< uint8_t, 256 > char_to_sam_rank
Converts a cigar op character to the rank according to the official BAM specifications.
Definition: format_bam.hpp:167
static std::vector< std::string > file_extensions
The valid file extensions for this format; note that you can modify this value.
Definition: format_bam.hpp:66
Stores the header information of SAM/BAM files.
Definition: header.hpp:34
ref_ids_type & ref_ids()
The range of reference ids.
Definition: header.hpp:143
std::unordered_map< key_type, int32_t, key_hasher, detail::view_equality_fn > ref_dict
The mapping of reference id to position in the ref_ids() range and the ref_id_info range.
Definition: header.hpp:182
std::vector< std::tuple< int32_t, std::string > > ref_id_info
The reference information. (used by the SAM/BAM format)
Definition: header.hpp:179
The SAM tag dictionary class that stores all optional SAM fields.
Definition: sam_tag_dictionary.hpp:343
T clear(T... args)
T copy(T... args)
T copy_n(T... args)
T countr_zero(T... args)
T data(T... args)
Provides seqan3::dna16sam.
T emplace_back(T... args)
T equal(T... args)
Provides seqan3::detail::fast_ostreambuf_iterator.
T find(T... args)
Provides the seqan3::format_sam_base that can be inherited from.
constexpr auto to_rank
Return the rank representation of a (semi-)alphabet object.
Definition: alphabet/concept.hpp:155
constexpr void consume(rng_t &&rng)
Iterate over a range (consumes single-pass input ranges).
Definition: core/range/detail/misc.hpp:28
constexpr auto all
Returns a view that includes all elements of the range argument.
Definition: all_view.hpp:204
sam_flag
An enum flag that describes the properties of an aligned read (given as a SAM record).
Definition: sam_flag.hpp:76
constexpr std::tuple< std::vector< cigar >, int32_t, int32_t > parse_cigar(cigar_input_type &&cigar_input)
Parses a cigar string into a vector of operation-count pairs (e.g. (M, 3)).
Definition: io/sam_file/detail/cigar.hpp:94
std::string get_cigar_string(std::vector< cigar > const &cigar_vector)
Transforms a vector of cigar elements into a string representation.
Definition: io/sam_file/detail/cigar.hpp:128
constexpr char sam_tag_type_char_extra[12]
Each types SAM tag type extra char id. Index corresponds to the seqan3::detail::sam_tag_variant types...
Definition: sam_tag_dictionary.hpp:45
void update_alignment_lengths(int32_t &ref_length, int32_t &seq_length, char const cigar_operation, uint32_t const cigar_count)
Updates the sequence lengths by cigar_count depending on the cigar operation op.
Definition: io/sam_file/detail/cigar.hpp:51
constexpr char sam_tag_type_char[12]
Each SAM tag type char identifier. Index corresponds to the seqan3::detail::sam_tag_variant types.
Definition: sam_tag_dictionary.hpp:42
constexpr auto take_exactly_or_throw
A view adaptor that returns the first size elements from the underlying range and also exposes size i...
Definition: take_exactly_view.hpp:590
constexpr auto istreambuf
A view factory that returns a view over the stream buffer of an input stream.
Definition: istreambuf_view.hpp:107
@ flag
The alignment flag (bit information), uint16_t value.
@ ref_offset
Sequence (seqan3::field::ref_seq) relative start position (0-based), unsigned value.
@ mapq
The mapping quality of the seqan3::field::seq alignment, usually a Phred-scaled score.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ mate
The mate pair information given as a std::tuple of reference name, offset and template length.
@ ref_id
The identifier of the (reference) sequence that seqan3::field::seq was aligned to.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
@ qual
The qualities, usually in Phred score notation.
decltype(detail::transform< trait_t >(list_t{})) transform
Apply a transformation trait to every type in the list and return a seqan3::type_list of the results.
Definition: type_list/traits.hpp:470
constexpr ptrdiff_t count
Count the occurrences of a type in a pack.
Definition: type_pack/traits.hpp:164
constexpr size_t size
The size of a type pack.
Definition: type_pack/traits.hpp:146
constexpr auto repeat_n
A view factory that repeats a given value n times.
Definition: repeat_n.hpp:91
Provides the seqan3::sam_file_header class.
T index(T... args)
The generic alphabet concept that covers most data types used in ranges.
Checks whether from can be implicityly converted to to.
Whether a type behaves like a tuple.
Auxiliary functions for the SAM IO.
Provides seqan3::detail::istreambuf.
T min(T... args)
std::string to_string(value_type &&... values)
Streams all parameters via the seqan3::debug_stream and returns a concatenated string.
Definition: to_string.hpp:29
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides seqan3::debug_stream and related types.
T push_back(T... args)
T reserve(T... args)
T resize(T... args)
Provides seqan3::sam_file_input_options.
Provides helper data structures for the seqan3::sam_file_output.
Provides the seqan3::sam_tag_dictionary class and auxiliaries.
T size(T... args)
Provides seqan3::views::slice.
T str(T... args)
Stores all fixed length variables which can be read/written directly by reinterpreting the binary str...
Definition: format_bam.hpp:149
uint32_t bin
The bin number.
Definition: format_bam.hpp:155
sam_flag flag
The flag value (uint16_t enum).
Definition: format_bam.hpp:157
uint32_t n_cigar_op
The number of cigar operations of the alignment.
Definition: format_bam.hpp:156
int32_t next_refID
The reference id of the mate.
Definition: format_bam.hpp:159
int32_t l_seq
The number of bases of the read sequence.
Definition: format_bam.hpp:158
int32_t refID
The reference id the read was mapped to.
Definition: format_bam.hpp:151
int32_t pos
The begin position of the alignment.
Definition: format_bam.hpp:152
int32_t block_size
The size in bytes of the whole BAM record.
Definition: format_bam.hpp:150
uint32_t l_read_name
The length of the read name including the \0 character.
Definition: format_bam.hpp:153
int32_t tlen
The template length of the read and its mate.
Definition: format_bam.hpp:161
uint32_t mapq
The mapping quality.
Definition: format_bam.hpp:154
int32_t next_pos
The begin position of the mate alignment.
Definition: format_bam.hpp:160
Thrown if information given to output format didn't match expectations.
Definition: io/exception.hpp:91
The options type defines various option members that influence the behaviour of all or some formats.
Definition: sam_file/input_options.hpp:27
The options type defines various option members that influence the behavior of all or some formats.
Definition: sam_file/output_options.hpp:26
Provides seqan3::views::take_exactly and seqan3::views::take_exactly_or_throw.
T tie(T... args)
T visit(T... args)