BamAlignment.h

Go to the documentation of this file.
00001 // ***************************************************************************
00002 // BamAlignment.h (c) 2009 Derek Barnett
00003 // Marth Lab, Department of Biology, Boston College
00004 // ---------------------------------------------------------------------------
00005 // Last modified: 16 October 2011 (DB)
00006 // ---------------------------------------------------------------------------
00007 // Provides the BamAlignment data structure
00008 // ***************************************************************************
00009 
00010 #ifndef BAMALIGNMENT_H
00011 #define BAMALIGNMENT_H
00012 
00013 #include "api/api_global.h"
00014 #include "api/BamAux.h"
00015 #include "api/BamConstants.h"
00016 #include <cstdlib>
00017 #include <cstring>
00018 #include <string>
00019 #include <vector>
00020 
00021 namespace BamTools {
00022 
00024 // forward declaration of BamAlignment's "friends"
00025 namespace Internal {
00026     class BamReaderPrivate;
00027     class BamWriterPrivate;
00028 } // namespace Internal
00030 
00031 // BamAlignment data structure
00032 struct API_EXPORT BamAlignment {
00033 
00034     // constructors & destructor
00035     public:
00036         BamAlignment(void);
00037         BamAlignment(const BamAlignment& other);
00038         ~BamAlignment(void);
00039 
00040     // queries against alignment flags
00041     public:        
00042         bool IsDuplicate(void) const;         // returns true if this read is a PCR duplicate
00043         bool IsFailedQC(void) const;          // returns true if this read failed quality control
00044         bool IsFirstMate(void) const;         // returns true if alignment is first mate on read
00045         bool IsMapped(void) const;            // returns true if alignment is mapped
00046         bool IsMateMapped(void) const;        // returns true if alignment's mate is mapped
00047         bool IsMateReverseStrand(void) const; // returns true if alignment's mate mapped to reverse strand
00048         bool IsPaired(void) const;            // returns true if alignment part of paired-end read
00049         bool IsPrimaryAlignment(void) const;  // returns true if reported position is primary alignment
00050         bool IsProperPair(void) const;        // returns true if alignment is part of read that satisfied paired-end resolution
00051         bool IsReverseStrand(void) const;     // returns true if alignment mapped to reverse strand
00052         bool IsSecondMate(void) const;        // returns true if alignment is second mate on read
00053 
00054     // manipulate alignment flags
00055     public:        
00056         void SetIsDuplicate(bool ok);         // sets value of "PCR duplicate" flag
00057         void SetIsFailedQC(bool ok);          // sets value of "failed quality control" flag
00058         void SetIsFirstMate(bool ok);         // sets value of "alignment is first mate" flag
00059         void SetIsMapped(bool ok);            // sets value of "alignment is mapped" flag
00060         void SetIsMateMapped(bool ok);        // sets value of "alignment's mate is mapped" flag
00061         void SetIsMateReverseStrand(bool ok); // sets value of "alignment's mate mapped to reverse strand" flag
00062         void SetIsPaired(bool ok);            // sets value of "alignment part of paired-end read" flag
00063         void SetIsPrimaryAlignment(bool ok);  // sets value of "position is primary alignment" flag
00064         void SetIsProperPair(bool ok);        // sets value of "alignment is part of read that satisfied paired-end resolution" flag
00065         void SetIsReverseStrand(bool ok);     // sets value of "alignment mapped to reverse strand" flag
00066         void SetIsSecondMate(bool ok);        // sets value of "alignment is second mate on read" flag
00067 
00068     // tag data access methods
00069     public:
00070 
00071         // add a new tag
00072         template<typename T> bool AddTag(const std::string& tag, const std::string& type, const T& value);
00073         template<typename T> bool AddTag(const std::string& tag, const std::vector<T>& values);
00074 
00075         // edit (or append) tag
00076         template<typename T> bool EditTag(const std::string& tag, const std::string& type, const T& value);
00077         template<typename T> bool EditTag(const std::string& tag, const std::vector<T>& values);
00078 
00079         // retrieves tag data
00080         template<typename T> bool GetTag(const std::string& tag, T& destination) const;
00081         template<typename T> bool GetTag(const std::string& tag, std::vector<T>& destination) const;
00082 
00083         // retrieves the SAM/BAM type-code for requested tag name
00084         bool GetTagType(const std::string& tag, char& type) const;
00085 
00086         // returns true if alignment has a record for this tag name
00087         bool HasTag(const std::string& tag) const;
00088 
00089         // removes a tag
00090         void RemoveTag(const std::string& tag);
00091 
00092     // additional methods
00093     public:
00094         // populates alignment string fields
00095         bool BuildCharData(void);
00096 
00097         // calculates alignment end position
00098         int GetEndPosition(bool usePadded = false, bool closedInterval = false) const;
00099 
00100         // returns a description of the last error that occurred
00101         std::string GetErrorString(void) const;
00102 
00103         // retrieves the size, read locations and reference locations of soft-clip operations
00104         bool GetSoftClips(std::vector<int>& clipSizes,
00105                           std::vector<int>& readPositions,
00106                           std::vector<int>& genomePositions,
00107                           bool usePadded = false) const;
00108 
00109     // public data fields
00110     public:
00111         std::string Name;               // read name
00112         int32_t     Length;             // length of query sequence
00113         std::string QueryBases;         // 'original' sequence (as reported from sequencing machine)
00114         std::string AlignedBases;       // 'aligned' sequence (includes any indels, padding, clipping)
00115         std::string Qualities;          // FASTQ qualities (ASCII characters, not numeric values)
00116         std::string TagData;            // tag data (use provided methods to query/modify)
00117         int32_t     RefID;              // ID number for reference sequence
00118         int32_t     Position;           // position (0-based) where alignment starts
00119         uint16_t    Bin;                // BAM (standard) index bin number for this alignment
00120         uint16_t    MapQuality;         // mapping quality score
00121         uint32_t    AlignmentFlag;      // alignment bit-flag (use provided methods to query/modify)
00122         std::vector<CigarOp> CigarData; // CIGAR operations for this alignment
00123         int32_t     MateRefID;          // ID number for reference sequence where alignment's mate was aligned
00124         int32_t     MatePosition;       // position (0-based) where alignment's mate starts
00125         int32_t     InsertSize;         // mate-pair insert size
00126         std::string Filename;           // name of BAM file which this alignment comes from
00127 
00129     // internal utility methods
00130     private:
00131         bool FindTag(const std::string& tag,
00132                      char*& pTagData,
00133                      const unsigned int& tagDataLength,
00134                      unsigned int& numBytesParsed) const;
00135         bool IsValidSize(const std::string& tag, const std::string& type) const;
00136         void SetErrorString(const std::string& where, const std::string& what) const;
00137         bool SkipToNextTag(const char storageType,
00138                            char*& pTagData,
00139                            unsigned int& numBytesParsed) const;
00140 
00141     // internal data
00142     private:
00143 
00144         struct BamAlignmentSupportData {
00145       
00146             // data members
00147             std::string AllCharData;
00148             uint32_t    BlockLength;
00149             uint32_t    NumCigarOperations;
00150             uint32_t    QueryNameLength;
00151             uint32_t    QuerySequenceLength;
00152             bool        HasCoreOnly;
00153             
00154             // constructor
00155             BamAlignmentSupportData(void)
00156                 : BlockLength(0)
00157                 , NumCigarOperations(0)
00158                 , QueryNameLength(0)
00159                 , QuerySequenceLength(0)
00160                 , HasCoreOnly(false)
00161             { }
00162         };
00163         BamAlignmentSupportData SupportData;
00164         friend class Internal::BamReaderPrivate;
00165         friend class Internal::BamWriterPrivate;
00166 
00167         mutable std::string ErrorString; // mutable to allow updates even in logically const methods
00169 };
00170 
00171 // ---------------------------------------------------------
00172 // BamAlignment tag access methods
00173 
00185 template<typename T>
00186 inline bool BamAlignment::AddTag(const std::string& tag, const std::string& type, const T& value) {
00187 
00188     // if char data not populated, do that first
00189     if ( SupportData.HasCoreOnly )
00190         BuildCharData();
00191 
00192     // check tag/type size
00193     if ( !IsValidSize(tag, type) ) {
00194         // TODO: set error string?
00195         return false;
00196     }
00197 
00198     // check that storage type code is OK for T
00199     if ( !TagTypeHelper<T>::CanConvertTo(type.at(0)) ) {
00200         // TODO: set error string?
00201         return false;
00202     }
00203 
00204     // localize the tag data
00205     char* pTagData = (char*)TagData.data();
00206     const unsigned int tagDataLength = TagData.size();
00207     unsigned int numBytesParsed = 0;
00208 
00209     // if tag already exists, return false
00210     // use EditTag explicitly instead
00211     if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
00212         // TODO: set error string?
00213         return false;
00214     }
00215 
00216     // otherwise, convert value to string
00217     union { T value; char valueBuffer[sizeof(T)]; } un;
00218     un.value = value;
00219 
00220     // copy original tag data to temp buffer
00221     const std::string newTag = tag + type;
00222     const size_t newTagDataLength = tagDataLength + newTag.size() + sizeof(T); // leave room for new T
00223     RaiiBuffer originalTagData(newTagDataLength);
00224     memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1);    // '+1' for TagData null-term
00225 
00226     // append newTag
00227     strcat(originalTagData.Buffer + tagDataLength, newTag.data());
00228     memcpy(originalTagData.Buffer + tagDataLength + newTag.size(), un.valueBuffer, sizeof(T));
00229 
00230     // store temp buffer back in TagData
00231     const char* newTagData = (const char*)originalTagData.Buffer;
00232     TagData.assign(newTagData, newTagDataLength);
00233     return true;
00234 }
00235 
00236 template<>
00237 inline bool BamAlignment::AddTag<std::string>(const std::string& tag,
00238                                               const std::string& type,
00239                                               const std::string& value)
00240 {
00241     // if char data not populated, do that first
00242     if ( SupportData.HasCoreOnly )
00243         BuildCharData();
00244 
00245     // check tag/type size
00246     if ( !IsValidSize(tag, type) ) {
00247         // TODO: set error string?
00248         return false;
00249     }
00250 
00251     // check that storage type code is OK for string
00252     if ( !TagTypeHelper<std::string>::CanConvertTo(type.at(0)) ) {
00253         // TODO: set error string?
00254         return false;
00255     }
00256 
00257     // localize the tag data
00258     char* pTagData = (char*)TagData.data();
00259     const unsigned int tagDataLength = TagData.size();
00260     unsigned int numBytesParsed = 0;
00261 
00262     // if tag already exists, return false
00263     // use EditTag explicitly instead
00264     if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
00265         // TODO: set error string?
00266         return false;
00267     }
00268 
00269     // otherwise, copy tag data to temp buffer
00270     const std::string newTag = tag + type + value;
00271     const size_t newTagDataLength = tagDataLength + newTag.size() + 1; // leave room for null-term
00272     RaiiBuffer originalTagData(newTagDataLength);
00273     memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength + 1);    // '+1' for TagData null-term
00274 
00275     // append newTag (removes original null-term, then appends newTag + null-term)
00276     strcat(originalTagData.Buffer + tagDataLength, newTag.data());
00277 
00278     // store temp buffer back in TagData
00279     const char* newTagData = (const char*)originalTagData.Buffer;
00280     TagData.assign(newTagData, newTagDataLength);
00281     return true;
00282 }
00283 
00294 template<typename T>
00295 inline bool BamAlignment::AddTag(const std::string& tag, const std::vector<T>& values) {
00296 
00297     // if char data not populated, do that first
00298     if ( SupportData.HasCoreOnly )
00299         BuildCharData();
00300 
00301     // check for valid tag name length
00302     if ( tag.size() != Constants::BAM_TAG_TAGSIZE )
00303         return false;
00304 
00305     // localize the tag data
00306     char* pTagData = (char*)TagData.data();
00307     const unsigned int tagDataLength = TagData.size();
00308     unsigned int numBytesParsed = 0;
00309 
00310     // if tag already exists, return false
00311     // use EditTag explicitly instead
00312     if ( FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
00313         // TODO: set error string?
00314         return false;
00315     }
00316 
00317     // build new tag's base information
00318     char newTagBase[Constants::BAM_TAG_ARRAYBASE_SIZE];
00319     memcpy( newTagBase, tag.c_str(), Constants::BAM_TAG_TAGSIZE );
00320     newTagBase[2] = Constants::BAM_TAG_TYPE_ARRAY;
00321     newTagBase[3] = TagTypeHelper<T>::TypeCode();
00322 
00323     // add number of array elements to newTagBase
00324     const int32_t numElements  = values.size();
00325     memcpy(newTagBase + 4, &numElements, sizeof(int32_t));
00326 
00327     // copy current TagData string to temp buffer, leaving room for new tag's contents
00328     const size_t newTagDataLength = tagDataLength +
00329                                     Constants::BAM_TAG_ARRAYBASE_SIZE +
00330                                     numElements*sizeof(T);
00331     RaiiBuffer originalTagData(newTagDataLength);
00332     memcpy(originalTagData.Buffer, TagData.c_str(), tagDataLength+1); // '+1' for TagData's null-term
00333 
00334     // write newTagBase (removes old null term)
00335     strcat(originalTagData.Buffer + tagDataLength, (const char*)newTagBase);
00336 
00337     // add vector elements to tag
00338     int elementsBeginOffset = tagDataLength + Constants::BAM_TAG_ARRAYBASE_SIZE;
00339     for ( int i = 0 ; i < numElements; ++i ) {
00340         const T& value = values.at(i);
00341         memcpy(originalTagData.Buffer + elementsBeginOffset + i*sizeof(T), &value, sizeof(T));
00342     }
00343 
00344     // store temp buffer back in TagData
00345     const char* newTagData = (const char*)originalTagData.Buffer;
00346     TagData.assign(newTagData, newTagDataLength);
00347     return true;
00348 }
00349 
00364 template<typename T>
00365 inline bool BamAlignment::EditTag(const std::string& tag, const std::string& type, const T& value) {
00366 
00367     // if char data not populated, do that first
00368     if ( SupportData.HasCoreOnly )
00369         BuildCharData();
00370 
00371     // remove existing tag if present, then append tag with new value
00372     if ( HasTag(tag) )
00373         RemoveTag(tag);
00374     return AddTag(tag, type, value);
00375 }
00376 
00388 template<typename T>
00389 inline bool BamAlignment::EditTag(const std::string& tag, const std::vector<T>& values) {
00390 
00391     // if char data not populated, do that first
00392     if ( SupportData.HasCoreOnly )
00393         BuildCharData();
00394 
00395     // remove existing tag if present, then append tag with new values
00396     if ( HasTag(tag) )
00397         RemoveTag(tag);
00398     return AddTag(tag, values);
00399 }
00400 
00401 
00409 template<typename T>
00410 inline bool BamAlignment::GetTag(const std::string& tag, T& destination) const {
00411 
00412     // skip if alignment is core-only
00413     if ( SupportData.HasCoreOnly ) {
00414         // TODO: set error string?
00415         return false;
00416     }
00417 
00418     // skip if no tags present
00419     if ( TagData.empty() ) {
00420         // TODO: set error string?
00421         return false;
00422     }
00423 
00424     // localize the tag data
00425     char* pTagData = (char*)TagData.data();
00426     const unsigned int tagDataLength = TagData.size();
00427     unsigned int numBytesParsed = 0;
00428 
00429     // return failure if tag not found
00430     if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
00431         // TODO: set error string?
00432         return false;
00433     }
00434 
00435     // fetch data type
00436     const char type = *(pTagData - 1);
00437     if ( !TagTypeHelper<T>::CanConvertFrom(type) ) {
00438         // TODO: set error string ?
00439         return false;
00440     }
00441 
00442     // determine data length
00443     int destinationLength = 0;
00444     switch ( type ) {
00445 
00446         // 1 byte data
00447         case (Constants::BAM_TAG_TYPE_ASCII) :
00448         case (Constants::BAM_TAG_TYPE_INT8)  :
00449         case (Constants::BAM_TAG_TYPE_UINT8) :
00450             destinationLength = 1;
00451             break;
00452 
00453         // 2 byte data
00454         case (Constants::BAM_TAG_TYPE_INT16)  :
00455         case (Constants::BAM_TAG_TYPE_UINT16) :
00456             destinationLength = 2;
00457             break;
00458 
00459         // 4 byte data
00460         case (Constants::BAM_TAG_TYPE_INT32)  :
00461         case (Constants::BAM_TAG_TYPE_UINT32) :
00462         case (Constants::BAM_TAG_TYPE_FLOAT)  :
00463             destinationLength = 4;
00464             break;
00465 
00466         // var-length types not supported for numeric destination
00467         case (Constants::BAM_TAG_TYPE_STRING) :
00468         case (Constants::BAM_TAG_TYPE_HEX)    :
00469         case (Constants::BAM_TAG_TYPE_ARRAY)  :
00470             SetErrorString("BamAlignment::GetTag",
00471                            "cannot store variable length tag data into a numeric destination");
00472             return false;
00473 
00474         // unrecognized tag type
00475         default:
00476             const std::string message = std::string("invalid tag type: ") + type;
00477             SetErrorString("BamAlignment::GetTag", message);
00478             return false;
00479     }
00480 
00481     // store data in destination
00482     destination = 0;
00483     memcpy(&destination, pTagData, destinationLength);
00484 
00485     // return success
00486     return true;
00487 }
00488 
00489 template<>
00490 inline bool BamAlignment::GetTag<std::string>(const std::string& tag,
00491                                               std::string& destination) const
00492 {
00493     // skip if alignment is core-only
00494     if ( SupportData.HasCoreOnly ) {
00495         // TODO: set error string?
00496         return false;
00497     }
00498 
00499     // skip if no tags present
00500     if ( TagData.empty() ) {
00501         // TODO: set error string?
00502         return false;
00503     }
00504 
00505     // localize the tag data
00506     char* pTagData = (char*)TagData.data();
00507     const unsigned int tagDataLength = TagData.size();
00508     unsigned int numBytesParsed = 0;
00509 
00510     // return failure if tag not found
00511     if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
00512         // TODO: set error string?
00513         return false;
00514     }
00515 
00516     // otherwise copy data into destination
00517     const unsigned int dataLength = strlen(pTagData);
00518     destination.clear();
00519     destination.resize(dataLength);
00520     memcpy( (char*)destination.data(), pTagData, dataLength );
00521 
00522     // return success
00523     return true;
00524 }
00525 
00533 template<typename T>
00534 inline bool BamAlignment::GetTag(const std::string& tag, std::vector<T>& destination) const {
00535 
00536     // skip if alignment is core-only
00537     if ( SupportData.HasCoreOnly ) {
00538         // TODO: set error string?
00539         return false;
00540     }
00541 
00542     // skip if no tags present
00543     if ( TagData.empty() ) {
00544         // TODO: set error string?
00545         return false;
00546     }
00547 
00548     // localize the tag data
00549     char* pTagData = (char*)TagData.data();
00550     const unsigned int tagDataLength = TagData.size();
00551     unsigned int numBytesParsed = 0;
00552 
00553     // return false if tag not found
00554     if ( !FindTag(tag, pTagData, tagDataLength, numBytesParsed) ) {
00555         // TODO: set error string?
00556         return false;
00557     }
00558 
00559     // check that tag is array type
00560     const char tagType = *(pTagData - 1);
00561     if ( tagType != Constants::BAM_TAG_TYPE_ARRAY ) {
00562         SetErrorString("BamAlignment::GetTag", "cannot store a non-array tag in array destination");
00563         return false;
00564     }
00565 
00566     // fetch element type
00567     const char elementType = *pTagData;
00568     if ( !TagTypeHelper<T>::CanConvertFrom(elementType) ) {
00569         // TODO: set error string ?
00570         return false;
00571     }
00572     ++pTagData;
00573 
00574     // calculate length of each element in tag's array
00575     int elementLength = 0;
00576     switch ( elementType ) {
00577         case (Constants::BAM_TAG_TYPE_ASCII) :
00578         case (Constants::BAM_TAG_TYPE_INT8)  :
00579         case (Constants::BAM_TAG_TYPE_UINT8) :
00580             elementLength = sizeof(uint8_t);
00581             break;
00582 
00583         case (Constants::BAM_TAG_TYPE_INT16)  :
00584         case (Constants::BAM_TAG_TYPE_UINT16) :
00585             elementLength = sizeof(uint16_t);
00586             break;
00587 
00588         case (Constants::BAM_TAG_TYPE_INT32)  :
00589         case (Constants::BAM_TAG_TYPE_UINT32) :
00590         case (Constants::BAM_TAG_TYPE_FLOAT)  :
00591             elementLength = sizeof(uint32_t);
00592             break;
00593 
00594         // var-length types not supported for numeric destination
00595         case (Constants::BAM_TAG_TYPE_STRING) :
00596         case (Constants::BAM_TAG_TYPE_HEX)    :
00597         case (Constants::BAM_TAG_TYPE_ARRAY)  :
00598             SetErrorString("BamAlignment::GetTag",
00599                            "invalid array data, variable-length elements are not allowed");
00600             return false;
00601 
00602         // unknown tag type
00603         default:
00604             const std::string message = std::string("invalid array element type: ") + elementType;
00605             SetErrorString("BamAlignment::GetTag", message);
00606             return false;
00607     }
00608 
00609     // get number of elements
00610     int32_t numElements;
00611     memcpy(&numElements, pTagData, sizeof(int32_t));
00612     pTagData += 4;
00613     destination.clear();
00614     destination.reserve(numElements);
00615 
00616     // read in elements
00617     T value;
00618     for ( int i = 0 ; i < numElements; ++i ) {
00619         memcpy(&value, pTagData, sizeof(T));
00620         pTagData += sizeof(T);
00621         destination.push_back(value);
00622     }
00623 
00624     // return success
00625     return true;
00626 }
00627 
00628 typedef std::vector<BamAlignment> BamAlignmentVector;
00629 
00630 } // namespace BamTools
00631 
00632 #endif // BAMALIGNMENT_H
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Defines
Generated on Wed Aug 29 17:43:46 2012 for BamTools by  doxygen 1.6.3