19 #include "SamFileHeader.h" 20 #include "SamRecord.h" 21 #include "BamInterface.h" 22 #include "SamInterface.h" 23 #include "BgzfFileType.h" 48 init(filename, mode, NULL);
58 init(filename, mode, NULL);
67 init(filename, mode, header);
78 init(filename, mode, header);
100 while (filename[lastchar] != 0) lastchar++;
103 if((lastchar >= 1) && (filename[0] ==
'-'))
107 if(strcmp(filename,
"-.bam") == 0)
123 ifread(myFilePtr, magic, 4);
125 else if(strcmp(filename,
"-.ubam") == 0)
135 #ifdef __ZLIB_AVAILABLE__ 136 BgzfFileType::setRequireEofBlock(
false);
144 ifread(myFilePtr, magic, 4);
146 else if((strcmp(filename,
"-") == 0) || (strcmp(filename,
"-.sam") == 0))
156 std::string errorMessage =
"Invalid SAM/BAM filename, ";
157 errorMessage += filename;
158 errorMessage +=
". From stdin, can only be '-', '-.sam', '-.bam', or '-.ubam'";
178 std::string errorMessage =
"Failed to Open ";
179 errorMessage += filename;
180 errorMessage +=
" for reading";
188 ifread(myFilePtr, magic, 4);
190 if (magic[0] ==
'B' && magic[1] ==
'A' && magic[2] ==
'M' &&
229 while (filename[lastchar] != 0) lastchar++;
231 filename[lastchar - 4] ==
'u' &&
232 filename[lastchar - 3] ==
'b' &&
233 filename[lastchar - 2] ==
'a' &&
234 filename[lastchar - 1] ==
'm')
238 if((lastchar == 6) && (filename[0] ==
'-') && (filename[1] ==
'.'))
247 else if (lastchar >= 3 &&
248 filename[lastchar - 3] ==
'b' &&
249 filename[lastchar - 2] ==
'a' &&
250 filename[lastchar - 1] ==
'm')
254 if((lastchar == 5) && (filename[0] ==
'-') && (filename[1] ==
'.'))
267 if((lastchar >= 1) && (filename[0] ==
'-'))
276 if (myFilePtr == NULL)
278 std::string errorMessage =
"Failed to Open ";
279 errorMessage += filename;
280 errorMessage +=
" for writing";
303 if(myBamIndex != NULL)
315 std::string errorMessage =
"Failed to read the bam Index file: ";
316 errorMessage += bamIndexFilename;
330 if(myFilePtr == NULL)
334 std::string errorMessage =
"Failed to read the bam Index file -" 335 " the BAM file needs to be read first in order to determine" 336 " the index filename.";
341 const char* bamBaseName = myFilePtr->
getFileName();
343 std::string indexName = bamBaseName;
346 bool foundFile =
true;
354 catch (std::exception&)
364 size_t startExt = indexName.find(
".bam");
365 if(startExt == std::string::npos)
372 indexName.erase(startExt, 4);
382 myRefPtr = reference;
389 myReadTranslation = translation;
396 myWriteTranslation = translation;
412 if (myFilePtr != NULL)
415 return(myFilePtr->
isOpen());
426 if (myFilePtr != NULL)
429 return(
ifeof(myFilePtr));
444 "Cannot read header since the file is not open for reading");
452 "Cannot read header since it has already been read.");
456 if(myInterfacePtr->readHeader(myFilePtr, header,
myStatus))
476 "Cannot write header since the file is not open for writing");
484 "Cannot write header since it has already been written");
488 if(myInterfacePtr->writeHeader(myFilePtr, header,
myStatus))
510 "Cannot read record since the file is not open for reading");
511 throw(std::runtime_error(
"SOFTWARE BUG: trying to read a SAM/BAM record prior to opening the file."));
520 "Cannot read record since the header has not been read.");
521 throw(std::runtime_error(
"SOFTWARE BUG: trying to read a SAM/BAM record prior to reading the header."));
529 if(!processNewSection(header))
550 if(!ensureIndexedReadPosition())
560 myInterfacePtr->readRecord(myFilePtr, header, record,
myStatus);
572 if(!checkRecordInSection(record))
580 uint16_t flag = record.
getFlag();
581 if((flag & myRequiredFlags) != myRequiredFlags)
587 if((flag & myExcludedFlags) != 0)
626 "Cannot write record since the file is not open for writing");
634 "Cannot write record since the header has not been written");
643 "Cannot write the record since the file is not properly sorted.");
654 myStatus = myInterfacePtr->writeRecord(myFilePtr, header, record,
671 mySortedType = sortType;
710 "Cannot set section since there is no bam file open");
715 myOverlapSection = overlap;
720 myChunksToRead.clear();
723 myCurrentChunkEnd = 0;
729 myPrevReadName.Clear();
746 "Cannot set section since there is no bam file open");
751 myOverlapSection = overlap;
754 if((strcmp(refName,
"") == 0) || (strcmp(refName,
"*") == 0))
766 myChunksToRead.clear();
769 myCurrentChunkEnd = 0;
775 myPrevReadName.Clear();
783 myRequiredFlags = requiredFlags;
784 myExcludedFlags = excludedFlags;
793 if(myBamIndex == NULL)
796 "Cannot get num mapped reads from the index until it has been read.");
808 if(myBamIndex == NULL)
811 "Cannot get num unmapped reads from the index until it has been read.");
824 if(myBamIndex == NULL)
827 "Cannot get num mapped reads from the index until it has been read.");
831 if((strcmp(refName,
"") != 0) && (strcmp(refName,
"*") != 0))
846 if(myBamIndex == NULL)
849 "Cannot get num unmapped reads from the index until it has been read.");
853 if((strcmp(refName,
"") != 0) && (strcmp(refName,
"*") != 0))
913 myInterfacePtr = NULL;
919 myAttemptRecovery =
false;
931 bool openStatus =
true;
946 std::cerr <<
"FAILURE - EXITING!!!" << std::endl;
956 if (myFilePtr != NULL)
962 if(myInterfacePtr != NULL)
964 delete myInterfacePtr;
965 myInterfacePtr = NULL;
972 myPrevReadName.Clear();
983 myNewSection =
false;
984 myOverlapSection =
true;
985 myCurrentChunkEnd = 0;
986 myChunksToRead.clear();
987 if(myBamIndex != NULL)
1008 if(myRefPtr != NULL)
1014 bool status =
false;
1023 if(mySortedType ==
FLAG)
1026 mySortedType = getSortOrderFromHeader(header);
1036 if((myPrevReadName.Compare(readName) > 0) &&
1037 (strcmp(myPrevReadName.c_str(), readName) > 0))
1040 String errorMessage =
"ERROR: File is not sorted by read name at record ";
1042 errorMessage +=
"\n\tPrevious record was ";
1043 errorMessage += myPrevReadName;
1044 errorMessage +=
", but this record is ";
1045 errorMessage += readName;
1047 errorMessage.c_str());
1052 myPrevReadName = readName;
1068 myPrevRefID = refID;
1075 String errorMessage =
"ERROR: File is not coordinate sorted at record ";
1077 errorMessage +=
"\n\tPrevious record was unmapped, but this record is ";
1080 errorMessage.c_str());
1083 else if(refID < myPrevRefID)
1087 String errorMessage =
"ERROR: File is not coordinate sorted at record ";
1089 errorMessage +=
"\n\tPrevious record was ";
1091 errorMessage +=
", but this record is ";
1094 errorMessage.c_str());
1100 if(refID > myPrevRefID)
1110 String errorMessage =
"ERROR: File is not coordinate sorted at record ";
1112 errorMessage +=
"\n\tPreviousRecord was ";
1114 errorMessage +=
", but this record is ";
1117 errorMessage.c_str());
1122 myPrevRefID = refID;
1141 if(strcmp(tag,
"queryname") == 0)
1145 else if(strcmp(tag,
"coordinate") == 0)
1149 return(headerSortOrder);
1153 bool SamFile::ensureIndexedReadPosition()
1165 uint64_t currentPos =
iftell(myFilePtr);
1166 if(currentPos >= myCurrentChunkEnd)
1169 if(myChunksToRead.empty())
1176 Chunk newChunk = myChunksToRead.pop();
1180 if(newChunk.chunk_beg != currentPos)
1183 if(
ifseek(myFilePtr, newChunk.chunk_beg, SEEK_SET) !=
true)
1187 "Failed to seek to the next record");
1192 myCurrentChunkEnd = newChunk.chunk_end;
1198 bool SamFile::checkRecordInSection(
SamRecord& record)
1200 bool recordFound =
true;
1235 recordFound =
false;
1238 if(!myOverlapSection)
1247 ((myEndPos != -1) &&
1252 recordFound =
false;
1256 return(recordFound);
1262 myNewSection =
false;
1265 if(myBamIndex == NULL)
1269 "Cannot read section since there is no index file open");
1270 throw(std::runtime_error(
"SOFTWARE BUG: trying to read a BAM record by section prior to opening the BAM Index file."));
1279 "Cannot read section since there is no bam file open");
1280 throw(std::runtime_error(
"SOFTWARE BUG: trying to read a BAM record by section without opening a BAM file."));
1288 "Cannot read record since the header has not been read.");
1289 throw(std::runtime_error(
"SOFTWARE BUG: trying to read a BAM record by section prior to opening the header."));
1299 myChunksToRead.clear();
1302 myCurrentChunkEnd = 0;
1306 if(!myRefName.empty())
1322 myChunksToRead) ==
true)
1328 String errorMsg =
"Failed to get the specified region, refID = ";
1329 errorMsg += myRefID;
1330 errorMsg +=
"; startPos = ";
1331 errorMsg += myStartPos;
1332 errorMsg +=
"; endPos = ";
1333 errorMsg += myEndPos;
1352 bool SamFile::attemptRecoverySync(
bool (*checkSignature)(
void *data) ,
int length)
1354 if(myFilePtr==NULL)
return false;
1356 return myFilePtr->attemptRecoverySync(checkSignature, length);
1395 :
SamFile(filename,
READ, errorHandlingType, header)
1400 SamFileReader::~SamFileReader()
1444 SamFileWriter::~SamFileWriter()
void SetWriteSequenceTranslation(SamRecord::SequenceTranslation translation)
Set the type of sequence translation to use when writing the sequence.
static const int32_t REF_ID_ALL
The number used to indicate that all reference ids should be used.
uint32_t GetNumOverlaps(SamRecord &samRecord)
Returns the number of bases in the passed in read that overlap the region that is currently set...
int32_t getNumUnMappedReads(int32_t refID)
Get the number of unmapped reads for this reference id.
int32_t get0BasedAlignmentEnd()
Returns the 0-based inclusive rightmost position of the clipped sequence.
SamStatistics * myStatistics
Pointer to the statistics for this file.
bool ReadHeader(SamFileHeader &header)
Reads the header section from the file and stores it in the passed in header.
SequenceTranslation
Enum containing the settings on how to translate the sequence if a reference is available.
void SetReadSequenceTranslation(SamRecord::SequenceTranslation translation)
Set the type of sequence translation to use when reading the sequence.
bool validateSortOrder(SamRecord &record, SamFileHeader &header)
Validate that the record is sorted compared to the previously read record if there is one...
const BamIndex * GetBamIndex()
Return the bam index if one has been opened.
void Close()
Close the file if there is one open.
static const int NO_REF_ID
Constant for the value returned if a reference id does not exist for a queried reference name...
void SetReference(GenomeSequence *reference)
Sets the reference to the specified genome sequence object.
method failed due to an I/O issue.
OpenType
Enum for indicating whether to open the file for read or write.
SamFile()
Default Constructor, initializes the variables, but does not open any files.
int32_t getNumMappedReadsFromIndex(int32_t refID)
Get the number of mapped reads in the specified reference id.
uint32_t getNumOverlaps(int32_t start, int32_t end)
Return the number of bases in this read that overlap the passed in region.
uint32_t GetCurrentRecordCount()
Return the number of records that have been read/written so far.
SamFileReader()
Default Constructor.
const char * getReadName()
Returns the SAM formatted Read Name (QNAME).
SamFileWriter()
Default Constructor.
int32_t getReferenceID()
Get the reference sequence id of the record (BAM format rid).
SamStatus::Status readIndex(const char *filename)
method completed successfully.
failed to parse a record/header - invalid format.
int32_t getNumMappedReads(int32_t refID)
Get the number of mapped reads for this reference id.
bool SetReadSection(int32_t refID)
Sets which reference id (index into the BAM list of reference information) of the BAM file should be ...
void setStatus(Status newStatus, const char *newMessage)
Set the status with the specified status enum and message.
void setSortedValidation(SortedType sortType)
Set the flag to validate that the file is sorted as it is read/written.
int32_t getNumUnMappedReadsFromIndex(int32_t refID)
Get the number of unmapped reads in the specified reference id.
virtual ~SamFile()
Destructor.
bool ReadBamIndex()
Read the bam index file using the BAM filename as a base.
NO_MORE_RECS: failed to read a record since there are no more to read either in the file or section i...
void setSequenceTranslation(SequenceTranslation translation)
Set the type of sequence translation to use when getting the sequence.
void setReference(GenomeSequence *reference)
Set the reference to the specified genome sequence object.
file is sorted by coordinate.
bool myIsOpenForWrite
Flag to indicate if a file is open for writing.
Allows the user to easily read/write a SAM/BAM file.
bool getChunksForRegion(int32_t refID, int32_t start, int32_t end, SortedChunkList &chunkList)
Get the list of chunks associated with this region.
bool IsOpen()
Returns whether or not the file has been opened successfully.
bool WriteHeader(SamFileHeader &header)
Writes the specified header into the file.
Leave the sequence as is.
SO flag from the header indicates the sort type.
HandlingType
This specifies how this class should respond to errors.
int32_t myPrevCoord
Previous values used for checking if the file is sorted.
SamStatus myStatus
The status of the last SamFile command.
bool WriteRecord(SamFileHeader &header, SamRecord &record)
Writes the specified record into the file.
uint32_t myRecordCount
Keep a count of the number of records that have been read/written so far.
const char * GetStatusMessage()
Get the Status Message of the last call that sets status.
void resetFile()
Resets the file prepping for a new file.
FAIL_ORDER: method failed because it was called out of order, like trying to read a file without open...
bool IsEOF()
Returns whether or not the end of the file has been reached.
file is sorted by queryname.
bool ReadRecord(SamFileHeader &header, SamRecord &record)
Reads the next record from the file & stores it in the passed in record.
Create/Access/Modify/Load Genome Sequences stored as binary mapped files.
bool OpenForWrite(const char *filename, SamFileHeader *header=NULL)
Open a sam/bam file for writing with the specified filename, determining SAM/BAM from the extension (...
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record...
void GenerateStatistics(bool genStats)
Whether or not statistics should be generated for this file.
bool OpenForRead(const char *filename, SamFileHeader *header=NULL)
Open a sam/bam file for reading with the specified filename, determing the type of file and SAM/BAM b...
SortedType
Enum for indicating the type of sort expected in the file.
uint16_t getFlag()
Get the flag (FLAG).
record is invalid due to it not being sorted.
bool myIsOpenForRead
Flag to indicate if a file is open for reading.
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
Status
Return value enum for StatGenFile methods.
bool myIsBamOpenForRead
Values for reading Sorted BAM files via the index.
void SetReadFlags(uint16_t requiredFlags, uint16_t excludedFlags)
Specify which reads should be returned by ReadRecord.
bool myHasHeader
Flag to indicate if a header has been read/written - required before being able to read/write a recor...
static const int32_t REF_ID_UNMAPPED
The number used for the reference id of unmapped reads.