21 #include "FastQFile.h" 22 #include "BaseUtilities.h" 36 myMinReadLength(minReadLength),
37 myNumPrintableErrors(numPrintableErrors),
39 myDisableMessages(false),
49 myDisableMessages =
true;
55 myDisableMessages =
false;
78 myMaxErrors = maxErrors;
91 myQualPerCycle.clear();
92 myCountPerCycle.clear();
103 myFile =
ifopen(fileName,
"rt");
104 myFileName = fileName;
116 std::string errorMessage =
"ERROR: Failed to open file: ";
117 errorMessage += fileName;
118 logMessage(errorMessage.c_str());
144 std::string errorMessage =
"Failed to close file: ";
145 errorMessage += myFileName.c_str();
146 logMessage(errorMessage.c_str());
156 if((myFile != NULL) && (myFile->
isOpen()))
171 if((myFile != NULL) && (
ifeof(myFile)))
186 if(
isEof() || myFileProblem)
208 int numSequences = 0;
216 ((myMaxErrors == -1) || (myMaxErrors > myNumErrors)))
237 myBaseComposition.
print();
245 std::string finishMessage =
"Finished processing ";
246 finishMessage += myFileName.c_str();
249 " with %u lines containing %d sequences.",
250 myLineNum, numSequences) > 0)
252 finishMessage += buffer;
253 logMessage(finishMessage.c_str());
256 "There were a total of %d errors.",
271 else if(myNumErrors == 0)
276 if(numSequences == 0)
279 logMessage(
"ERROR: No FastQSequences in the file.");
304 std::string message =
305 "ERROR: Trying to read a fastq file but no file is open.";
306 logMessage(message.c_str());
311 resetForEachSequence();
322 valid = validateSequenceIdentifierLine();
335 if(mySequenceIdLine.Length() != 0)
339 myErrorString =
"Incomplete Sequence.\n";
364 valid &= validateRawSequenceAndPlusLines();
382 myErrorString =
"Incomplete Sequence, missing Quality String.";
389 valid &= validateQualityStringLines();
405 bool FastQFile::validateSequenceIdentifierLine()
408 int readStatus = mySequenceIdLine.ReadLine(myFile);
418 myFileProblem =
true;
419 myErrorString =
"Failure trying to read sequence identifier line";
426 if((mySequenceIdLine.Length() == 0) && (
ifeof(myFile)))
437 if(mySequenceIdLine.Length() < 2)
440 myErrorString =
"The sequence identifier line was too short.";
446 if(mySequenceIdLine[0] !=
'@')
449 myErrorString =
"First line of a sequence does not begin with @";
460 int endSequenceIdentifier = mySequenceIdLine.FastFindChar(
' ', 1);
463 if(endSequenceIdentifier == -1)
467 mySequenceIdentifier = (mySequenceIdLine.SubStr(1)).c_str();
474 if(endSequenceIdentifier <= 1)
477 "No Sequence Identifier specified before the comment.";
482 mySequenceIdentifier =
483 (mySequenceIdLine.SubStr(1, endSequenceIdentifier - 1)).c_str();
491 std::pair<std::map<std::string, unsigned int>::iterator,
bool> insertResult;
493 myIdentifierMap.insert(std::make_pair(mySequenceIdentifier.c_str(),
496 if(insertResult.second ==
false)
499 myErrorString =
"Repeated Sequence Identifier: ";
500 myErrorString += mySequenceIdentifier.c_str();
501 myErrorString +=
" at Lines ";
502 myErrorString += insertResult.first->second;
503 myErrorString +=
" and ";
504 myErrorString += myLineNum;
520 bool FastQFile::validateRawSequenceAndPlusLines()
523 int readStatus = myRawSequence.ReadLine(myFile);
529 myFileProblem =
true;
530 myErrorString =
"Failure trying to read sequence line";
539 bool valid = validateRawSequence(offset);
542 offset = myRawSequence.Length();
547 bool stillRawLine =
true;
548 while(stillRawLine &&
560 readStatus = myPlusLine.ReadLine(myFile);
565 myFileProblem =
true;
566 myErrorString =
"Failure trying to read sequence/plus line";
572 if(myPlusLine.Length() == 0)
577 "Looking for continuation of Raw Sequence or '+' instead found a blank line, assuming it was part of Raw Sequence.";
581 else if(myPlusLine[0] ==
'+')
584 valid &= validateSequencePlus();
585 stillRawLine =
false;
592 myRawSequence += myPlusLine;
593 myPlusLine.SetLength(0);
594 valid &= validateRawSequence(offset);
597 offset = myRawSequence.Length();
609 if(myRawSequence.Length() < myMinReadLength)
612 myErrorString =
"Raw Sequence is shorter than the min read length: ";
613 myErrorString += myRawSequence.Length();
614 myErrorString +=
" < ";
615 myErrorString += myMinReadLength;
630 myErrorString =
"Reached the end of the file without a '+' line.";
640 bool FastQFile::validateQualityStringLines()
643 int readStatus = myQualityString.ReadLine(myFile);
648 myFileProblem =
true;
649 myErrorString =
"Failure trying to read quality line";
658 bool valid = validateQualityString(offset);
660 offset = myQualityString.Length();
664 while((myQualityString.Length() < myRawSequence.Length()) &&
674 readStatus = myTempPartialQuality.ReadLine(myFile);
679 myFileProblem =
true;
680 myErrorString =
"Failure trying to read quality line";
685 myQualityString += myTempPartialQuality;
686 myTempPartialQuality.Clear();
689 valid &= validateQualityString(offset);
690 offset = myQualityString.Length();
701 if(myQualityString.Length() != myRawSequence.Length())
703 myErrorString =
"Quality string length (";
704 myErrorString += myQualityString.Length();
705 myErrorString +=
") does not equal raw sequence length (";
706 myErrorString += myRawSequence.Length();
707 myErrorString +=
")";
716 bool FastQFile::validateRawSequence(
int offset)
718 bool validBase =
false;
722 for(
int sequenceIndex = offset; sequenceIndex < myRawSequence.Length();
729 myRawSequence[sequenceIndex]);
734 myErrorString =
"Invalid character ('";
735 myErrorString += myRawSequence[sequenceIndex];
736 myErrorString +=
"') in base sequence.";
752 bool FastQFile::validateSequencePlus()
758 int lineLength = myPlusLine.Length();
763 if((lineLength == 1) || (myPlusLine[1] ==
' '))
775 int sequenceIdentifierLength = mySequenceIdentifier.Length();
776 if(lineLength <= sequenceIdentifierLength)
779 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
791 while((same ==
true) && (seqIndex < sequenceIdentifierLength))
793 if(myPlusLine[lineIndex] != mySequenceIdentifier[seqIndex])
796 "Sequence Identifier on '+' line does not equal the one on the '@' line.";
808 bool FastQFile::validateQualityString(
int offset)
811 if(myQualityString.Length() > (int)(myQualPerCycle.size()))
813 myQualPerCycle.resize(myQualityString.Length());
814 myCountPerCycle.resize(myQualityString.Length());
817 for(
int i=offset; i < myQualityString.Length(); i++)
819 if(myQualityString[i] <= 32)
821 myErrorString =
"Invalid character ('";
822 myErrorString += myQualityString[i];
823 myErrorString +=
"') in quality string.";
835 myCountPerCycle[i] += 1;
845 void FastQFile::reportErrorOnLine()
851 if(myNumErrors <= myNumPrintableErrors)
855 sprintf(buffer,
"ERROR on Line %u: ", myLineNum);
856 std::string message = buffer;
857 message += myErrorString.c_str();
858 logMessage(message.c_str());
864 void FastQFile::reset()
868 resetForEachSequence();
871 myFileName.SetLength(0);
872 myIdentifierMap.clear();
873 myBaseComposition.
clear();
874 myQualPerCycle.clear();
875 myCountPerCycle.clear();
876 myFileProblem =
false;
881 void FastQFile::resetForEachSequence()
883 myLineBuffer.SetLength(0);
884 myErrorString.SetLength(0);
885 myRawSequence.SetLength(0);
886 mySequenceIdLine.SetLength(0);
887 mySequenceIdentifier.SetLength(0);
888 myPlusLine.SetLength(0);
889 myQualityString.SetLength(0);
890 myTempPartialQuality.SetLength(0);
894 void FastQFile::logMessage(
const char* logMessage)
897 if(!myDisableMessages)
899 std::cout << logMessage << std::endl;
906 bool FastQFile::isTimeToQuit()
910 if((myMaxErrors != -1) && (myNumErrors >= myMaxErrors))
918 void FastQFile::printAvgQual()
920 std::cout << std::endl <<
"Average Phred Quality by Read Index (starts at 0):" << std::endl;
921 std::cout.precision(2);
922 std::cout << std::fixed <<
"Read Index\tAverage Quality" 924 if(myQualPerCycle.size() != myCountPerCycle.size())
927 std::cerr <<
"ERROR calculating the average Qualities per cycle\n";
933 for(
unsigned int i = 0; i < myQualPerCycle.size(); i++)
936 if(myCountPerCycle[i] != 0)
938 avgQual = myQualPerCycle[i] / (double)(myCountPerCycle[i]);
940 std::cout << i <<
"\t" << avgQual <<
"\n";
941 sumQual += myQualPerCycle[i];
942 count += myCountPerCycle[i];
944 std::cout << std::endl;
948 avgQual = sumQual / count;
950 std::cout <<
"Overall Average Phred Quality = " << avgQual << std::endl;
void disableMessages()
Disable messages - do not write to cout.
bool isOpen()
Check to see if the file is open.
FastQFile(int minReadLength=10, int numPrintableErrors=20)
Constructor.
void print()
Print the composition.
bool keepReadingFile()
Returns whether or not to keep reading the file, it stops reading (false) if eof or there is a proble...
means the file could not be closed.
bool isEof()
Check to see if the file is at the end of the file.
FastQStatus::Status readFastQSequence()
Read 1 FastQSequence, validating it.
void disableSeqIDCheck()
Disable Unique Sequence ID checking (Unique Sequence ID checking is enabled by default).
bool updateComposition(unsigned int rawSequenceCharIndex, char baseChar)
Update the composition for the specified index with the specified character.
void setBaseMapType(BaseAsciiMap::SPACE_TYPE spaceType)
Set the base map type for this composition.
FastQStatus::Status openFile(const char *fileName, BaseAsciiMap::SPACE_TYPE spaceType=BaseAsciiMap::UNKNOWN)
Open a FastQFile.
means the methods are called out of order, like trying to read a file before opening it...
means that a problem occurred on a read.
FastQStatus::Status validateFastQFile(const String &filename, bool printBaseComp, BaseAsciiMap::SPACE_TYPE spaceType, bool printQualAvg=false)
Validate the specified fastq file.
means the file could not be opened.
SPACE_TYPE
The type of space (color or base) to use in the mapping.
means that the sequence was invalid.
static uint8_t getPhredBaseQuality(char charQuality)
Get phred base quality from the specified ascii quality.
Status
Return value enum for the FastQFile class methods, indicating success or error codes.
FastQStatus::Status closeFile()
Close a FastQFile.
indicates method finished successfully.
void resetBaseMapType()
Reset the base map type for this composition.
void setMaxErrors(int maxErrors)
Set the number of errors after which to quit reading/validating a file, defaults to -1...
void enableSeqIDCheck()
Enable Unique Sequence ID checking.
means there were no errors, but no sequences read.
void enableMessages()
Enable messages - write to cout.
void clear()
Clear the composition stored in the base count vector.