libStatGen Software  1
CigarHelper.cpp
1 /*
2  * Copyright (C) 2011 Regents of the University of Michigan
3  *
4  * This program is free software: you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License as published by
6  * the Free Software Foundation, either version 3 of the License, or
7  * (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program. If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 //////////////////////////////////////////////////////////////////////////
19 
20 #include "CigarHelper.h"
21 
22 // Soft Clip from the beginning of the read to the specified reference position.
24  int32_t refPosition0Based,
25  CigarRoller& newCigar,
26  int32_t &new0BasedPosition)
27 {
28  newCigar.clear();
29  Cigar* cigar = record.getCigarInfo();
30  if(cigar == NULL)
31  {
32  // Failed to get the cigar.
33  ErrorHandler::handleError("Soft clipping, but failed to read the cigar");
34  return(NO_CLIP);
35  }
36 
37  // No cigar or position in the record, so return no clip.
38  if((cigar->size() == 0) || (record.get0BasedPosition() == -1))
39  {
40  return(NO_CLIP);
41  }
42 
43  // Check to see if the reference position occurs before the record starts,
44  // if it does, do no clipping.
45  if(refPosition0Based < record.get0BasedPosition())
46  {
47  // Not within this read, so nothing to clip.
48  newCigar.Set(record.getCigar());
49  return(NO_CLIP);
50  }
51 
52  // The position falls after the read starts, so loop through until the
53  // position or the end of the read is found.
54  int32_t readClipPosition = 0;
55  bool clipWritten = false;
56  new0BasedPosition = record.get0BasedPosition();
57  for(int i = 0; i < cigar->size(); i++)
58  {
59  const Cigar::CigarOperator* op = &(cigar->getOperator(i));
60 
61  if(clipWritten)
62  {
63  // Clip point has been found, so just add everything.
64  newCigar += *op;
65  // Go to the next operation.
66  continue;
67  }
68 
69  // The clip point has not yet been found, so check to see if we found
70  // it now.
71 
72  // Not a clip, check to see if the operation is found in the
73  // reference.
75  {
76  // match, mismatch, deletion, skip
77 
78  // increment the current reference position to just past this
79  // operation.
80  new0BasedPosition += op->count;
81 
82  // Check to see if this is also in the query, because otherwise
83  // the operation is still being consumed.
84  if(Cigar::foundInQuery(*op))
85  {
86  // Also in the query, determine if the entire thing should
87  // be clipped or just part of it.
88 
89  uint32_t numKeep = 0;
90  // Check to see if we have hit our clip position.
91  if(refPosition0Based < new0BasedPosition)
92  {
93  // The specified clip position is in this cigar operation.
94  numKeep = new0BasedPosition - refPosition0Based - 1;
95 
96  if(numKeep > op->count)
97  {
98  // Keep the entire read. This happens because
99  // we keep reading until the first match/mismatch
100  // after the clip.
101  numKeep = op->count;
102  }
103  }
104 
105  // Add the part of this operation that is being clipped
106  // to the clip count.
107  readClipPosition += (op->count - numKeep);
108 
109  // Only write the clip if we found a match/mismatch
110  // to write. Otherwise we will keep accumulating clips
111  // for the case of insertions.
112  if(numKeep > 0)
113  {
114  new0BasedPosition -= numKeep;
115 
116  newCigar.Add(Cigar::softClip, readClipPosition);
117 
118  // Add the clipped part of this cigar to the clip
119  // position.
120  newCigar.Add(op->operation, numKeep);
121 
122  // Found a match after the clip point, so stop
123  // consuming cigar operations.
124  clipWritten = true;
125  continue;
126  }
127  }
128  }
129  else
130  {
131  // Only add hard clips. The softclips will be added in
132  // when the total number is found.
133  if(op->operation == Cigar::hardClip)
134  {
135  // Check if this is the first operation, if so, just write it.
136  if(i == 0)
137  {
138  newCigar += *op;
139  }
140  // Check if it is the last operation (otherwise skip it).
141  else if(i == (cigar->size() - 1))
142  {
143  // Check whether or not the clip was ever written, and if
144  // not, write it.
145  if(clipWritten == false)
146  {
147  newCigar.Add(Cigar::softClip, readClipPosition);
148  // Since no match/mismatch was ever found, set
149  // the new ref position to the original one.
150  new0BasedPosition = record.get0BasedPosition();
151  clipWritten = true;
152  }
153  // Add the hard clip.
154  newCigar += *op;
155  }
156  }
157  // Not yet to the clip position, so do not add this operation.
158  if(Cigar::foundInQuery(*op))
159  {
160  // Found in the query, so update the read clip position.
161  readClipPosition += op->count;
162  }
163  }
164  } // End loop through cigar.
165 
166 
167  // Check whether or not the clip was ever written, and if
168  // not, write it.
169  if(clipWritten == false)
170  {
171  newCigar.Add(Cigar::softClip, readClipPosition);
172  // Since no match/mismatch was ever found, set
173  // the new ref position to the original one.
174  new0BasedPosition = record.get0BasedPosition();
175  }
176 
177  // Subtract 1 since readClipPosition atually contains the first 0based
178  // position that is not clipped.
179  return(readClipPosition - 1);
180 }
181 
182 
183 // Soft Clip from the end of the read at the specified reference position.
185  int32_t refPosition0Based,
186  CigarRoller& newCigar)
187 {
188  newCigar.clear();
189  Cigar* cigar = record.getCigarInfo();
190  if(cigar == NULL)
191  {
192  // Failed to get the cigar.
193  ErrorHandler::handleError("Soft clipping, but failed to read the cigar");
194  return(NO_CLIP);
195  }
196 
197  // No cigar or position in the record, so return no clip.
198  if((cigar->size() == 0) || (record.get0BasedPosition() == -1))
199  {
200  return(NO_CLIP);
201  }
202 
203  // Check to see if the reference position occurs after the record ends,
204  // if so, do no clipping.
205  if(refPosition0Based > record.get0BasedAlignmentEnd())
206  {
207  // Not within this read, so nothing to clip.
208  newCigar.Set(record.getCigar());
209  return(NO_CLIP);
210  }
211 
212  // The position falls before the read ends, so loop through until the
213  // position is found.
214  int32_t currentRefPosition = record.get0BasedPosition();
215  int32_t readClipPosition = 0;
216  for(int i = 0; i < cigar->size(); i++)
217  {
218  const Cigar::CigarOperator* op = &(cigar->getOperator(i));
219 
220  // If the operation is found in the reference, increase the
221  // reference position.
222  if(Cigar::foundInReference(*op))
223  {
224  // match, mismatch, deletion, skip
225  // increment the current reference position to just past
226  // this operation.
227  currentRefPosition += op->count;
228  }
229 
230  // Check to see if we have hit our clip position.
231  if(refPosition0Based < currentRefPosition)
232  {
233  // If this read is also in the query (match/mismatch),
234  // write the partial op to the new cigar.
235  int32_t numKeep = 0;
236  if(Cigar::foundInQuery(*op))
237  {
238  numKeep = op->count - (currentRefPosition - refPosition0Based);
239  if(numKeep > 0)
240  {
241  newCigar.Add(op->operation, numKeep);
242  readClipPosition += numKeep;
243  }
244  }
245  else if(Cigar::isClip(*op))
246  {
247  // This is a hard clip, so write it.
248  newCigar.Add(op->operation, op->count);
249  }
250  else
251  {
252 
253  // Not found in the query (skip/deletion),
254  // so don't write any of the operation.
255  }
256  // Found the clip point, so break.
257  break;
258  }
259  else if(refPosition0Based == currentRefPosition)
260  {
261  newCigar += *op;
262  if(Cigar::foundInQuery(*op))
263  {
264  readClipPosition += op->count;
265  }
266  }
267  else
268  {
269  // Not yet to the clip position, so add this operation/size to
270  // the new cigar.
271  newCigar += *op;
272  if(Cigar::foundInQuery(*op))
273  {
274  // Found in the query, so update the read clip position.
275  readClipPosition += op->count;
276  }
277  }
278  } // End loop through cigar.
279 
280  // Before adding the softclip, read from the end of the cigar checking to
281  // see if the operations are in the query, removing operations that are
282  // not (pad/delete/skip) until a hardclip or an operation in the query is
283  // found. We do not want a pad/delete/skip right before a softclip.
284  for(int j = newCigar.size() - 1; j >= 0; j--)
285  {
286  const Cigar::CigarOperator* op = &(newCigar.getOperator(j));
287  if(!Cigar::foundInQuery(*op) && !Cigar::isClip(*op))
288  {
289  // pad/delete/skip
290  newCigar.Remove(j);
291  }
292  else if(Cigar::foundInQuery(*op) & Cigar::isClip(*op))
293  {
294  // Soft clip, so increment the clip position for the return value.
295  // Remove the softclip since the readClipPosition is used to
296  // calculate teh size of the soft clip added.
297  readClipPosition -= op->count;
298  newCigar.Remove(j);
299  }
300  else
301  {
302  // Found a cigar operation that should not be deleted, so stop deleting.
303  break;
304  }
305  }
306 
307  // Determine the number of soft clips.
308  int32_t numSoftClips = record.getReadLength() - readClipPosition;
309  // NOTE that if the previous operation is a softclip, the CigarRoller logic
310  // will merge this with that one.
311  newCigar.Add(Cigar::softClip, numSoftClips);
312 
313  // Check if an ending hard clip needs to be added.
314  if(cigar->size() != 0)
315  {
316  const Cigar::CigarOperator* lastOp =
317  &(cigar->getOperator(cigar->size() - 1));
318  if(lastOp->operation == Cigar::hardClip)
319  {
320  newCigar += *lastOp;
321  }
322  }
323 
324  return(readClipPosition);
325 }
void Set(const char *cigarString)
Sets this object to the specified cigarString.
int32_t get0BasedAlignmentEnd()
Returns the 0-based inclusive rightmost position of the clipped sequence.
Definition: SamRecord.cpp:1455
static bool isClip(Operation op)
Return true if the specified operation is a clipping operation, false if not.
Definition: Cigar.h:261
void Add(Operation operation, int count)
Append the specified operation with the specified count to this object.
Definition: CigarRoller.cpp:77
static bool foundInReference(Operation op)
Return true if the specified operation is found in the reference sequence, false if not...
Definition: Cigar.h:177
static int32_t softClipEndByRefPos(SamRecord &record, int32_t refPosition0Based, CigarRoller &newCigar)
Soft clip the cigar from the back of the read at the specified reference position.
Cigar * getCigarInfo()
Returns a pointer to the Cigar object associated with this record.
Definition: SamRecord.cpp:1824
int size() const
Return the number of cigar operations.
Definition: Cigar.h:364
static int32_t softClipBeginByRefPos(SamRecord &record, int32_t refPosition0Based, CigarRoller &newCigar, int32_t &new0BasedPosition)
Soft clip the cigar from the beginning of the read at the specified reference position.
Definition: CigarHelper.cpp:23
Hard clip on the read (clipped sequence not present in the query sequence or reference). Associated with CIGAR Operation "H".
Definition: Cigar.h:95
int32_t getReadLength()
Get the length of the read.
Definition: SamRecord.cpp:1379
static bool foundInQuery(Operation op)
Return true if the specified operation is found in the query sequence, false if not.
Definition: Cigar.h:219
void clear()
Clear this object so that it has no Cigar Operations.
This class represents the CIGAR without any methods to set the cigar (see CigarRoller for that)...
Definition: Cigar.h:83
Soft clip on the read (clipped sequence present in the query sequence, but not in reference)...
Definition: Cigar.h:94
const CigarOperator & getOperator(int i) const
Return the Cigar Operation at the specified index (starting at 0).
Definition: Cigar.h:354
Class providing an easy to use interface to get/set/operate on the fields in a SAM/BAM record...
Definition: SamRecord.h:51
The purpose of this class is to provide accessors for setting, updating, modifying the CIGAR object...
Definition: CigarRoller.h:66
int32_t get0BasedPosition()
Get the 0-based(BAM) leftmost position of the record.
Definition: SamRecord.cpp:1307
static void handleError(const char *message, HandlingType handlingType=EXCEPTION)
Handle an error based on the error handling type.
bool Remove(int index)
Remove the operation at the specified index.
const char * getCigar()
Returns the SAM formatted CIGAR string.
Definition: SamRecord.cpp:1543