// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// This file is written against the proto3 language version.
syntax = "proto3";

// All types declared in this file belong to the Google Genomics v1 namespace.
package google.genomics.v1;

// NOTE(review): no option from annotations.proto is used in the visible part
// of this file; confirm nothing else depends on it before removing.
import "google/api/annotations.proto";
// Presumably declares CigarUnit, used by LinearAlignment.cigar.
import "google/genomics/v1/cigar.proto";
// Presumably declares Position, used by LinearAlignment and Read.
import "google/genomics/v1/position.proto";
// Declares google.protobuf.ListValue, used by Read.info.
import "google/protobuf/struct.proto";

// Use arena allocation in generated C++ code.
option cc_enable_arenas = true;
// Go import path, followed by the Go package name after the semicolon.
option go_package = "google.golang.org/genproto/googleapis/genomics/v1;genomics";
// Emit one Java source file per top-level message instead of nesting them
// all inside the outer class.
option java_multiple_files = true;
// Name of the Java outer class that holds the file descriptor.
option java_outer_classname = "ReadAlignmentProto";
// Java package for the generated classes.
option java_package = "com.google.genomics.v1";

// A linear alignment can be represented by one CIGAR string. Describes the
// mapped position and local alignment of the read to the reference.
message LinearAlignment {
  // The position of this alignment.
  Position position = 1;

  // The mapping quality of this alignment. Represents how likely
  // the read maps to this position as opposed to other locations.
  //
  // Specifically, this is `-10 * log10(Pr(mapping position is wrong))`,
  // rounded to the nearest integer. For example, a value of 20 corresponds to
  // a 1-in-100 probability that the mapping position is wrong.
  int32 mapping_quality = 2;

  // Represents the local alignment of this sequence (alignment matches, indels,
  // etc) against the reference.
  repeated CigarUnit cigar = 3;
}

// A read alignment describes a linear alignment of a string of DNA to a
// [reference sequence][google.genomics.v1.Reference], in addition to metadata
// about the fragment (the molecule of DNA sequenced) and the read (the bases
// which were read by the sequencer). A read is equivalent to a line in a SAM
// file. A read belongs to exactly one read group and exactly one
// [read group set][google.genomics.v1.ReadGroupSet].
//
// For more genomics resource definitions, see [Fundamentals of Google
// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics).
//
// ### Reverse-stranded reads
//
// Mapped reads (reads having a non-null `alignment`) can be aligned to either
// the forward or the reverse strand of their associated reference. Strandedness
// of a mapped read is encoded by `alignment.position.reverseStrand`.
//
// If we consider the reference to be a forward-stranded coordinate space of
// `[0, reference.length)` with `0` as the left-most position and
// `reference.length` as the right-most position, reads are always aligned left
// to right. That is, `alignment.position.position` always refers to the
// left-most reference coordinate and `alignment.cigar` describes the alignment
// of this read to the reference from left to right. All per-base fields such as
// `alignedSequence` and `alignedQuality` share this same left-to-right
// orientation; this is true of reads which are aligned to either strand. For
// reverse-stranded reads, this means that `alignedSequence` is the reverse
// complement of the bases that were originally reported by the sequencing
// machine.
//
// ### Generating a reference-aligned sequence string
//
// When interacting with mapped reads, it's often useful to produce a string
// representing the local alignment of the read to reference. The following
// pseudocode demonstrates one way of doing this:
//
//     out = ""
//     offset = 0
//     for c in read.alignment.cigar {
//       switch c.operation {
//       case "ALIGNMENT_MATCH", "SEQUENCE_MATCH", "SEQUENCE_MISMATCH":
//         out += read.alignedSequence[offset:offset+c.operationLength]
//         offset += c.operationLength
//         break
//       case "CLIP_SOFT", "INSERT":
//         offset += c.operationLength
//         break
//       case "PAD":
//         out += repeat("*", c.operationLength)
//         break
//       case "DELETE":
//         out += repeat("-", c.operationLength)
//         break
//       case "SKIP":
//         out += repeat(" ", c.operationLength)
//         break
//       case "CLIP_HARD":
//         break
//       }
//     }
//     return out
//
// ### Converting to SAM's CIGAR string
//
// The following pseudocode generates a SAM CIGAR string from the
// `cigar` field. Note that this is a lossy conversion
// (`cigar.referenceSequence` is lost).
//
//     cigarMap = {
//       "ALIGNMENT_MATCH": "M",
//       "INSERT": "I",
//       "DELETE": "D",
//       "SKIP": "N",
//       "CLIP_SOFT": "S",
//       "CLIP_HARD": "H",
//       "PAD": "P",
//       "SEQUENCE_MATCH": "=",
//       "SEQUENCE_MISMATCH": "X",
//     }
//     cigarStr = ""
//     for c in read.alignment.cigar {
//       cigarStr += c.operationLength + cigarMap[c.operation]
//     }
//     return cigarStr
message Read {
  // The server-generated read ID, unique across all reads. This is different
  // from the `fragmentName`.
  string id = 1;

  // The ID of the read group this read belongs to. A read belongs to exactly
  // one read group. This is a server-generated ID which is distinct from SAM's
  // RG tag (for that value, see
  // [ReadGroup.name][google.genomics.v1.ReadGroup.name]).
  string read_group_id = 2;

  // The ID of the read group set this read belongs to. A read belongs to
  // exactly one read group set.
  string read_group_set_id = 3;

  // The fragment name. Equivalent to QNAME (query template name) in SAM.
  string fragment_name = 4;

  // The orientation and the distance between reads from the fragment are
  // consistent with the sequencing protocol (SAM flag 0x2).
  bool proper_placement = 5;

  // The fragment is a PCR or optical duplicate (SAM flag 0x400).
  bool duplicate_fragment = 6;

  // The observed length of the fragment, equivalent to TLEN in SAM.
  int32 fragment_length = 7;

  // The read number in sequencing. 0-based and less than numberReads. This
  // field replaces SAM flag 0x40 and 0x80.
  int32 read_number = 8;

  // The number of reads in the fragment (extension to SAM flag 0x1).
  int32 number_reads = 9;

  // Whether this read did not pass filters, such as platform or vendor quality
  // controls (SAM flag 0x200).
  bool failed_vendor_quality_checks = 10;

  // The linear alignment for this alignment record. This field is null for
  // unmapped reads.
  LinearAlignment alignment = 11;

  // Whether this alignment is secondary. Equivalent to SAM flag 0x100.
  // A secondary alignment represents an alternative to the primary alignment
  // for this read. Aligners may return secondary alignments if a read can map
  // ambiguously to multiple coordinates in the genome. By convention, each read
  // has one and only one alignment where both `secondaryAlignment`
  // and `supplementaryAlignment` are false.
  bool secondary_alignment = 12;

  // Whether this alignment is supplementary. Equivalent to SAM flag 0x800.
  // Supplementary alignments are used in the representation of a chimeric
  // alignment. In a chimeric alignment, a read is split into multiple
  // linear alignments that map to different reference contigs. The first
  // linear alignment in the read will be designated as the representative
  // alignment; the remaining linear alignments will be designated as
  // supplementary alignments. These alignments may have different mapping
  // quality scores. In each linear alignment in a chimeric alignment, the read
  // will be hard clipped. The `alignedSequence` and
  // `alignedQuality` fields in the alignment record will only
  // represent the bases for its respective linear alignment.
  bool supplementary_alignment = 13;

  // The bases of the read sequence contained in this alignment record,
  // **without CIGAR operations applied** (equivalent to SEQ in SAM).
  // `alignedSequence` and `alignedQuality` may be
  // shorter than the full read sequence and quality. This will occur if the
  // alignment is part of a chimeric alignment, or if the read was trimmed. When
  // this occurs, the CIGAR for this read will begin/end with a hard clip
  // operator that will indicate the length of the excised sequence.
  string aligned_sequence = 14;

  // The quality of the read sequence contained in this alignment record
  // (equivalent to QUAL in SAM).
  // `alignedSequence` and `alignedQuality` may be shorter than the full read
  // sequence and quality. This will occur if the alignment is part of a
  // chimeric alignment, or if the read was trimmed. When this occurs, the CIGAR
  // for this read will begin/end with a hard clip operator that will indicate
  // the length of the excised sequence.
  repeated int32 aligned_quality = 15;

  // The mapping of the primary alignment of the
  // `(readNumber+1)%numberReads` read in the fragment. It replaces
  // mate position and mate strand in SAM.
  Position next_mate_position = 16;

  // A map of additional read alignment information. This must be of the form
  // map<string, string[]> (string key mapping to a list of string values).
  map<string, google.protobuf.ListValue> info = 17;
}