 // Source: third_party/googleapis/google/genomics/v1/readalignment.proto
 // (bazel repository, Git at Google)

 // Copyright 2016 Google Inc.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //     http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 syntax = "proto3";

 package google.genomics.v1;

 import "google/api/annotations.proto";
 import "google/genomics/v1/cigar.proto";
 import "google/genomics/v1/position.proto";
 import "google/protobuf/struct.proto";

 option cc_enable_arenas = true;
 option go_package = "google.golang.org/genproto/googleapis/genomics/v1;genomics";
 option java_multiple_files = true;
 option java_outer_classname = "ReadAlignmentProto";
 option java_package = "com.google.genomics.v1";


 // A linear alignment can be represented by one CIGAR string. Describes the
 // mapped position and local alignment of the read to the reference.
 message LinearAlignment {
   // The position of this alignment. `position.position` is always the
   // left-most reference coordinate of the alignment, and
   // `position.reverseStrand` encodes which strand the read is aligned to.
   Position position = 1;

   // The mapping quality of this alignment. Represents how likely
   // the read maps to this position as opposed to other locations.
   //
   // Specifically, this is -10 log10 Pr(mapping position is wrong), rounded to
   // the nearest integer. For example, a value of 30 corresponds to a
   // probability of 0.001 that the mapping position is wrong; larger values
   // indicate a more confident mapping.
   int32 mapping_quality = 2;

   // Represents the local alignment of this sequence (alignment matches, indels,
   // etc.) against the reference. The units describe the alignment from left to
   // right along the reference.
   repeated CigarUnit cigar = 3;
 }

 // A read alignment describes a linear alignment of a string of DNA to a
 // [reference sequence][google.genomics.v1.Reference], in addition to metadata
 // about the fragment (the molecule of DNA sequenced) and the read (the bases
 // which were read by the sequencer). A read is equivalent to a line in a SAM
 // file. A read belongs to exactly one read group and exactly one
 // [read group set][google.genomics.v1.ReadGroupSet].
 //
 // For more genomics resource definitions, see [Fundamentals of Google
 // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
 //
 // ### Reverse-stranded reads
 //
 // Mapped reads (reads having a non-null `alignment`) can be aligned to either
 // the forward or the reverse strand of their associated reference. Strandedness
 // of a mapped read is encoded by `alignment.position.reverseStrand`.
 //
 // If we consider the reference to be a forward-stranded coordinate space of
 // `[0, reference.length)` with `0` as the left-most position and
 // `reference.length` as the right-most position, reads are always aligned left
 // to right. That is, `alignment.position.position` always refers to the
 // left-most reference coordinate and `alignment.cigar` describes the alignment
 // of this read to the reference from left to right. All per-base fields such as
 // `alignedSequence` and `alignedQuality` share this same left-to-right
 // orientation; this is true of reads which are aligned to either strand. For
 // reverse-stranded reads, this means that `alignedSequence` is the reverse
 // complement of the bases that were originally reported by the sequencing
 // machine.
 //
 // ### Generating a reference-aligned sequence string
 //
 // When interacting with mapped reads, it's often useful to produce a string
 // representing the local alignment of the read to reference. The following
 // pseudocode demonstrates one way of doing this:
 //
 //     out = ""
 //     offset = 0
 //     for c in read.alignment.cigar {
 //       switch c.operation {
 //       case "ALIGNMENT_MATCH", "SEQUENCE_MATCH", "SEQUENCE_MISMATCH":
 //         out += read.alignedSequence[offset:offset+c.operationLength]
 //         offset += c.operationLength
 //         break
 //       case "CLIP_SOFT", "INSERT":
 //         offset += c.operationLength
 //         break
 //       case "PAD":
 //         out += repeat("*", c.operationLength)
 //         break
 //       case "DELETE":
 //         out += repeat("-", c.operationLength)
 //         break
 //       case "SKIP":
 //         out += repeat(" ", c.operationLength)
 //         break
 //       case "CLIP_HARD":
 //         break
 //       }
 //     }
 //     return out
 //
 // ### Converting to SAM's CIGAR string
 //
 // The following pseudocode generates a SAM CIGAR string from the
 // `cigar` field. Note that this is a lossy conversion
 // (`cigar.referenceSequence` is lost).
 //
 //     cigarMap = {
 //       "ALIGNMENT_MATCH": "M",
 //       "INSERT": "I",
 //       "DELETE": "D",
 //       "SKIP": "N",
 //       "CLIP_SOFT": "S",
 //       "CLIP_HARD": "H",
 //       "PAD": "P",
 //       "SEQUENCE_MATCH": "=",
 //       "SEQUENCE_MISMATCH": "X",
 //     }
 //     cigarStr = ""
 //     for c in read.alignment.cigar {
 //       cigarStr += c.operationLength + cigarMap[c.operation]
 //     }
 //     return cigarStr
 message Read {
   // The server-generated read ID, unique across all reads. This is different
   // from the `fragmentName`.
   string id = 1;

   // The ID of the read group this read belongs to. A read belongs to exactly
   // one read group. This is a server-generated ID which is distinct from SAM's
   // RG tag (for that value, see
   // [ReadGroup.name][google.genomics.v1.ReadGroup.name]).
   string read_group_id = 2;

   // The ID of the read group set this read belongs to. A read belongs to
   // exactly one read group set.
   string read_group_set_id = 3;

   // The fragment name. Equivalent to QNAME (query template name) in SAM.
   string fragment_name = 4;

   // Whether the orientation and the distance between reads from the fragment
   // are consistent with the sequencing protocol (SAM flag 0x2).
   bool proper_placement = 5;

   // Whether the fragment is a PCR or optical duplicate (SAM flag 0x400).
   bool duplicate_fragment = 6;

   // The observed length of the fragment, equivalent to TLEN in SAM.
   int32 fragment_length = 7;

   // The read number in sequencing. 0-based and less than `numberReads`. This
   // field replaces SAM flags 0x40 and 0x80.
   int32 read_number = 8;

   // The number of reads in the fragment (extension to SAM flag 0x1).
   int32 number_reads = 9;

   // Whether this read did not pass filters, such as platform or vendor quality
   // controls (SAM flag 0x200).
   bool failed_vendor_quality_checks = 10;

   // The linear alignment for this alignment record. This field is null for
   // unmapped reads.
   LinearAlignment alignment = 11;

   // Whether this alignment is secondary. Equivalent to SAM flag 0x100.
   // A secondary alignment represents an alternative to the primary alignment
   // for this read. Aligners may return secondary alignments if a read can map
   // ambiguously to multiple coordinates in the genome. By convention, each read
   // has one and only one alignment where both `secondaryAlignment`
   // and `supplementaryAlignment` are false.
   bool secondary_alignment = 12;

   // Whether this alignment is supplementary. Equivalent to SAM flag 0x800.
   // Supplementary alignments are used in the representation of a chimeric
   // alignment. In a chimeric alignment, a read is split into multiple
   // linear alignments that map to different reference contigs. The first
   // linear alignment in the read will be designated as the representative
   // alignment; the remaining linear alignments will be designated as
   // supplementary alignments. These alignments may have different mapping
   // quality scores. In each linear alignment in a chimeric alignment, the read
   // will be hard clipped. The `alignedSequence` and `alignedQuality` fields in
   // the alignment record will only represent the bases for its respective
   // linear alignment.
   bool supplementary_alignment = 13;

   // The bases of the read sequence contained in this alignment record,
   // **without CIGAR operations applied** (equivalent to SEQ in SAM).
   // The bases are stored in left-to-right reference orientation, so for
   // reverse-stranded reads this is the reverse complement of the bases
   // originally reported by the sequencing machine.
   // `alignedSequence` and `alignedQuality` may be
   // shorter than the full read sequence and quality. This will occur if the
   // alignment is part of a chimeric alignment, or if the read was trimmed. When
   // this occurs, the CIGAR for this read will begin/end with a hard clip
   // operator that will indicate the length of the excised sequence.
   string aligned_sequence = 14;

   // The quality of the read sequence contained in this alignment record
   // (equivalent to QUAL in SAM). This is a per-base field that shares the
   // left-to-right orientation of `alignedSequence`.
   // `alignedSequence` and `alignedQuality` may be shorter than the full read
   // sequence and quality. This will occur if the alignment is part of a
   // chimeric alignment, or if the read was trimmed. When this occurs, the CIGAR
   // for this read will begin/end with a hard clip operator that will indicate
   // the length of the excised sequence.
   repeated int32 aligned_quality = 15;

   // The mapping of the primary alignment of the
   // `(readNumber+1)%numberReads` read in the fragment. It replaces
   // mate position and mate strand in SAM. For example, with two reads in the
   // fragment, read 0 refers to read 1 and read 1 refers back to read 0.
   Position next_mate_position = 16;

   // A map of additional read alignment information. This must be of the form
   // map<string, string[]> (string key mapping to a list of string values).
   // The declared value type is `google.protobuf.ListValue`, so the
   // string-only element constraint is not expressed by this message
   // definition itself.
   map<string, google.protobuf.ListValue> info = 17;
 }
	// Copyright 2016 Google Inc.
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	syntax = "proto3";

	package google.genomics.v1;

	import "google/api/annotations.proto";
	import "google/genomics/v1/cigar.proto";
	import "google/genomics/v1/position.proto";
	import "google/protobuf/struct.proto";

	option cc_enable_arenas = true;
	option go_package = "google.golang.org/genproto/googleapis/genomics/v1;genomics";
	option java_multiple_files = true;
	option java_outer_classname = "ReadAlignmentProto";
	option java_package = "com.google.genomics.v1";


	// A linear alignment can be represented by one CIGAR string. Describes the
	// mapped position and local alignment of the read to the reference.
	message LinearAlignment {
	// The position of this alignment. `position.position` is always the
	// left-most reference coordinate of the alignment, and
	// `position.reverseStrand` encodes which strand the read is aligned to.
	Position position = 1;

	// The mapping quality of this alignment. Represents how likely
	// the read maps to this position as opposed to other locations.
	//
	// Specifically, this is -10 log10 Pr(mapping position is wrong), rounded to
	// the nearest integer. For example, a value of 30 corresponds to a
	// probability of 0.001 that the mapping position is wrong; larger values
	// indicate a more confident mapping.
	int32 mapping_quality = 2;

	// Represents the local alignment of this sequence (alignment matches, indels,
	// etc.) against the reference. The units describe the alignment from left to
	// right along the reference.
	repeated CigarUnit cigar = 3;
	}

	// A read alignment describes a linear alignment of a string of DNA to a
	// [reference sequence][google.genomics.v1.Reference], in addition to metadata
	// about the fragment (the molecule of DNA sequenced) and the read (the bases
	// which were read by the sequencer). A read is equivalent to a line in a SAM
	// file. A read belongs to exactly one read group and exactly one
	// [read group set][google.genomics.v1.ReadGroupSet].
	//
	// For more genomics resource definitions, see [Fundamentals of Google
	// Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
	//
	// ### Reverse-stranded reads
	//
	// Mapped reads (reads having a non-null `alignment`) can be aligned to either
	// the forward or the reverse strand of their associated reference. Strandedness
	// of a mapped read is encoded by `alignment.position.reverseStrand`.
	//
	// If we consider the reference to be a forward-stranded coordinate space of
	// `[0, reference.length)` with `0` as the left-most position and
	// `reference.length` as the right-most position, reads are always aligned left
	// to right. That is, `alignment.position.position` always refers to the
	// left-most reference coordinate and `alignment.cigar` describes the alignment
	// of this read to the reference from left to right. All per-base fields such as
	// `alignedSequence` and `alignedQuality` share this same left-to-right
	// orientation; this is true of reads which are aligned to either strand. For
	// reverse-stranded reads, this means that `alignedSequence` is the reverse
	// complement of the bases that were originally reported by the sequencing
	// machine.
	//
	// ### Generating a reference-aligned sequence string
	//
	// When interacting with mapped reads, it's often useful to produce a string
	// representing the local alignment of the read to reference. The following
	// pseudocode demonstrates one way of doing this:
	//
	//     out = ""
	//     offset = 0
	//     for c in read.alignment.cigar {
	//       switch c.operation {
	//       case "ALIGNMENT_MATCH", "SEQUENCE_MATCH", "SEQUENCE_MISMATCH":
	//         out += read.alignedSequence[offset:offset+c.operationLength]
	//         offset += c.operationLength
	//         break
	//       case "CLIP_SOFT", "INSERT":
	//         offset += c.operationLength
	//         break
	//       case "PAD":
	//         out += repeat("*", c.operationLength)
	//         break
	//       case "DELETE":
	//         out += repeat("-", c.operationLength)
	//         break
	//       case "SKIP":
	//         out += repeat(" ", c.operationLength)
	//         break
	//       case "CLIP_HARD":
	//         break
	//       }
	//     }
	//     return out
	//
	// ### Converting to SAM's CIGAR string
	//
	// The following pseudocode generates a SAM CIGAR string from the
	// `cigar` field. Note that this is a lossy conversion
	// (`cigar.referenceSequence` is lost).
	//
	//     cigarMap = {
	//       "ALIGNMENT_MATCH": "M",
	//       "INSERT": "I",
	//       "DELETE": "D",
	//       "SKIP": "N",
	//       "CLIP_SOFT": "S",
	//       "CLIP_HARD": "H",
	//       "PAD": "P",
	//       "SEQUENCE_MATCH": "=",
	//       "SEQUENCE_MISMATCH": "X",
	//     }
	//     cigarStr = ""
	//     for c in read.alignment.cigar {
	//       cigarStr += c.operationLength + cigarMap[c.operation]
	//     }
	//     return cigarStr
	message Read {
	// The server-generated read ID, unique across all reads. This is different
	// from the `fragmentName`.
	string id = 1;

	// The ID of the read group this read belongs to. A read belongs to exactly
	// one read group. This is a server-generated ID which is distinct from SAM's
	// RG tag (for that value, see
	// [ReadGroup.name][google.genomics.v1.ReadGroup.name]).
	string read_group_id = 2;

	// The ID of the read group set this read belongs to. A read belongs to
	// exactly one read group set.
	string read_group_set_id = 3;

	// The fragment name. Equivalent to QNAME (query template name) in SAM.
	string fragment_name = 4;

	// Whether the orientation and the distance between reads from the fragment
	// are consistent with the sequencing protocol (SAM flag 0x2).
	bool proper_placement = 5;

	// Whether the fragment is a PCR or optical duplicate (SAM flag 0x400).
	bool duplicate_fragment = 6;

	// The observed length of the fragment, equivalent to TLEN in SAM.
	int32 fragment_length = 7;

	// The read number in sequencing. 0-based and less than `numberReads`. This
	// field replaces SAM flags 0x40 and 0x80.
	int32 read_number = 8;

	// The number of reads in the fragment (extension to SAM flag 0x1).
	int32 number_reads = 9;

	// Whether this read did not pass filters, such as platform or vendor quality
	// controls (SAM flag 0x200).
	bool failed_vendor_quality_checks = 10;

	// The linear alignment for this alignment record. This field is null for
	// unmapped reads.
	LinearAlignment alignment = 11;

	// Whether this alignment is secondary. Equivalent to SAM flag 0x100.
	// A secondary alignment represents an alternative to the primary alignment
	// for this read. Aligners may return secondary alignments if a read can map
	// ambiguously to multiple coordinates in the genome. By convention, each read
	// has one and only one alignment where both `secondaryAlignment`
	// and `supplementaryAlignment` are false.
	bool secondary_alignment = 12;

	// Whether this alignment is supplementary. Equivalent to SAM flag 0x800.
	// Supplementary alignments are used in the representation of a chimeric
	// alignment. In a chimeric alignment, a read is split into multiple
	// linear alignments that map to different reference contigs. The first
	// linear alignment in the read will be designated as the representative
	// alignment; the remaining linear alignments will be designated as
	// supplementary alignments. These alignments may have different mapping
	// quality scores. In each linear alignment in a chimeric alignment, the read
	// will be hard clipped. The `alignedSequence` and `alignedQuality` fields in
	// the alignment record will only represent the bases for its respective
	// linear alignment.
	bool supplementary_alignment = 13;

	// The bases of the read sequence contained in this alignment record,
	// **without CIGAR operations applied** (equivalent to SEQ in SAM).
	// The bases are stored in left-to-right reference orientation, so for
	// reverse-stranded reads this is the reverse complement of the bases
	// originally reported by the sequencing machine.
	// `alignedSequence` and `alignedQuality` may be
	// shorter than the full read sequence and quality. This will occur if the
	// alignment is part of a chimeric alignment, or if the read was trimmed. When
	// this occurs, the CIGAR for this read will begin/end with a hard clip
	// operator that will indicate the length of the excised sequence.
	string aligned_sequence = 14;

	// The quality of the read sequence contained in this alignment record
	// (equivalent to QUAL in SAM). This is a per-base field that shares the
	// left-to-right orientation of `alignedSequence`.
	// `alignedSequence` and `alignedQuality` may be shorter than the full read
	// sequence and quality. This will occur if the alignment is part of a
	// chimeric alignment, or if the read was trimmed. When this occurs, the CIGAR
	// for this read will begin/end with a hard clip operator that will indicate
	// the length of the excised sequence.
	repeated int32 aligned_quality = 15;

	// The mapping of the primary alignment of the
	// `(readNumber+1)%numberReads` read in the fragment. It replaces
	// mate position and mate strand in SAM. For example, with two reads in the
	// fragment, read 0 refers to read 1 and read 1 refers back to read 0.
	Position next_mate_position = 16;

	// A map of additional read alignment information. This must be of the form
	// map<string, string[]> (string key mapping to a list of string values).
	// The declared value type is `google.protobuf.ListValue`, so the
	// string-only element constraint is not expressed by this message
	// definition itself.
	map<string, google.protobuf.ListValue> info = 17;
	}