// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.assistant.embedded.v1alpha1;

import "google/api/annotations.proto";
import "google/rpc/status.proto";

option go_package = "google.golang.org/genproto/googleapis/assistant/embedded/v1alpha1;embedded";
option java_multiple_files = true;
option java_outer_classname = "AssistantProto";
option java_package = "com.google.assistant.embedded.v1alpha1";

// Service that implements the Google Assistant API.
service EmbeddedAssistant {
  // Initiates or continues a conversation with the embedded assistant service.
  // Each call performs one round-trip, sending an audio request to the service
  // and receiving the audio response. Uses bidirectional streaming to receive
  // results, such as the `END_OF_UTTERANCE` event, while sending audio.
  //
  // A conversation is one or more `Converse` calls, each consisting of several
  // streamed requests and responses.
  // For example, the user says *Add to my shopping list* and the assistant
  // responds *What do you want to add?*. The sequence of streamed requests and
  // responses in the first call could be:
  //
  // * ConverseRequest.config
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseResponse.event_type.END_OF_UTTERANCE
  // * ConverseResponse.result.microphone_mode.DIALOG_FOLLOW_ON
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  //
  // The user then says *bagels* and the assistant responds
  // *OK, I've added bagels to your shopping list*. This is sent as another
  // call to the `Converse` method, again with streamed requests and
  // responses, such as:
  //
  // * ConverseRequest.config
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseRequest.audio_in
  // * ConverseResponse.event_type.END_OF_UTTERANCE
  // * ConverseResponse.result.microphone_mode.CLOSE_MICROPHONE
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  // * ConverseResponse.audio_out
  //
  // Although the precise order of responses is not guaranteed, sequential
  // ConverseResponse.audio_out messages will always contain sequential
  // portions of audio.
  rpc Converse(stream ConverseRequest) returns (stream ConverseResponse);
}
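
// Since `Converse` is a bidirectional stream, a client passes a request
// iterator to the generated stub and consumes a response iterator. The
// following is a minimal Python sketch, not a definitive client; it assumes
// the generated modules follow standard protoc naming
// (`embedded_assistant_pb2`, `embedded_assistant_pb2_grpc`) and that `creds`
// is a credentials object carrying the OAuth2 authorization the service
// requires:
//
//     import grpc
//     from google.assistant.embedded.v1alpha1 import (
//         embedded_assistant_pb2 as pb2,
//         embedded_assistant_pb2_grpc as pb2_grpc,
//     )
//
//     channel = grpc.secure_channel('embeddedassistant.googleapis.com:443',
//                                   creds)  # creds: assumed credentials
//     stub = pb2_grpc.EmbeddedAssistantStub(channel)
//     # requests() yields one config message, then audio_in chunks; see the
//     # sketch after ConverseRequest below.
//     for response in stub.Converse(requests()):
//         handle(response)  # hypothetical; see the ConverseResponse sketch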

// Specifies how to process the `ConverseRequest` messages.
message ConverseConfig {
  // *Required* Specifies how to process the subsequent incoming audio.
  AudioInConfig audio_in_config = 1;

  // *Required* Specifies how to format the audio that will be returned.
  AudioOutConfig audio_out_config = 2;

  // *Required* Represents the current dialog state.
  ConverseState converse_state = 3;
}

// Specifies how to process the `audio_in` data that will be provided in
// subsequent requests. For recommended settings, see the Google Assistant SDK
// [best practices](https://developers.google.com/assistant/best-practices).
message AudioInConfig {
  // Audio encoding of the data sent in the audio message.
  // Audio must be one-channel (mono). The only language supported is "en-US".
  enum Encoding {
    // Not specified. The request will fail with
    // [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    // This encoding includes no header, only the raw audio bytes.
    LINEAR16 = 1;

    // [`FLAC`](https://xiph.org/flac/documentation.html) (Free Lossless Audio
    // Codec) is the recommended encoding because it is lossless (so
    // recognition is not compromised) and requires only about half the
    // bandwidth of `LINEAR16`. This encoding includes the `FLAC` stream
    // header followed by audio data. It supports 16-bit and 24-bit samples;
    // however, not all fields in `STREAMINFO` are supported.
    FLAC = 2;
  }

  // *Required* Encoding of audio data sent in all `audio_in` messages.
  Encoding encoding = 1;

  // *Required* Sample rate (in Hertz) of the audio data sent in all `audio_in`
  // messages. Valid values are from 16000 to 24000; 16000 is optimal.
  // For best results, set the sampling rate of the audio source to 16000 Hz.
  // If that's not possible, use the native sample rate of the audio source
  // (instead of re-sampling).
  int32 sample_rate_hertz = 2;
}

// Specifies the desired format for the server to use when it returns
// `audio_out` messages.
message AudioOutConfig {
  // Audio encoding of the data returned in the audio message. All encodings
  // are raw audio bytes with no header, except as indicated below.
  enum Encoding {
    // Not specified. The request will fail with
    // [google.rpc.Code.INVALID_ARGUMENT][].
    ENCODING_UNSPECIFIED = 0;

    // Uncompressed 16-bit signed little-endian samples (Linear PCM).
    LINEAR16 = 1;

    // MP3 audio encoding. The sample rate is encoded in the payload.
    MP3 = 2;

    // Opus-encoded audio wrapped in an Ogg container. The result is a file
    // that can be played natively on Android and in some browsers (such as
    // Chrome). The quality of the encoding is considerably higher than MP3
    // at the same bitrate. The sample rate is encoded in the payload.
    OPUS_IN_OGG = 3;
  }

  // *Required* The encoding of audio data to be returned in all `audio_out`
  // messages.
  Encoding encoding = 1;

  // *Required* The sample rate in Hertz of the audio data returned in
  // `audio_out` messages. Valid values are 16000-24000.
  int32 sample_rate_hertz = 2;

  // *Required* Current volume setting of the device's audio output.
  // Valid values are 1 to 100 (corresponding to 1% to 100%).
  int32 volume_percentage = 3;
}

// Provides information about the current dialog state.
message ConverseState {
  // *Required* The `conversation_state` value returned in the prior
  // `ConverseResponse`. Omit (do not set the field) if there was no prior
  // `ConverseResponse`. If there was a prior `ConverseResponse`, do not omit
  // this field; doing so will end that conversation (and this new request will
  // start a new conversation).
  bytes conversation_state = 1;
}
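
// Putting the three required pieces together, a client might build the
// complete `ConverseConfig` like this (a sketch reusing the `pb2` alias from
// the example above; `saved_state` holds the `conversation_state` bytes kept
// from the previous response, and is empty for a new conversation):
//
//     config = pb2.ConverseConfig(
//         audio_in_config=pb2.AudioInConfig(
//             encoding=pb2.AudioInConfig.LINEAR16,
//             sample_rate_hertz=16000),
//         audio_out_config=pb2.AudioOutConfig(
//             encoding=pb2.AudioOutConfig.LINEAR16,
//             sample_rate_hertz=16000,
//             volume_percentage=50),  # example value; use the device's volume
//         converse_state=pb2.ConverseState(conversation_state=saved_state),
//     )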

// The audio containing the assistant's response to the query. Sequential
// chunks of audio data are received in sequential `ConverseResponse` messages.
message AudioOut {
  // *Output-only* The audio data containing the assistant's response to the
  // query. Sequential chunks of audio data are received in sequential
  // `ConverseResponse` messages.
  bytes audio_data = 1;
}

// The semantic result for the user's spoken query.
message ConverseResult {
  // Possible states of the microphone after a `Converse` RPC completes.
  enum MicrophoneMode {
    // No mode specified.
    MICROPHONE_MODE_UNSPECIFIED = 0;

    // The service is not expecting a follow-on question from the user.
    // The microphone should remain off until the user re-activates it.
    CLOSE_MICROPHONE = 1;

    // The service is expecting a follow-on question from the user. The
    // microphone should be re-opened when the `AudioOut` playback completes
    // (by starting a new `Converse` RPC call to send the new audio).
    DIALOG_FOLLOW_ON = 2;
  }

  // *Output-only* The recognized transcript of what the user said.
  string spoken_request_text = 1;

  // *Output-only* The text of the assistant's spoken response. This is only
  // returned for an IFTTT action.
  string spoken_response_text = 2;

  // *Output-only* State information for the subsequent `ConverseRequest`. This
  // value should be saved in the client and returned in the
  // `conversation_state` with the next `ConverseRequest`. (The client does not
  // need to interpret or otherwise use this value.) There is no need to save
  // this information across device restarts.
  bytes conversation_state = 3;

  // *Output-only* Specifies the mode of the microphone after this `Converse`
  // RPC is processed.
  MicrophoneMode microphone_mode = 4;

  // *Output-only* Updated volume level. The value will be 0 or omitted
  // (indicating no change) unless a voice command such as "Increase the volume"
  // or "Set volume level 4" was recognized, in which case the value will be
  // between 1 and 100 (corresponding to the new volume level of 1% to 100%).
  // Typically, a client should use this volume level when playing the
  // `audio_out` data, retain this value as the current volume level, and
  // supply it in the `AudioOutConfig` of the next `ConverseRequest`. (Some
  // clients may also implement other ways to allow the current volume level to
  // be changed, for example, by providing a knob that the user can turn.)
  int32 volume_percentage = 5;
}
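
// `microphone_mode` drives the client's turn-taking. A sketch of the outer
// loop, where `converse_once()` is a hypothetical wrapper around one
// `Converse` round-trip (as sketched after the service definition) that
// returns the final `ConverseResult`:
//
//     result = converse_once()
//     while result.microphone_mode == pb2.ConverseResult.DIALOG_FOLLOW_ON:
//         # Follow-on expected: re-open the microphone after playback and
//         # start the next round-trip immediately.
//         result = converse_once()
//     # Otherwise (CLOSE_MICROPHONE), wait for the user to re-activate.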

// The top-level message sent by the client. Clients must send at least two,
// and typically numerous, `ConverseRequest` messages. The first message must
// contain a `config` message and must not contain `audio_in` data. All
// subsequent messages must contain `audio_in` data and must not contain a
// `config` message.
message ConverseRequest {
  // Exactly one of these fields must be specified in each `ConverseRequest`.
  oneof converse_request {
    // The `config` message provides information to the recognizer that
    // specifies how to process the request.
    // The first `ConverseRequest` message must contain a `config` message.
    ConverseConfig config = 1;

    // The audio data to be recognized. Sequential chunks of audio data are
    // sent in sequential `ConverseRequest` messages. The first
    // `ConverseRequest` message must not contain `audio_in` data and all
    // subsequent `ConverseRequest` messages must contain `audio_in` data. The
    // audio bytes must be encoded as specified in `AudioInConfig`.
    // Audio must be sent in approximately real time (16000 samples per
    // second). An error will be returned if audio is sent significantly
    // faster or slower.
    bytes audio_in = 2;
  }
}
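
// In generator form, the required ordering (one `config` message first, then
// only `audio_in` chunks, paced at real time) looks like the sketch below;
// `record_audio_chunks()` is a hypothetical microphone source yielding raw
// LINEAR16 bytes at the configured sample rate:
//
//     def requests():
//         yield pb2.ConverseRequest(config=config)  # config built as above
//         for chunk in record_audio_chunks():       # hypothetical helper
//             yield pb2.ConverseRequest(audio_in=chunk)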

// The top-level message received by the client. A series of one or more
// `ConverseResponse` messages is streamed back to the client.
message ConverseResponse {
  // Indicates the type of event.
  enum EventType {
    // No event specified.
    EVENT_TYPE_UNSPECIFIED = 0;

    // This event indicates that the server has detected the end of the user's
    // speech utterance and expects no additional speech. Therefore, the server
    // will not process additional audio (although it may subsequently return
    // additional results). The client should stop sending additional audio
    // data, half-close the gRPC connection, and wait for any additional
    // results until the server closes the gRPC connection.
    END_OF_UTTERANCE = 1;
  }

  // Exactly one of these fields will be populated in each `ConverseResponse`.
  oneof converse_response {
    // *Output-only* If set, returns a [google.rpc.Status][google.rpc.Status]
    // message that specifies the error for the operation.
    // If an error occurs during processing, this message will be set and there
    // will be no further messages sent.
    google.rpc.Status error = 1;

    // *Output-only* Indicates the type of event.
    EventType event_type = 2;

    // *Output-only* The audio containing the assistant's response to the
    // query.
    AudioOut audio_out = 3;

    // *Output-only* The semantic result for the user's spoken query.
    ConverseResult result = 5;
  }
}
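
// A client's receive loop then dispatches on whichever field is populated. In
// this sketch (same assumptions as above), `play_audio` and `stop_recording`
// are hypothetical helpers; stopping the recorder ends the `requests()`
// generator, which half-closes the stream:
//
//     saved_state = b''
//     for resp in stub.Converse(requests()):
//         if resp.HasField('error'):
//             raise RuntimeError(resp.error.message)
//         if resp.event_type == pb2.ConverseResponse.END_OF_UTTERANCE:
//             stop_recording()  # stop sending audio; half-close follows
//         if resp.HasField('audio_out'):
//             play_audio(resp.audio_out.audio_data)  # chunks arrive in order
//         if resp.HasField('result'):
//             saved_state = resp.result.conversation_state
//             if resp.result.volume_percentage != 0:
//                 current_volume = resp.result.volume_percentage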