blob: c17f8aeb6fe0d74fe15ee1598c76eeb619fcfc23 [file] [log] [blame]
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto3";
package google.cloud.vision.v1;
import "google/api/annotations.proto";
import "google/cloud/vision/v1/geometry.proto";
import "google/cloud/vision/v1/text_annotation.proto";
import "google/cloud/vision/v1/web_detection.proto";
import "google/rpc/status.proto";
import "google/type/color.proto";
import "google/type/latlng.proto";
option cc_enable_arenas = true;
option go_package = "google.golang.org/genproto/googleapis/cloud/vision/v1;vision";
option java_multiple_files = true;
option java_outer_classname = "ImageAnnotatorProto";
option java_package = "com.google.cloud.vision.v1";
// Service that performs Google Cloud Vision API detection tasks over client
// images, such as face, landmark, logo, label, and text detection. The
// ImageAnnotator service returns detected entities from the images.
service ImageAnnotator {
// Run image detection and annotation for a batch of images.
rpc BatchAnnotateImages(BatchAnnotateImagesRequest) returns (BatchAnnotateImagesResponse) {
option (google.api.http) = { post: "/v1/images:annotate" body: "*" };
}
}
// Users describe the type of Google Cloud Vision API tasks to perform over
// images by using *Feature*s. Each Feature indicates a type of image
// detection task to perform. Features encode the Cloud Vision API
// vertical to operate on and the number of top-scoring results to return.
message Feature {
// Type of image feature.
enum Type {
// Unspecified feature type.
TYPE_UNSPECIFIED = 0;
// Run face detection.
FACE_DETECTION = 1;
// Run landmark detection.
LANDMARK_DETECTION = 2;
// Run logo detection.
LOGO_DETECTION = 3;
// Run label detection.
LABEL_DETECTION = 4;
// Run OCR.
TEXT_DETECTION = 5;
// Run dense text document OCR. Takes precedence when both
// DOCUMENT_TEXT_DETECTION and TEXT_DETECTION are present.
DOCUMENT_TEXT_DETECTION = 11;
// Run computer vision models to compute image safe-search properties.
SAFE_SEARCH_DETECTION = 6;
// Compute a set of image properties, such as the image's dominant colors.
IMAGE_PROPERTIES = 7;
// Run crop hints.
CROP_HINTS = 9;
// Run web detection.
WEB_DETECTION = 10;
}
// The feature type.
Type type = 1;
// Maximum number of results of this type.
int32 max_results = 2;
}
// External image source (Google Cloud Storage image location).
message ImageSource {
// NOTE: For new code `image_uri` below is preferred.
// Google Cloud Storage image URI, which must be in the following form:
// `gs://bucket_name/object_name` (for details, see
// [Google Cloud Storage Request
// URIs](https://cloud.google.com/storage/docs/reference-uris)).
// NOTE: Cloud Storage object versioning is not supported.
string gcs_image_uri = 1;
// Image URI which supports:
// 1) Google Cloud Storage image URI, which must be in the following form:
// `gs://bucket_name/object_name` (for details, see
// [Google Cloud Storage Request
// URIs](https://cloud.google.com/storage/docs/reference-uris)).
// NOTE: Cloud Storage object versioning is not supported.
// 2) Publicly accessible image HTTP/HTTPS URL.
// This is preferred over the legacy `gcs_image_uri` above. When both
// `gcs_image_uri` and `image_uri` are specified, `image_uri` takes
// precedence.
string image_uri = 2;
}
// Client image to perform Google Cloud Vision API tasks over.
message Image {
// Image content, represented as a stream of bytes.
// Note: as with all `bytes` fields, protobuffers use a pure binary
// representation, whereas JSON representations use base64.
bytes content = 1;
// Google Cloud Storage image location. If both `content` and `source`
// are provided for an image, `content` takes precedence and is
// used to perform the image annotation request.
ImageSource source = 2;
}
// A face annotation object contains the results of face detection.
message FaceAnnotation {
// A face-specific landmark (for example, a face feature).
// Landmark positions may fall outside the bounds of the image
// if the face is near one or more edges of the image.
// Therefore it is NOT guaranteed that `0 <= x < width` or
// `0 <= y < height`.
message Landmark {
// Face landmark (feature) type.
// Left and right are defined from the vantage of the viewer of the image
// without considering mirror projections typical of photos. So, `LEFT_EYE`,
// typically, is the person's right eye.
enum Type {
// Unknown face landmark detected. Should not be filled.
UNKNOWN_LANDMARK = 0;
// Left eye.
LEFT_EYE = 1;
// Right eye.
RIGHT_EYE = 2;
// Left of left eyebrow.
LEFT_OF_LEFT_EYEBROW = 3;
// Right of left eyebrow.
RIGHT_OF_LEFT_EYEBROW = 4;
// Left of right eyebrow.
LEFT_OF_RIGHT_EYEBROW = 5;
// Right of right eyebrow.
RIGHT_OF_RIGHT_EYEBROW = 6;
// Midpoint between eyes.
MIDPOINT_BETWEEN_EYES = 7;
// Nose tip.
NOSE_TIP = 8;
// Upper lip.
UPPER_LIP = 9;
// Lower lip.
LOWER_LIP = 10;
// Mouth left.
MOUTH_LEFT = 11;
// Mouth right.
MOUTH_RIGHT = 12;
// Mouth center.
MOUTH_CENTER = 13;
// Nose, bottom right.
NOSE_BOTTOM_RIGHT = 14;
// Nose, bottom left.
NOSE_BOTTOM_LEFT = 15;
// Nose, bottom center.
NOSE_BOTTOM_CENTER = 16;
// Left eye, top boundary.
LEFT_EYE_TOP_BOUNDARY = 17;
// Left eye, right corner.
LEFT_EYE_RIGHT_CORNER = 18;
// Left eye, bottom boundary.
LEFT_EYE_BOTTOM_BOUNDARY = 19;
// Left eye, left corner.
LEFT_EYE_LEFT_CORNER = 20;
// Right eye, top boundary.
RIGHT_EYE_TOP_BOUNDARY = 21;
// Right eye, right corner.
RIGHT_EYE_RIGHT_CORNER = 22;
// Right eye, bottom boundary.
RIGHT_EYE_BOTTOM_BOUNDARY = 23;
// Right eye, left corner.
RIGHT_EYE_LEFT_CORNER = 24;
// Left eyebrow, upper midpoint.
LEFT_EYEBROW_UPPER_MIDPOINT = 25;
// Right eyebrow, upper midpoint.
RIGHT_EYEBROW_UPPER_MIDPOINT = 26;
// Left ear tragion.
LEFT_EAR_TRAGION = 27;
// Right ear tragion.
RIGHT_EAR_TRAGION = 28;
// Left eye pupil.
LEFT_EYE_PUPIL = 29;
// Right eye pupil.
RIGHT_EYE_PUPIL = 30;
// Forehead glabella.
FOREHEAD_GLABELLA = 31;
// Chin gnathion.
CHIN_GNATHION = 32;
// Chin left gonion.
CHIN_LEFT_GONION = 33;
// Chin right gonion.
CHIN_RIGHT_GONION = 34;
}
// Face landmark type.
Type type = 3;
// Face landmark position.
Position position = 4;
}
// The bounding polygon around the face. The coordinates of the bounding box
// are in the original image's scale, as returned in `ImageParams`.
// The bounding box is computed to "frame" the face in accordance with human
// expectations. It is based on the landmarker results.
// Note that one or more x and/or y coordinates may not be generated in the
// `BoundingPoly` (the polygon will be unbounded) if only a partial face
// appears in the image to be annotated.
BoundingPoly bounding_poly = 1;
// The `fd_bounding_poly` bounding polygon is tighter than the
// `boundingPoly`, and encloses only the skin part of the face. Typically, it
// is used to eliminate the face from any image analysis that detects the
// "amount of skin" visible in an image. It is not based on the
// landmarker results, only on the initial face detection, hence
// the <code>fd</code> (face detection) prefix.
BoundingPoly fd_bounding_poly = 2;
// Detected face landmarks.
repeated Landmark landmarks = 3;
// Roll angle, which indicates the amount of clockwise/anti-clockwise rotation
// of the face relative to the image vertical about the axis perpendicular to
// the face. Range [-180,180].
float roll_angle = 4;
// Yaw angle, which indicates the leftward/rightward angle that the face is
// pointing relative to the vertical plane perpendicular to the image. Range
// [-180,180].
float pan_angle = 5;
// Pitch angle, which indicates the upwards/downwards angle that the face is
// pointing relative to the image's horizontal plane. Range [-180,180].
float tilt_angle = 6;
// Detection confidence. Range [0, 1].
float detection_confidence = 7;
// Face landmarking confidence. Range [0, 1].
float landmarking_confidence = 8;
// Joy likelihood.
Likelihood joy_likelihood = 9;
// Sorrow likelihood.
Likelihood sorrow_likelihood = 10;
// Anger likelihood.
Likelihood anger_likelihood = 11;
// Surprise likelihood.
Likelihood surprise_likelihood = 12;
// Under-exposed likelihood.
Likelihood under_exposed_likelihood = 13;
// Blurred likelihood.
Likelihood blurred_likelihood = 14;
// Headwear likelihood.
Likelihood headwear_likelihood = 15;
}
// Detected entity location information.
message LocationInfo {
// lat/long location coordinates.
google.type.LatLng lat_lng = 1;
}
// A `Property` consists of a user-supplied name/value pair.
message Property {
// Name of the property.
string name = 1;
// Value of the property.
string value = 2;
}
// Set of detected entity features.
message EntityAnnotation {
// Opaque entity ID. Some IDs may be available in
// [Google Knowledge Graph Search API](https://developers.google.com/knowledge-graph/).
string mid = 1;
// The language code for the locale in which the entity textual
// `description` is expressed.
string locale = 2;
// Entity textual description, expressed in its `locale` language.
string description = 3;
// Overall score of the result. Range [0, 1].
float score = 4;
// The accuracy of the entity detection in an image.
// For example, for an image in which the "Eiffel Tower" entity is detected,
// this field represents the confidence that there is a tower in the query
// image. Range [0, 1].
float confidence = 5;
// The relevancy of the ICA (Image Content Annotation) label to the
// image. For example, the relevancy of "tower" is likely higher to an image
// containing the detected "Eiffel Tower" than to an image containing a
// detected distant towering building, even though the confidence that
// there is a tower in each image may be the same. Range [0, 1].
float topicality = 6;
// Image region to which this entity belongs. Currently not produced
// for `LABEL_DETECTION` features. For `TEXT_DETECTION` (OCR), `boundingPoly`s
// are produced for the entire text detected in an image region, followed by
// `boundingPoly`s for each word within the detected text.
BoundingPoly bounding_poly = 7;
// The location information for the detected entity. Multiple
// `LocationInfo` elements can be present because one location may
// indicate the location of the scene in the image, and another location
// may indicate the location of the place where the image was taken.
// Location information is usually present for landmarks.
repeated LocationInfo locations = 8;
// Some entities may have optional user-supplied `Property` (name/value)
// fields, such a score or string that qualifies the entity.
repeated Property properties = 9;
}
// Set of features pertaining to the image, computed by computer vision
// methods over safe-search verticals (for example, adult, spoof, medical,
// violence).
message SafeSearchAnnotation {
// Represents the adult content likelihood for the image.
Likelihood adult = 1;
// Spoof likelihood. The likelihood that an modification
// was made to the image's canonical version to make it appear
// funny or offensive.
Likelihood spoof = 2;
// Likelihood that this is a medical image.
Likelihood medical = 3;
// Violence likelihood.
Likelihood violence = 4;
}
// Rectangle determined by min and max `LatLng` pairs.
message LatLongRect {
// Min lat/long pair.
google.type.LatLng min_lat_lng = 1;
// Max lat/long pair.
google.type.LatLng max_lat_lng = 2;
}
// Color information consists of RGB channels, score, and the fraction of
// the image that the color occupies in the image.
message ColorInfo {
// RGB components of the color.
google.type.Color color = 1;
// Image-specific score for this color. Value in range [0, 1].
float score = 2;
// The fraction of pixels the color occupies in the image.
// Value in range [0, 1].
float pixel_fraction = 3;
}
// Set of dominant colors and their corresponding scores.
message DominantColorsAnnotation {
// RGB color values with their score and pixel fraction.
repeated ColorInfo colors = 1;
}
// Stores image properties, such as dominant colors.
message ImageProperties {
// If present, dominant colors completed successfully.
DominantColorsAnnotation dominant_colors = 1;
}
// Single crop hint that is used to generate a new crop when serving an image.
message CropHint {
// The bounding polygon for the crop region. The coordinates of the bounding
// box are in the original image's scale, as returned in `ImageParams`.
BoundingPoly bounding_poly = 1;
// Confidence of this being a salient region. Range [0, 1].
float confidence = 2;
// Fraction of importance of this salient region with respect to the original
// image.
float importance_fraction = 3;
}
// Set of crop hints that are used to generate new crops when serving images.
message CropHintsAnnotation {
repeated CropHint crop_hints = 1;
}
// Parameters for crop hints annotation request.
message CropHintsParams {
// Aspect ratios in floats, representing the ratio of the width to the height
// of the image. For example, if the desired aspect ratio is 4/3, the
// corresponding float value should be 1.33333. If not specified, the
// best possible crop is returned. The number of provided aspect ratios is
// limited to a maximum of 16; any aspect ratios provided after the 16th are
// ignored.
repeated float aspect_ratios = 1;
}
// Image context and/or feature-specific parameters.
message ImageContext {
// lat/long rectangle that specifies the location of the image.
LatLongRect lat_long_rect = 1;
// List of languages to use for TEXT_DETECTION. In most cases, an empty value
// yields the best results since it enables automatic language detection. For
// languages based on the Latin alphabet, setting `language_hints` is not
// needed. In rare cases, when the language of the text in the image is known,
// setting a hint will help get better results (although it will be a
// significant hindrance if the hint is wrong). Text detection returns an
// error if one or more of the specified languages is not one of the
// [supported languages](/vision/docs/languages).
repeated string language_hints = 2;
// Parameters for crop hints annotation request.
CropHintsParams crop_hints_params = 4;
}
// Request for performing Google Cloud Vision API tasks over a user-provided
// image, with user-requested features.
message AnnotateImageRequest {
// The image to be processed.
Image image = 1;
// Requested features.
repeated Feature features = 2;
// Additional context that may accompany the image.
ImageContext image_context = 3;
}
// Response to an image annotation request.
message AnnotateImageResponse {
// If present, face detection has completed successfully.
repeated FaceAnnotation face_annotations = 1;
// If present, landmark detection has completed successfully.
repeated EntityAnnotation landmark_annotations = 2;
// If present, logo detection has completed successfully.
repeated EntityAnnotation logo_annotations = 3;
// If present, label detection has completed successfully.
repeated EntityAnnotation label_annotations = 4;
// If present, text (OCR) detection or document (OCR) text detection has
// completed successfully.
repeated EntityAnnotation text_annotations = 5;
// If present, text (OCR) detection or document (OCR) text detection has
// completed successfully.
// This annotation provides the structural hierarchy for the OCR detected
// text.
TextAnnotation full_text_annotation = 12;
// If present, safe-search annotation has completed successfully.
SafeSearchAnnotation safe_search_annotation = 6;
// If present, image properties were extracted successfully.
ImageProperties image_properties_annotation = 8;
// If present, crop hints have completed successfully.
CropHintsAnnotation crop_hints_annotation = 11;
// If present, web detection has completed successfully.
WebDetection web_detection = 13;
// If set, represents the error message for the operation.
// Note that filled-in image annotations are guaranteed to be
// correct, even when `error` is set.
google.rpc.Status error = 9;
}
// Multiple image annotation requests are batched into a single service call.
message BatchAnnotateImagesRequest {
// Individual image annotation requests for this batch.
repeated AnnotateImageRequest requests = 1;
}
// Response to a batch image annotation request.
message BatchAnnotateImagesResponse {
// Individual responses to image annotation requests within the batch.
repeated AnnotateImageResponse responses = 1;
}
// A bucketized representation of likelihood, which is intended to give clients
// highly stable results across model upgrades.
enum Likelihood {
// Unknown likelihood.
UNKNOWN = 0;
// It is very unlikely that the image belongs to the specified vertical.
VERY_UNLIKELY = 1;
// It is unlikely that the image belongs to the specified vertical.
UNLIKELY = 2;
// It is possible that the image belongs to the specified vertical.
POSSIBLE = 3;
// It is likely that the image belongs to the specified vertical.
LIKELY = 4;
// It is very likely that the image belongs to the specified vertical.
VERY_LIKELY = 5;
}