diff --git a/cvat-ui/src/components/annotation-page/standard-workspace/controls-side-bar/tools-control.tsx b/cvat-ui/src/components/annotation-page/standard-workspace/controls-side-bar/tools-control.tsx
index 7d04b64ef2b8..d997fa3755b3 100644
--- a/cvat-ui/src/components/annotation-page/standard-workspace/controls-side-bar/tools-control.tsx
+++ b/cvat-ui/src/components/annotation-page/standard-workspace/controls-side-bar/tools-control.tsx
@@ -269,6 +269,7 @@ export class ToolsControlComponent extends React.PureComponent {
         });
         canvasInstance.html().addEventListener('canvas.interacted', this.interactionListener);
         canvasInstance.html().addEventListener('canvas.canceled', this.cancelListener);
+        canvasInstance.html().addEventListener('canvas.drawn', this.interactionListener);
     }

     public componentDidUpdate(prevProps: Props, prevState: State): void {
@@ -333,6 +334,7 @@ export class ToolsControlComponent extends React.PureComponent {
         onRemoveAnnotations(null);
         canvasInstance.html().removeEventListener('canvas.interacted', this.interactionListener);
         canvasInstance.html().removeEventListener('canvas.canceled', this.cancelListener);
+        canvasInstance.html().removeEventListener('canvas.drawn', this.interactionListener);
     }

     private contextmenuDisabler = (e: MouseEvent): void => {
@@ -503,8 +505,9 @@ export class ToolsControlComponent extends React.PureComponent {
         try {
             const { points } = (e as CustomEvent).detail.shapes[0];
+            // a flat point list of length 4 is a rectangle [x1, y1, x2, y2]; anything longer is a polygon
+            const shapeType = points.length === 4 ? ShapeType.RECTANGLE : ShapeType.POLYGON;
             const state = new core.classes.ObjectState({
-                shapeType: ShapeType.RECTANGLE,
+                shapeType,
                 objectType: ObjectType.TRACK,
                 source: core.enums.Source.SEMI_AUTO,
                 zOrder: curZOrder,
@@ -592,7 +595,7 @@ export class ToolsControlComponent extends React.PureComponent {
         const portals = !activeTracker ?
             [] :
             states
-                .filter((objectState) => objectState.objectType === 'track' && objectState.shapeType === 'rectangle')
+                .filter((objectState) => objectState.objectType === 'track' && objectState.shapeType === 'polygon')
                 .map((objectState: any): React.ReactPortal | null => {
                     const { clientID } = objectState;
                     const selectorID = `#cvat-objects-sidebar-state-item-${clientID}`;
@@ -821,7 +824,9 @@ export class ToolsControlComponent extends React.PureComponent {
                 job: jobInstance.id,
             }) as TrackerResults;

-            response.shapes = response.shapes.map(trackedRectangleMapper);
+            // only rectangle trackers need the [x1, y1, x2, y2] remapping; polygon trackers return points as-is
+            if (response.shapes[0].length === 4) {
+                response.shapes = response.shapes.map(trackedRectangleMapper);
+            }
             for (let i = 0; i < trackableObjects.clientIDs.length; i++) {
                 const clientID = trackableObjects.clientIDs[i];
                 const shape = response.shapes[i];
@@ -1046,7 +1051,32 @@ export class ToolsControlComponent extends React.PureComponent {
                         }}
                     >
-                        Track
+                        Track Rectangle
+
+
+
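The UI changes above hinge on one convention: trackers exchange flat point lists, so a length-4 list is a rectangle and anything longer is a polygon. A minimal sketch of that heuristic (plain Python, not CVAT code; classify_tracked_shape is a hypothetical helper):

    from typing import List

    def classify_tracked_shape(points: List[float]) -> str:
        """Mirrors the points.length === 4 heuristic used in tools-control.tsx."""
        if len(points) == 4:
            return 'rectangle'          # [x1, y1, x2, y2]
        if len(points) >= 6 and len(points) % 2 == 0:
            return 'polygon'            # [x1, y1, x2, y2, x3, y3, ...]
        raise ValueError(f'unexpected point list of length {len(points)}')

    assert classify_tracked_shape([10, 10, 50, 50]) == 'rectangle'
    assert classify_tracked_shape([10, 10, 50, 10, 30, 40]) == 'polygon'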
diff --git a/serverless/pytorch/redefine/cutie/nuclio/config/eval_config.yaml b/serverless/pytorch/redefine/cutie/nuclio/config/eval_config.yaml
new file mode 100644
index 000000000000..c43cfceaa9ad
--- /dev/null
+++ b/serverless/pytorch/redefine/cutie/nuclio/config/eval_config.yaml
@@ -0,0 +1,147 @@
+defaults:
+  - _self_
+  - model: base
+  - override hydra/job_logging: custom-no-rank.yaml
+
+hydra:
+  run:
+    dir: ../output/${exp_id}/${dataset}
+  output_subdir: ${now:%Y-%m-%d_%H-%M-%S}-hydra
+
+exp_id: default
+dataset: d17-val
+amp: False
+weights: /opt/nuclio/cutie/cutie-base-mega.pth
+output_dir: null # defaults to run_dir; specify this to override
+flip_aug: False
+
+# maximum shortest side of the input; -1 means no resizing
+# With eval_vos.py, we usually just use the dataset's size (resizing is done in the dataloader)
+# this parameter exists solely for the GUI in the current codebase
+# InferenceCore will downsize the input and restore the output to the original size if needed
+# if you are using this code for some other project, you can also utilize this parameter
+max_internal_size: -1
+
+# these parameters, when set, override the dataset's defaults; useful for debugging
+image_directory: null
+mask_directory: null
+json_directory: null
+size: null
+save_all: null
+use_all_masks: null
+use_long_term: null
+mem_every: null
+
+# only relevant when long_term is not enabled
+max_mem_frames: 5
+
+# only relevant when long_term is enabled
+long_term:
+  count_usage: True
+  max_mem_frames: 10
+  min_mem_frames: 5
+  num_prototypes: 128
+  max_num_tokens: 10000
+  buffer_tokens: 2000
+
+top_k: 30
+stagger_updates: 5
+chunk_size: -1 # number of objects to process in parallel; -1 means unlimited
+save_scores: False
+save_aux: False
+visualize: False
+
+datasets:
+  d16-val:
+    image_directory: ../DAVIS/2016/JPEGImages/480p
+    mask_directory: ../DAVIS/2016/Annotations/480p
+    subset: ../DAVIS/2017/trainval/ImageSets/2016/val.txt
+    size: 480
+    save_all: True
+    use_all_masks: False
+    use_long_term: False
+    mem_every: 5
+  d17-val:
+    image_directory: ../DAVIS/2017/trainval/JPEGImages/480p
+    mask_directory: ../DAVIS/2017/trainval/Annotations/480p
+    subset: ../DAVIS/2017/trainval/ImageSets/2017/val.txt
+    size: 480
+    save_all: True
+    use_all_masks: False
+    use_long_term: False
+    mem_every: 5
+  d17-test-dev:
+    image_directory: ../DAVIS/2017/test-dev/JPEGImages/480p
+    mask_directory: ../DAVIS/2017/test-dev/Annotations/480p
+    subset: ../DAVIS/2017/test-dev/ImageSets/2017/test-dev.txt
+    size: 480
+    save_all: True
+    use_all_masks: False
+    use_long_term: False
+    mem_every: 5
+  y18-val:
+    image_directory: ../YouTube2018/all_frames/valid_all_frames/JPEGImages
+    mask_directory: ../YouTube2018/valid/Annotations
+    json_directory: ../YouTube2018/valid/meta.json
+    size: 480
+    save_all: False
+    use_all_masks: True
+    use_long_term: False
+    mem_every: 5
+  y19-val:
+    image_directory: ../YouTube/all_frames/valid_all_frames/JPEGImages
+    mask_directory: ../YouTube/valid/Annotations
+    json_directory: ../YouTube/valid/meta.json
+    size: 480
+    save_all: False
+    use_all_masks: True
+    use_long_term: False
+    mem_every: 5
+  mose-val:
+    image_directory: ../MOSE/valid/JPEGImages
+    mask_directory: ../MOSE/valid/Annotations
+    size: 480
+    save_all: True
+    use_all_masks: False
+    use_long_term: False
+    mem_every: 5
+  generic:
+    image_directory: null
+    mask_directory: null
+    size: -1
+    save_all: True
+    use_all_masks: False
+    use_long_term: True
+    mem_every: 5
+  burst-val:
+    skip_frames: -1
+    image_directory: ../BURST/frames/val
+    json_directory: ../BURST/val/first_frame_annotations.json
+    size: 600
+    save_all: False
+    use_long_term: True
+    mem_every: 10
+  burst-test:
+    skip_frames: -1
+    image_directory: ../BURST/frames/test
+    json_directory: ../BURST/test/first_frame_annotations.json
+    size: 600
+    save_all: False
+    use_long_term: True
+    mem_every: 10
+  lvos-val:
+    image_directory: ../LVOS/valid/JPEGImages
+    mask_directory: ../LVOS/valid/Annotations_first_only
+    size: 480
+    save_all: False
+    use_all_masks: True
+    use_long_term: True
+    mem_every: 5
+  lvos-test:
+    image_directory: ../LVOS/test/JPEGImages
+    mask_directory: ../LVOS/test/Annotations
+    size: 480
+    save_all: False
+    use_all_masks: True
+    use_long_term: True
+    mem_every: 5
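For reference, this is roughly how the serverless function consumes eval_config.yaml. A minimal sketch assuming the Hydra 1.3 API and the config directory layout above; ModelHandler later in this patch does the same, using open_dict to patch the weights path:

    from hydra import compose, initialize
    from omegaconf import open_dict

    # config_path is relative to the calling module; matches nuclio/config above
    initialize(version_base='1.3.2', config_path='./config', job_name='eval_config')
    cfg = compose(config_name='eval_config')

    with open_dict(cfg):  # the config is struct by default; open it to override keys
        cfg['weights'] = '/opt/nuclio/cutie/cutie-base-mega.pth'

    print(cfg.max_internal_size)         # -1 (no forced resizing)
    print(cfg.long_term.max_mem_frames)  # 10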
diff --git a/serverless/pytorch/redefine/cutie/nuclio/config/hydra/job_logging/custom-no-rank.yaml b/serverless/pytorch/redefine/cutie/nuclio/config/hydra/job_logging/custom-no-rank.yaml
new file mode 100644
index 000000000000..0173c6839546
--- /dev/null
+++ b/serverless/pytorch/redefine/cutie/nuclio/config/hydra/job_logging/custom-no-rank.yaml
@@ -0,0 +1,22 @@
+# python logging configuration for tasks
+version: 1
+formatters:
+  simple:
+    format: '[%(asctime)s][%(levelname)s] - %(message)s'
+    datefmt: '%Y-%m-%d %H:%M:%S'
+handlers:
+  console:
+    class: logging.StreamHandler
+    formatter: simple
+    stream: ext://sys.stdout
+  file:
+    class: logging.FileHandler
+    formatter: simple
+    # absolute file path
+    filename: ${hydra.runtime.output_dir}/${now:%Y-%m-%d_%H-%M-%S}-eval.log
+    mode: w
+root:
+  level: INFO
+  handlers: [console, file]
+
+disable_existing_loggers: false
\ No newline at end of file
diff --git a/serverless/pytorch/redefine/cutie/nuclio/config/hydra/job_logging/custom.yaml b/serverless/pytorch/redefine/cutie/nuclio/config/hydra/job_logging/custom.yaml
new file mode 100644
index 000000000000..16d4969189b4
--- /dev/null
+++ b/serverless/pytorch/redefine/cutie/nuclio/config/hydra/job_logging/custom.yaml
@@ -0,0 +1,22 @@
+# python logging configuration for tasks
+version: 1
+formatters:
+  simple:
+    format: '[%(asctime)s][%(levelname)s][r${oc.env:LOCAL_RANK}] - %(message)s'
+    datefmt: '%Y-%m-%d %H:%M:%S'
+handlers:
+  console:
+    class: logging.StreamHandler
+    formatter: simple
+    stream: ext://sys.stdout
+  file:
+    class: logging.FileHandler
+    formatter: simple
+    # absolute file path
+    filename: ${hydra.runtime.output_dir}/${now:%Y-%m-%d_%H-%M-%S}-rank${oc.env:LOCAL_RANK}.log
+    mode: w
+root:
+  level: INFO
+  handlers: [console, file]
+
+disable_existing_loggers: false
\ No newline at end of file
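These job_logging files are standard Python dictConfig schemas that Hydra applies after resolving the ${...} interpolations. A hedged sketch of the equivalent direct call; the file handler is left out because its filename interpolation only exists inside a Hydra run:

    import logging
    import logging.config

    logging.config.dictConfig({
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'simple': {
                'format': '[%(asctime)s][%(levelname)s] - %(message)s',
                'datefmt': '%Y-%m-%d %H:%M:%S',
            },
        },
        'handlers': {
            'console': {
                'class': 'logging.StreamHandler',
                'formatter': 'simple',
                'stream': 'ext://sys.stdout',
            },
        },
        'root': {'level': 'INFO', 'handlers': ['console']},
    })
    logging.getLogger(__name__).info('configured like custom-no-rank.yaml')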
diff --git a/serverless/pytorch/redefine/cutie/nuclio/config/model/base.yaml b/serverless/pytorch/redefine/cutie/nuclio/config/model/base.yaml
new file mode 100644
index 000000000000..ee88cdc4847f
--- /dev/null
+++ b/serverless/pytorch/redefine/cutie/nuclio/config/model/base.yaml
@@ -0,0 +1,58 @@
+pixel_mean: [0.485, 0.456, 0.406]
+pixel_std: [0.229, 0.224, 0.225]
+
+pixel_dim: 256
+key_dim: 64
+value_dim: 256
+sensory_dim: 256
+embed_dim: 256
+
+pixel_encoder:
+  type: resnet50
+  ms_dims: [1024, 512, 256]
+
+mask_encoder:
+  type: resnet18
+  final_dim: 256
+
+pixel_pe_scale: 32
+pixel_pe_temperature: 128
+
+object_transformer:
+  embed_dim: ${model.embed_dim}
+  ff_dim: 2048
+  num_heads: 8
+  num_blocks: 3
+  num_queries: 16
+  read_from_pixel:
+    input_norm: False
+    input_add_pe: False
+    add_pe_to_qkv: [True, True, False]
+  read_from_past:
+    add_pe_to_qkv: [True, True, False]
+  read_from_memory:
+    add_pe_to_qkv: [True, True, False]
+  read_from_query:
+    add_pe_to_qkv: [True, True, False]
+    output_norm: False
+  query_self_attention:
+    add_pe_to_qkv: [True, True, False]
+  pixel_self_attention:
+    add_pe_to_qkv: [True, True, False]
+
+object_summarizer:
+  embed_dim: ${model.object_transformer.embed_dim}
+  num_summaries: ${model.object_transformer.num_queries}
+  add_pe: True
+
+aux_loss:
+  sensory:
+    enabled: True
+    weight: 0.01
+  query:
+    enabled: True
+    weight: 0.01
+
+mask_decoder:
+  # first value must equal embed_dim
+  up_dims: [256, 128, 128]
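The ${model.embed_dim} references above are OmegaConf interpolations, so the transformer and summarizer dimensions track embed_dim automatically; the up_dims comment, by contrast, is a manual invariant. A small sketch:

    from omegaconf import OmegaConf

    cfg = OmegaConf.create({
        'model': {
            'embed_dim': 256,
            'object_transformer': {'embed_dim': '${model.embed_dim}'},
            'mask_decoder': {'up_dims': [256, 128, 128]},
        },
    })
    assert cfg.model.object_transformer.embed_dim == 256  # resolved lazily
    # "first value must equal embed_dim" is not enforced by OmegaConf:
    assert cfg.model.mask_decoder.up_dims[0] == cfg.model.embed_dim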
diff --git a/serverless/pytorch/redefine/cutie/nuclio/function-gpu.yaml b/serverless/pytorch/redefine/cutie/nuclio/function-gpu.yaml
new file mode 100644
index 000000000000..379fe2ffd334
--- /dev/null
+++ b/serverless/pytorch/redefine/cutie/nuclio/function-gpu.yaml
@@ -0,0 +1,75 @@
+# Copyright (C) 2023-2024 CVAT.ai Corporation
+#
+# SPDX-License-Identifier: MIT
+
+metadata:
+  name: pth-redefine-cutie
+  namespace: redefine
+  annotations:
+    name: Cutie
+    version: 1
+    type: tracker
+    spec:
+    help_message: The tracker brings video-object-segmentation capabilities to CVAT.
+
+spec:
+  description: Video object segmentation with Cutie
+  runtime: 'python:3.9'
+  handler: main:handler
+  eventTimeout: 30s
+  env:
+    - name: PYTHONPATH
+      value: /opt/nuclio/cutie
+
+  build:
+    image: cvat.pth.redefine.cutie:latest-gpu
+    baseImage: nvidia/cuda:11.8.0-devel-ubuntu20.04
+
+    directives:
+      preCopy:
+        # set NVIDIA container runtime settings
+        - kind: ENV
+          value: NVIDIA_VISIBLE_DEVICES=all
+        - kind: ENV
+          value: NVIDIA_DRIVER_CAPABILITIES=compute,utility
+        # disable interactive frontend
+        - kind: ENV
+          value: DEBIAN_FRONTEND=noninteractive
+        # set workdir
+        - kind: WORKDIR
+          value: /opt/nuclio/cutie
+        # install basic deps
+        - kind: RUN
+          value: apt-get update && apt-get -y install curl git python3 python3-pip ffmpeg libsm6 libxext6
+        # install cutie deps
+        - kind: RUN
+          value: pip3 install torch==2.3.1+cu118 torchvision==0.18.1+cu118 torchaudio==2.3.1 -f https://download.pytorch.org/whl/torch_stable.html
+        # install cutie code
+        - kind: RUN
+          value: pip3 install git+https://github.com/hkchengrex/Cutie.git
+        # download interactive-segmentation weights
+        - kind: RUN
+          value: curl -LO https://github.com/hkchengrex/Cutie/releases/download/v1.0/coco_lvis_h18_itermask.pth
+        # download Cutie weights
+        - kind: RUN
+          value: curl -LO https://github.com/hkchengrex/Cutie/releases/download/v1.0/cutie-base-mega.pth
+        # map pip3 and python3 to pip and python
+        - kind: RUN
+          value: ln -s /usr/bin/pip3 /usr/local/bin/pip && ln -s /usr/bin/python3 /usr/bin/python
+
+  triggers:
+    myHttpTrigger:
+      maxWorkers: 1
+      kind: 'http'
+      workerAvailabilityTimeoutMilliseconds: 10000
+      attributes:
+        maxRequestBodySize: 33554432 # 32MB
+
+  resources:
+    limits:
+      nvidia.com/gpu: 1
+
+  platform:
+    attributes:
+      restartPolicy:
+        name: always
+        maximumRetryCount: 3
+      mountMode: volume
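Once deployed, the function is invoked through the nuclio HTTP trigger declared above. A hedged sketch of a client call; the port is an assumption (nuclio assigns it at deploy time), and the body is shaped the way main.py below expects:

    import base64
    import requests

    with open('sample/sample.jpg', 'rb') as f:
        image_b64 = base64.b64encode(f.read()).decode('ascii')

    payload = {
        'image': image_b64,
        'shapes': [[10.0, 10.0, 60.0, 10.0, 60.0, 60.0, 10.0, 60.0]],  # one seed polygon
        'states': [],  # empty on the first frame; pass back returned states afterwards
    }
    resp = requests.post('http://localhost:8080', json=payload, timeout=30)
    resp.raise_for_status()
    print(resp.json()['shapes'][0])  # polygon predicted for this frame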
diff --git a/serverless/pytorch/redefine/cutie/nuclio/function.yaml b/serverless/pytorch/redefine/cutie/nuclio/function.yaml
new file mode 100644
index 000000000000..796b72f76e10
--- /dev/null
+++ b/serverless/pytorch/redefine/cutie/nuclio/function.yaml
@@ -0,0 +1,72 @@
+# Copyright (C) 2023-2024 CVAT.ai Corporation
+#
+# SPDX-License-Identifier: MIT
+
+metadata:
+  name: pth-redefine-cutie
+  namespace: cvat
+  annotations:
+    name: Cutie
+    version: 1
+    type: tracker
+    spec:
+    help_message: The tracker brings video-object-segmentation capabilities to CVAT.
+
+spec:
+  description: Video object segmentation with Cutie
+  runtime: 'python:3.9'
+  handler: main:handler
+  eventTimeout: 30s
+  env:
+    - name: PYTHONPATH
+      value: /opt/nuclio/cutie
+
+  build:
+    image: cvat.pth.redefine.cutie
+    baseImage: ubuntu:22.04
+
+    directives:
+      preCopy:
+        # disable interactive frontend
+        - kind: ENV
+          value: DEBIAN_FRONTEND=noninteractive
+        # set workdir
+        - kind: WORKDIR
+          value: /opt/nuclio/cutie
+        # install basic deps
+        - kind: RUN
+          value: apt-get update && apt-get -y install curl git python3 python3-pip ffmpeg libsm6 libxext6
+        # install cutie deps (CPU wheels)
+        - kind: RUN
+          value: pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
+        - kind: RUN
+          value: pip3 install pycocotools matplotlib onnxruntime onnx jsonpickle
+        # install cutie code
+        - kind: RUN
+          value: pip3 install git+https://github.com/hkchengrex/Cutie.git
+        # download interactive-segmentation weights (-L follows redirects; -o sets the output path)
+        - kind: RUN
+          value: curl -L https://github.com/hkchengrex/Cutie/releases/download/v1.0/coco_lvis_h18_itermask.pth -o /opt/nuclio/cutie/coco_lvis_h18_itermask.pth
+        # download Cutie weights
+        - kind: RUN
+          value: curl -L https://github.com/hkchengrex/Cutie/releases/download/v1.0/cutie-base-mega.pth -o /opt/nuclio/cutie/cutie-base-mega.pth
+        # map pip3 and python3 to pip and python
+        - kind: RUN
+          value: ln -s /usr/bin/pip3 /usr/local/bin/pip && ln -s /usr/bin/python3 /usr/bin/python
+
+  triggers:
+    myHttpTrigger:
+      maxWorkers: 2
+      kind: 'http'
+      workerAvailabilityTimeoutMilliseconds: 10000
+      attributes:
+        maxRequestBodySize: 1073741824 # 1GB
+
+  platform:
+    attributes:
+      restartPolicy:
+        name: always
+        maximumRetryCount: 3
+      mountMode: volume
\ No newline at end of file
diff --git a/serverless/pytorch/redefine/cutie/nuclio/main.py b/serverless/pytorch/redefine/cutie/nuclio/main.py
new file mode 100644
index 000000000000..5a1e5e41476d
--- /dev/null
+++ b/serverless/pytorch/redefine/cutie/nuclio/main.py
@@ -0,0 +1,42 @@
+# Copyright (C) 2023 CVAT.ai Corporation
+#
+# SPDX-License-Identifier: MIT
+
+import json
+import base64
+from PIL import Image
+import io
+import numpy as np
+from model_handler import ModelHandler
+
+def init_context(context):
+    context.logger.info("Init context... 0%")
+    model = ModelHandler()
+    context.user_data.model = model
+    context.logger.info("Init context...100%")
+
+def handler(context, event):
+    context.logger.info("Run cutie model")
+    data = event.body
+    buf = io.BytesIO(base64.b64decode(data["image"]))
+    shapes = data.get("shapes")
+    states = data.get("states")
+    image = Image.open(buf).convert("RGB")
+    image = np.asarray(image)
+
+    results = {
+        "shapes": [],
+        "states": []
+    }
+
+    # one (shape, state) pair per input shape; states may be shorter on the first frame
+    for i, shape in enumerate(shapes):
+        shape, state = context.user_data.model.handle(image, shape, states[i] if i < len(states) else None)
+        results["shapes"].append(shape)
+        results["states"].append(state)
+
+    return context.Response(body=json.dumps(results), headers={},
+        content_type='application/json', status_code=200)
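To exercise init_context/handler without a nuclio runtime, stubs suffice. This sketch assumes the Cutie weights and config baked into the image are available locally; FakeEvent and the SimpleNamespace context are illustrative stand-ins, not nuclio SDK types:

    import base64
    import json
    import types

    import main  # the handler module above

    class FakeEvent:
        def __init__(self, body):
            self.body = body

    context = types.SimpleNamespace(
        logger=types.SimpleNamespace(info=print),
        user_data=types.SimpleNamespace(),
        Response=lambda **kwargs: kwargs,  # nuclio builds a Response object; a dict stands in
    )

    main.init_context(context)

    with open('sample/sample.jpg', 'rb') as f:
        body = {
            'image': base64.b64encode(f.read()).decode('ascii'),
            'shapes': [[10, 10, 60, 10, 60, 60, 10, 60]],  # one seed polygon
            'states': [],
        }

    result = main.handler(context, FakeEvent(body))
    print(json.loads(result['body'])['shapes'][0])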
+ """ + + # contours = None + # if int(cv2.__version__.split('.')[0]) > 3: + # contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0] + # else: + # contours = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[1] + + # contours = max(contours, key=lambda arr: arr.size) + # if contours.shape.count(1): + # contours = np.squeeze(contours) + # if contours.size < 3 * 2: + # raise Exception('Less then three point have been detected. Can not build a polygon.') + + # polygon = [] + # for point in contours: + # polygon.append([int(point[0]), int(point[1])]) + + contours = cv2.findContours( + mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_TC89_KCOS + )[0] + + if len(contours) == 0: + return [] + + contours = max(contours, key=lambda arr: arr.size) + + if contours.shape.count(1): + contours = np.squeeze(contours) + + if contours.size < 3 * 2: + return [] + + else: + return contours.reshape(-1).tolist() + +def prepare_numpy_image(image: np.ndarray, device: Literal["cpu", "cuda"]): + """Converts a numpy image to a torch tensor with the correct shape and dtype. + Args: + image (np.ndarray): The image to convert. Should be in the format (H, W, 3) and dtype uint8. + Returns: + torch.Tensor: The converted image, normalized, on the correct device as float32 with shape (3, H, W). + """ + + assert image.shape[2] == 3, "Image should be in the format (H, W, 3)" + assert len(image.shape) == 3, "Image should be of shape (H, W, 3)" + assert image.dtype == np.uint8, "Image should be of dtype uint8" + assert device in ["cpu", "cuda"], "Device should be either 'cpu' or 'cuda'" + + # Put the image on the correct device + image = torch.from_numpy(image.copy()).to(device=device) + # Convert to float32 and transpose to (3, H, W) + image = image.permute(2, 0, 1).float() + # Normalize the image + image = image / 255.0 + return image + +class ModelHandler: + def __init__(self) -> CUTIE: + self.device = 'cuda' if torch.cuda.is_available() else 'cpu' + coco_lvis_weights = '/opt/nuclio/cutie/coco_lvis_h18_itermask.pth' + cutie_base_weights = '/opt/nuclio/cutie/cutie-base-mega.pth' + + assert os.path.exists(cutie_base_weights), f"{cutie_base_weights} does not exist" + + # load configurations + initialize(version_base='1.3.2', config_path="./config", job_name="eval_config") + cfg = compose(config_name="eval_config") + with open_dict(cfg): + cfg['weights'] = cutie_base_weights + get_dataset_cfg(cfg) + + # load model + cutie = CUTIE(cfg).to(self.device).eval() + model_weights = torch.load(cutie_base_weights, map_location=torch.device(self.device)) + cutie.load_weights(model_weights) + + # use one processor per video + # self.processor = InferenceCore(cutie, cfg=cutie.cfg) + # self.processor.max_internal_size = 480 + self.cutie = cutie + + def encode_state(self, state): + # state.pop('net', None) + + for k,v in state.items(): + state[k] = jsonpickle.encode(v) + + return state + + def decode_state(self, state): + for k,v in state.items(): + state[k] = jsonpickle.decode(v) + + # state['net'] = copy(self.cutie) + + self.cutie = state['net'] + self.processor = InferenceCore(self.cutie, cfg=self.cutie.cfg) + self.processor.max_internal_size = 480 + + def handle(self, image: np.array, shape: Optional[List[float]]=None, state: Optional[Dict]=None)->Tuple[List[float], Optional[Dict]]: + image = prepare_numpy_image(image, self.device) + + if state is None: + mask = convert_polygon_to_mask(image, shape) # TODO: only handles single object, need to make sure multi-object can be dealt with + 
+class ModelHandler:
+    def __init__(self) -> None:
+        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        coco_lvis_weights = '/opt/nuclio/cutie/coco_lvis_h18_itermask.pth'
+        cutie_base_weights = '/opt/nuclio/cutie/cutie-base-mega.pth'
+
+        assert os.path.exists(cutie_base_weights), f"{cutie_base_weights} does not exist"
+
+        # load configurations
+        initialize(version_base='1.3.2', config_path="./config", job_name="eval_config")
+        cfg = compose(config_name="eval_config")
+        with open_dict(cfg):
+            cfg['weights'] = cutie_base_weights
+        get_dataset_cfg(cfg)
+
+        # load model
+        cutie = CUTIE(cfg).to(self.device).eval()
+        model_weights = torch.load(cutie_base_weights, map_location=torch.device(self.device))
+        cutie.load_weights(model_weights)
+
+        # one InferenceCore per video; created lazily in handle()
+        self.cutie = cutie
+
+    def encode_state(self, state):
+        for k, v in state.items():
+            state[k] = jsonpickle.encode(v)
+        return state
+
+    def decode_state(self, state):
+        for k, v in state.items():
+            state[k] = jsonpickle.decode(v)
+
+        self.cutie = state['net']
+        self.processor = InferenceCore(self.cutie, cfg=self.cutie.cfg)
+        self.processor.max_internal_size = 480
+
+    def handle(self, image: np.ndarray, shape: Optional[List[float]] = None, state: Optional[Dict] = None) -> Tuple[List[float], Optional[Dict]]:
+        if state is None:
+            # rasterize the seed polygon while `image` is still (H, W, 3) numpy;
+            # converting to a (3, H, W) tensor first would make image.shape[:2] wrong
+            # TODO: only handles a single object; multi-object input still needs support
+            mask = convert_polygon_to_mask(image, shape)
+            objects = np.unique(mask)
+            # background '0' does not count as an object
+            objects = objects[objects != 0].tolist()
+            mask = torch.from_numpy(mask).to(self.device)
+            image = prepare_numpy_image(image, self.device)
+            self.processor = InferenceCore(self.cutie, cfg=self.cutie.cfg)
+            self.processor.max_internal_size = 480
+            output_prob = self.processor.step(image, mask, objects=objects)
+            state = {}
+            state['net'] = self.cutie
+            state = self.encode_state(state)
+        else:
+            image = prepare_numpy_image(image, self.device)
+            self.decode_state(state)
+            output_prob = self.processor.step(image)
+            state = self.encode_state(state)
+
+        mask = self.processor.output_prob_to_mask(output_prob).cpu().numpy().astype(np.uint8)
+        shape = convert_mask_to_polygon(mask)
+
+        return shape, state
diff --git a/serverless/pytorch/redefine/cutie/nuclio/sample/sample.jpg b/serverless/pytorch/redefine/cutie/nuclio/sample/sample.jpg
new file mode 100644
index 000000000000..ef6b54eb69bd
Binary files /dev/null and b/serverless/pytorch/redefine/cutie/nuclio/sample/sample.jpg differ
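Finally, a hedged end-to-end sketch of the tracker contract ModelHandler implements: the first call seeds from a polygon, later calls propagate via the opaque state. It assumes the weights and ./config directory are present and that the script runs from the function's working directory:

    import numpy as np
    from model_handler import ModelHandler

    handler = ModelHandler()  # loads cutie-base-mega.pth and ./config via Hydra

    frame0 = np.zeros((240, 320, 3), dtype=np.uint8)
    frame0[60:120, 80:160] = 255                 # a bright block to segment
    frame1 = np.roll(frame0, 8, axis=1)          # the same block shifted right

    seed = [80, 60, 160, 60, 160, 120, 80, 120]  # polygon around the block
    polygon, state = handler.handle(frame0, seed)         # frame 0: init from mask
    polygon, state = handler.handle(frame1, None, state)  # frame 1: propagate
    print(len(polygon) // 2, 'polygon points on frame 1')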