diff --git a/docs/tutorials/image_to_3d.md b/docs/tutorials/image_to_3d.md index 0081d6b..9233ade 100644 --- a/docs/tutorials/image_to_3d.md +++ b/docs/tutorials/image_to_3d.md @@ -5,7 +5,13 @@ Generate **physically plausible 3D assets** from a single input image, supportin --- ## ⚡ Command-Line Usage -Support the use of [SAM3D](https://github.com/facebookresearch/sam-3d-objects) or [TRELLIS](https://github.com/microsoft/TRELLIS) as 3D generation model, modify `IMAGE3D_MODEL` in `embodied_gen/scripts/imageto3d.py` to switch model. +Three 3D generation backends are supported: + +- [`SAM3D`](https://github.com/facebookresearch/sam-3d-objects) — local model (default) +- [`TRELLIS`](https://github.com/microsoft/TRELLIS) — local model +- `HUNYUAN3D` — Tencent Hunyuan3D Pro cloud API (no local GPU model needed) + +Select the backend via `--image3d_model` (case-insensitive). Omit to use the default `SAM3D`. ```bash img3d-cli --image_path apps/assets/example_image/sample_00.jpg \ @@ -13,6 +19,18 @@ apps/assets/example_image/sample_01.jpg \ --n_retry 2 --output_root outputs/imageto3d ``` +### Using the Hunyuan3D Cloud Backend + +Hunyuan3D Pro runs entirely on Tencent Cloud — useful when you don't have a local GPU. It requires Tencent Cloud Hunyuan3D `SecretId` / `SecretKey` and network access to `ai3d.tencentcloudapi.com` and the COS download host. + +```bash +export TENCENT_SECRET_ID='your-secret-id' +export TENCENT_SECRET_KEY='your-secret-key' +img3d-cli --image3d_model HUNYUAN3D \ + --image_path apps/assets/example_image/sample_00.jpg \ + --output_root outputs/imageto3d_hunyuan +``` + You will get the following results:
diff --git a/docs/tutorials/text_to_3d.md b/docs/tutorials/text_to_3d.md index 0c4b0dc..44e083c 100644 --- a/docs/tutorials/text_to_3d.md +++ b/docs/tutorials/text_to_3d.md @@ -80,6 +80,27 @@ bash embodied_gen/scripts/textto3d.sh \ > Models with more permissive licenses can be found in `embodied_gen/models/image_comm_model.py`. +### Choosing the 3D Backend + +Three 3D generation backends are supported via `--image3d_model` (case-insensitive): + +- `SAM3D` (default) — text → image → 3D, local SAM3D model +- `TRELLIS` — text → image → 3D, local TRELLIS model +- `HUNYUAN3D` — Tencent Hunyuan3D Pro **text-to-3D** API; skips the text-to-image stage entirely and generates 3D directly from the prompt + +### Using the Hunyuan3D Cloud Backend + +Hunyuan3D Pro takes the prompt directly to a 3D mesh (no GPU model loaded locally; one job ≈ 3 minutes; Tencent Cloud is billed per submit). Set up credentials once: + +```bash +export TENCENT_SECRET_ID='your-secret-id' +export TENCENT_SECRET_KEY='your-secret-key' +text3d-cli --image3d_model HUNYUAN3D \ + --prompts "small bronze figurine of a lion" \ + --output_root outputs/textto3d_hunyuan +``` + + The generated results are organized as follows: ```sh diff --git a/embodied_gen/data/backproject_v3.py b/embodied_gen/data/backproject_v3.py index 81cea59..2c053c3 100644 --- a/embodied_gen/data/backproject_v3.py +++ b/embodied_gen/data/backproject_v3.py @@ -419,7 +419,7 @@ def parse_args(): parser.add_argument( "--save_glb_path", type=str, default=None, help="Save glb path." 
) - parser.add_argument("--n_max_faces", type=int, default=30000) + parser.add_argument("--n_max_faces", type=int, default=50000) args, unknown = parser.parse_known_args() return args diff --git a/embodied_gen/data/utils.py b/embodied_gen/data/utils.py index 74f96c6..f50bcaf 100644 --- a/embodied_gen/data/utils.py +++ b/embodied_gen/data/utils.py @@ -727,6 +727,7 @@ def save_mesh_with_mtl( output_path: str, material_base=(250, 250, 250, 255), mesh_process: bool = True, + glossiness: float = 250.0, ) -> trimesh.Trimesh: if isinstance(texture, np.ndarray): texture = Image.fromarray(texture) @@ -742,6 +743,8 @@ def save_mesh_with_mtl( diffuse=material_base, ambient=material_base, specular=material_base, + # 250 gives a tight visible highlight similar to glossy plastic. + glossiness=glossiness, ) dir_name = os.path.dirname(output_path) diff --git a/embodied_gen/models/hunyuan3d.py b/embodied_gen/models/hunyuan3d.py new file mode 100644 index 0000000..21102f8 --- /dev/null +++ b/embodied_gen/models/hunyuan3d.py @@ -0,0 +1,957 @@ +# Project EmbodiedGen +# +# Copyright (c) 2025 Horizon Robotics. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +# implied. See the License for the specific language governing +# permissions and limitations under the License. 
@dataclass(frozen=True)
class HunyuanConfig:
    """Tencent Hunyuan3D Pro endpoint + timing.

    Immutable bundle of the endpoint identity, action names, output format
    and the timeout/size limits used by the submit, poll and download
    helpers in this module.

    Defaults match the validated probe in ``outputs/hunyuan3d_api_expert/``.
    Only the Pro action set is supported.
    """

    # API host: connection target and value of the signed ``Host`` header.
    host: str = "ai3d.tencentcloudapi.com"
    # Service name used in the TC3 credential scope.
    service: str = "ai3d"
    # Sent as ``X-TC-Region``.
    region: str = "ap-guangzhou"
    # Sent as ``X-TC-Version``.
    version: str = "2025-05-13"
    # Submit action; used for both image and prompt submissions.
    image_action: str = "SubmitHunyuanTo3DProJob"
    # Action used when polling a job's status.
    query_action: str = "QueryHunyuanTo3DProJob"
    # Sent as the ``ResultFormat`` field of the submit payload.
    result_format: str = "GLB"
    # Longest-edge cap applied to the exported PBR PNGs.
    texture_size: int = 2048
    # The two timeouts are summed into one socket timeout per request.
    connect_timeout: float = 10.0
    read_timeout: float = 60.0
    # Seconds slept between status polls.
    poll_interval: float = 10.0
    # Polling gives up (TimeoutError) once this many seconds have elapsed.
    max_wait_seconds: float = 900.0
    # Downloads larger than this are aborted.
    max_download_bytes: int = 512 * 1024 * 1024
def _post_signed(
    payload_obj: dict,
    action: str,
    credentials: Tuple[str, str],
    cfg: HunyuanConfig,
) -> dict:
    """POST a Tencent Cloud TC3-signed JSON request and return ``Response``.

    Routes through a proxy via CONNECT when one is configured in the
    environment (``HTTPSConnection`` does not honor the env vars on its
    own). Never logs credentials, signed headers, the proxy value, or the
    request payload (which carries base64 image data).

    Args:
        payload_obj: JSON-serializable request body.
        action: Tencent Cloud action name (sent as ``X-TC-Action``).
        credentials: ``(SecretId, SecretKey)`` pair.
        cfg: Endpoint and timeout settings.

    Returns:
        The ``Response`` object of the reply (``{}`` if the key is absent).

    Raises:
        RuntimeError: On a non-2xx status, a non-JSON or malformed body, an
            unusable proxy setting, or a Tencent ``Error`` entry.
    """
    payload = json.dumps(
        payload_obj, separators=(",", ":"), ensure_ascii=False
    )
    headers = _signed_headers(payload, action, credentials, cfg)
    # ``http.client.HTTPSConnection`` does NOT auto-honor proxy env vars
    # (unlike ``urllib.request.urlopen``); read them explicitly and tunnel
    # via CONNECT. Lower-case spellings are checked too so the API call and
    # the urlopen-based download agree on whether a proxy is in use.
    proxy = next(
        (
            os.environ[name]
            for name in (
                "HTTPS_PROXY",
                "https_proxy",
                "HTTP_PROXY",
                "http_proxy",
            )
            if os.environ.get(name)
        ),
        None,
    )
    timeout = cfg.connect_timeout + cfg.read_timeout
    if proxy:
        from urllib.parse import urlparse

        # Accept scheme-less values such as ``proxy.corp:3128``; without a
        # scheme urlparse would not populate ``hostname``.
        p = urlparse(proxy if "://" in proxy else f"http://{proxy}")
        if not p.hostname:
            # The value is withheld: proxy URLs may embed credentials.
            raise RuntimeError(
                f"Hunyuan3D {action}: proxy environment variable is not "
                "a usable URL."
            )
        conn = HTTPSConnection(p.hostname, p.port or 80, timeout=timeout)
        conn.set_tunnel(cfg.host, 443)
    else:
        conn = HTTPSConnection(cfg.host, timeout=timeout)
    try:
        conn.request("POST", "/", body=payload.encode(), headers=headers)
        resp = conn.getresponse()
        status, body = resp.status, resp.read().decode(errors="replace")
    finally:
        conn.close()

    if not 200 <= status < 300:
        raise RuntimeError(
            f"Hunyuan3D {action} HTTP {status}; len={len(body)}."
        )
    try:
        decoded = json.loads(body)
    except json.JSONDecodeError as exc:
        raise RuntimeError(
            f"Hunyuan3D {action} non-JSON (HTTP {status}): {exc}"
        ) from exc
    # Valid JSON is not necessarily an object; report that as a normal
    # RuntimeError instead of leaking an AttributeError to callers.
    data = decoded.get("Response", {}) if isinstance(decoded, dict) else None
    if not isinstance(data, dict):
        raise RuntimeError(
            f"Hunyuan3D {action} unexpected response shape "
            f"(HTTP {status}); len={len(body)}."
        )
    err = data.get("Error")
    if err:
        code = err.get("Code") if isinstance(err, dict) else None
        message = err.get("Message") if isinstance(err, dict) else err
        raise RuntimeError(
            f"Hunyuan3D {action} Tencent error: "
            f"Code={code} Message={message} "
            f"RequestId={data.get('RequestId')}"
        )
    return data
+ + Provide exactly one of ``image_path`` (image-to-3D, body field + ``ImageBase64``) or ``prompt`` (text-to-3D, body field ``Prompt``). + Tencent's ``SubmitHunyuanTo3DProJob`` action is shared between both + modes; only the body discriminator differs. + """ + if (image_path is None) == (prompt is None): + raise ValueError( + "submit_pro_job requires exactly one of image_path or prompt." + ) + if credentials is None or cfg is None: + raise ValueError("credentials and cfg are required.") + + payload = {"ResultFormat": cfg.result_format, "EnablePBR": True} + if image_path is not None: + if not os.path.isfile(image_path): + raise FileNotFoundError( + f"Hunyuan3D input image missing: {image_path}" + ) + with open(image_path, "rb") as fh: + payload["ImageBase64"] = base64.b64encode(fh.read()).decode() + mode = "image" + else: + payload["Prompt"] = prompt + mode = "text" + + resp = _post_signed(payload, cfg.image_action, credentials, cfg) + job_id = resp.get("JobId") + if not job_id: + raise RuntimeError( + f"Hunyuan3D submit returned no JobId; " + f"RequestId={resp.get('RequestId')}." 
def _download_url_to_path(url: str, dst: str, cfg: HunyuanConfig) -> int:
    """Stream ``url`` to ``dst`` with size/timeout caps. Returns bytes written.

    The payload is written to ``<dst>.part`` and moved onto ``dst`` only
    after the whole body arrived within the size cap. A failed or oversized
    download therefore never leaves a truncated file at ``dst``; that
    matters because an existing GLB at the target path is reused on the
    next run instead of being fetched again.

    Logs only the host (signed URL paths carry short-lived auth tokens).

    Args:
        url: Download URL.
        dst: Destination file path; parent directories are created.
        cfg: Supplies the timeouts and ``max_download_bytes``.

    Returns:
        Number of bytes written to ``dst``.

    Raises:
        RuntimeError: On a non-2xx status, when the size cap is exceeded,
            or when the transfer fails with a URL error or timeout.
    """
    from urllib.parse import urlparse

    os.makedirs(os.path.dirname(os.path.abspath(dst)), exist_ok=True)
    logger.info(
        "HUNYUAN3D downloading %s from host=%s",
        os.path.basename(dst),
        urlparse(url).hostname or "?",
    )
    # Same directory as ``dst`` so the final rename stays on one filesystem.
    tmp_path = f"{dst}.part"
    total = 0
    timeout = cfg.connect_timeout + cfg.read_timeout
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            if not 200 <= resp.status < 300:
                raise RuntimeError(
                    f"Hunyuan3D download HTTP {resp.status} for "
                    f"{os.path.basename(dst)}."
                )
            with open(tmp_path, "wb") as out:
                while True:
                    chunk = resp.read(1024 * 1024)
                    if not chunk:
                        break
                    total += len(chunk)
                    if total > cfg.max_download_bytes:
                        raise RuntimeError(
                            f"Hunyuan3D download exceeded "
                            f"{cfg.max_download_bytes} bytes for "
                            f"{os.path.basename(dst)}."
                        )
                    out.write(chunk)
        # Publish only a complete file.
        os.replace(tmp_path, dst)
    except (urllib.error.URLError, socket.timeout) as exc:
        raise RuntimeError(
            f"Hunyuan3D download failed for {os.path.basename(dst)}: {exc}"
        ) from exc
    finally:
        # After a successful replace the temp file is gone; on any failure
        # remove the partial data. Cleanup must not mask the real error.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError:
                pass
    return total
def _texture_array(tex) -> Optional[np.ndarray]:
    """Convert a glTF texture image into an RGB ndarray.

    Args:
        tex: Texture object, expected to expose ``convert(mode)``; may be
            ``None``.

    Returns:
        ``np.asarray(tex.convert("RGB"))``, or ``None`` when ``tex`` is
        ``None`` or has no ``convert`` attribute.
    """
    if tex is not None and hasattr(tex, "convert"):
        rgb_image = tex.convert("RGB")
        return np.asarray(rgb_image)
    return None
str, + output_dir: str, + asset_name: str, + texture_size: int = 2048, + pre_align_rotation: Optional[np.ndarray] = None, +) -> str: + """Convert a Hunyuan3D Pro GLB into the full-PBR OBJ + MTL + PBR PNGs. + + Bakes the GLB scene transform, optionally applies ``pre_align_rotation`` + to the vertex array (used by the text-to-3D path, whose endpoint emits + a frame rotated 90° around the up axis relative to the image-to-3D + endpoint), recenters to the bbox origin (matching SAM3D's convention + of putting the model origin at the geometric center), and writes a + Blender-compatible OBJ/MTL referencing 4 PBR PNGs (baseColor / + metallic / roughness / normal) plus a ``_pbr_material.json`` metadata + sidecar. The source GLB at ``glb_path`` is overwritten with the + aligned mesh so downstream steps can reuse it. Returns the OBJ path. + """ + from trimesh.exchange.obj import export_obj + + os.makedirs(output_dir, exist_ok=True) + obj_path = os.path.join(output_dir, f"{asset_name}.obj") + mtl_path = os.path.join(output_dir, f"{asset_name}.mtl") + json_path = os.path.join(output_dir, f"{asset_name}_pbr_material.json") + + scene = trimesh.load(glb_path, force="scene", process=False) + mesh, baked_xform, geom_name = _bake_scene_transform(scene) + material = getattr(getattr(mesh, "visual", None), "material", None) + + # Align to SAM3D convention: optional pre-rotation (text-to-3D needs a + # -90° around the up axis to share the image-to-3D frame) + recenter to + # the bbox origin. Overwrite the source GLB so downstream steps can + # reuse the aligned full-PBR mesh without an extra load/export pass. 
+ V = np.asarray(mesh.vertices, dtype=np.float32) + if pre_align_rotation is not None: + V = V @ np.asarray(pre_align_rotation, dtype=np.float32) + bbox_center = (V.min(axis=0) + V.max(axis=0)) * 0.5 + mesh.vertices = V - bbox_center + mesh.export(glb_path) + + raw_name = getattr(material, "name", None) or f"{asset_name}_material" + material_name = re.sub(r"[^A-Za-z0-9_.-]+", "_", raw_name).strip("._") + if not material_name: + material_name = f"{asset_name}_material" + + # Write OBJ (rewrite usemtl so it points at our material, after mtllib). + obj_text = export_obj( + mesh, + include_normals=True, + include_color=True, + include_texture=True, + return_texture=False, + write_texture=False, + mtl_name=os.path.basename(mtl_path), + header=( + "Exported from Hunyuan3D Pro GLB; " + "scene transform baked, recentered to bbox origin" + ), + ) + obj_text = re.sub( + r"^usemtl\s+.+$", + f"usemtl {material_name}", + obj_text, + flags=re.MULTILINE, + ) + mtllib_line = f"mtllib {os.path.basename(mtl_path)}\n" + if f"usemtl {material_name}" not in obj_text: + obj_text = obj_text.replace( + mtllib_line, f"{mtllib_line}usemtl {material_name}\n", 1 + ) + with open(obj_path, "w", encoding="utf-8") as fh: + fh.write(obj_text) + + # PBR textures. metallicRoughnessTexture: G=roughness, B=metallic. 
+ base_arr = _texture_array(getattr(material, "baseColorTexture", None)) + mr_arr = _texture_array( + getattr(material, "metallicRoughnessTexture", None) + ) + normal_arr = _texture_array(getattr(material, "normalTexture", None)) + files = { + "baseColor": f"{asset_name}_baseColor.png", + "metallic": f"{asset_name}_metallic.png", + "roughness": f"{asset_name}_roughness.png", + "normal": f"{asset_name}_normal.png", + } + metallic_arr = ( + np.stack([mr_arr[:, :, 2]] * 3, axis=-1) + if mr_arr is not None + else None + ) + roughness_arr = ( + np.stack([mr_arr[:, :, 1]] * 3, axis=-1) + if mr_arr is not None + else None + ) + saved = { + "baseColor": _save_rgb( + base_arr, + os.path.join(output_dir, files["baseColor"]), + texture_size, + ), + "metallic": _save_rgb( + metallic_arr, + os.path.join(output_dir, files["metallic"]), + texture_size, + ), + "roughness": _save_rgb( + roughness_arr, + os.path.join(output_dir, files["roughness"]), + texture_size, + ), + "normal": _save_rgb( + normal_arr, + os.path.join(output_dir, files["normal"]), + texture_size, + ), + } + + def _factor(attr: str, default: float = 1.0) -> float: + v = getattr(material, attr, default) + return default if v is None else float(v) + + bc = getattr(material, "baseColorFactor", None) + if bc is None: + base_factor = [1.0, 1.0, 1.0, 1.0] + else: + arr = np.asarray(bc, dtype=float).reshape(-1) + if arr.max(initial=1.0) > 1.0: + arr = arr / 255.0 + base_factor = [float(arr[0]), float(arr[1]), float(arr[2])] + base_factor.append(float(arr[3]) if len(arr) >= 4 else 1.0) + metallic_factor = _factor("metallicFactor", 1.0) + roughness_factor = _factor("roughnessFactor", 1.0) + + ns = max(1.0, min(1000.0, (1.0 - roughness_factor) * 1000.0)) + lines = [ + "# Exported from Hunyuan3D Pro GLB", + "# PBR note: glTF metallicRoughnessTexture stores roughness in G " + "and metallic in B.", + f"newmtl {material_name}", + f"Ka {base_factor[0]:.8g} {base_factor[1]:.8g} {base_factor[2]:.8g}", + f"Kd {base_factor[0]:.8g} 
{base_factor[1]:.8g} {base_factor[2]:.8g}", + "Ks 0 0 0", + f"Ns {ns:.8g}", + f"d {base_factor[3]:.8g}", + "illum 2", + f"Pm {metallic_factor:.8g}", + f"Pr {roughness_factor:.8g}", + ] + if saved["baseColor"]: + lines.append(f"map_Kd {files['baseColor']}") + if saved["normal"]: + lines.append(f"norm {files['normal']}") + lines.append(f"bump {files['normal']}") + if saved["metallic"]: + lines.append(f"map_Pm {files['metallic']}") + if saved["roughness"]: + lines.append(f"map_Pr {files['roughness']}") + with open(mtl_path, "w", encoding="utf-8") as fh: + fh.write("\n".join(lines) + "\n") + + metadata = { + "source": glb_path, + "obj": os.path.basename(obj_path), + "mtl": os.path.basename(mtl_path), + "material": material_name, + "geometry": geom_name, + "alignment": "recenter_to_bbox_origin", + "bakedTransform": np.asarray(baked_xform).tolist(), + "sourceSceneBounds": np.asarray(scene.bounds).tolist(), + "exportedObjBounds": np.asarray(mesh.bounds).tolist(), + "baseColorFactor": base_factor, + "metallicFactor": metallic_factor, + "roughnessFactor": roughness_factor, + "textureMaxEdge": texture_size, + "textures": {k: (files[k] if saved[k] else None) for k in files}, + } + with open(json_path, "w", encoding="utf-8") as fh: + fh.write(json.dumps(metadata, indent=2) + "\n") + + return obj_path + + +def _ship_scaled_pbr_artefacts( + aligned_glb: str, + urdf_path: str, + output_root: str, + final_mesh_dir: str, + asset_name: str, +) -> None: + """Write scaled OBJ companions + GLB with full PBR into ``final_mesh_dir``. + + URDFGen's trimesh roundtrip drops Hunyuan's metallic/roughness/normal + maps; we restore PBR fidelity by: + + 1. Inferring the scale factor URDFGen applied by comparing the + scaled OBJ's extent with the aligned source GLB's extent (the + URDF ```` element stores ``real_height`` instead, which is + a different quantity). + 2. Loading the aligned full-PBR source GLB, scaling it, and writing + it next to URDFGen's OBJ so both share the same scale. + 3. 
Copying the 4 PBR PNGs from ``output_root`` into the mesh dir. + 4. Patching URDFGen's ``material.mtl`` so Phong rendering has a + visible specular highlight and PBR-aware OBJ importers pick up + ``map_Pm`` / ``map_Pr`` / ``norm`` / ``bump`` references. + """ + # Derive the actual scale factor from URDFGen's scaled OBJ rather than + # the URDF ```` element (which stores real_height — a midpoint + # value distinct from the geometric scaling factor URDFGen applied). + urdfgen_obj = trimesh.load( + os.path.join(final_mesh_dir, f"{asset_name}.obj"), + force="mesh", + process=False, + ) + target_max = float(urdfgen_obj.extents.max()) + + scene = trimesh.load(aligned_glb, force="scene", process=False) + mesh, _, _ = _bake_scene_transform(scene) + V = np.asarray(mesh.vertices, dtype=np.float32) + src_max = float((V.max(axis=0) - V.min(axis=0)).max()) + scale = target_max / src_max if src_max > 1e-9 else 1.0 + mesh.vertices = V * scale + mesh.export(os.path.join(final_mesh_dir, f"{asset_name}.glb")) + + pbr_pngs = { + "metallic": f"{asset_name}_metallic.png", + "roughness": f"{asset_name}_roughness.png", + "normal": f"{asset_name}_normal.png", + } + base_color_png = f"{asset_name}_baseColor.png" + pbr_json = f"{asset_name}_pbr_material.json" + for fname in ( + base_color_png, + pbr_pngs["metallic"], + pbr_pngs["roughness"], + pbr_pngs["normal"], + pbr_json, + ): + src = os.path.join(output_root, fname) + if os.path.exists(src): + copy(src, os.path.join(final_mesh_dir, fname)) + + mtl_path = os.path.join(final_mesh_dir, "material.mtl") + if os.path.exists(mtl_path): + with open(mtl_path) as fh: + mtl_text = fh.read() + # trimesh's OBJ exporter writes ``Ks 0 0 0`` + ``Ns 1`` which makes + # Blender's OBJ Phong path render the surface as flat matte. Bump + # specular and shininess so the OBJ has visible highlights matching + # the PBR GLB, then append the PBR texture map references that + # PBR-aware OBJ importers (Blender 3.6+, others) will pick up. 
def _parse_range(spec: str, label: str) -> Tuple[float, float]:
    """Parse a ``"LO-HI"`` CLI range string into a ``(lo, hi)`` float pair."""
    parts = spec.split("-")
    if len(parts) != 2:
        raise ValueError(
            f"{label} must be formatted as 'LO-HI', got {spec!r}."
        )
    try:
        return float(parts[0]), float(parts[1])
    except ValueError as exc:
        raise ValueError(
            f"{label} must contain two numbers as 'LO-HI', got {spec!r}."
        ) from exc


def _build_asset_attrs(args, idx: int) -> dict:
    """Build the URDF asset_attrs dict from CLI args.

    Args:
        args: Parsed CLI namespace; reads ``version``, ``height_range``,
            ``mass_range`` and ``asset_type``.
        idx: Index of the current input, used to pick its ``asset_type``.

    Returns:
        Dict that always has ``version`` and, when the matching argument is
        set, ``min_height``/``max_height``, ``min_mass``/``max_mass`` and
        ``category``.

    Raises:
        ValueError: If a range argument is not of the form ``LO-HI``.
    """
    attrs = {"version": args.version or VERSION}
    if args.height_range:
        attrs["min_height"], attrs["max_height"] = _parse_range(
            args.height_range, "height_range"
        )
    if args.mass_range:
        attrs["min_mass"], attrs["max_mass"] = _parse_range(
            args.mass_range, "mass_range"
        )
    asset_types = args.asset_type
    # The category is optional. A list shorter than the number of inputs
    # must not raise IndexError here: this runs after the cloud job has
    # already completed, so a crash would waste the generated asset.
    if (
        isinstance(asset_types, list)
        and idx < len(asset_types)
        and asset_types[idx]
    ):
        attrs["category"] = asset_types[idx]
    return attrs
def _process_glb(
    args,
    idx: int,
    output_root: str,
    filename: str,
    cfg: HunyuanConfig,
    checkers: list,
    log_label: str,
    seg_input_pair: Optional[Tuple[str, str]] = None,
    pre_align_rotation: Optional[np.ndarray] = None,
) -> str:
    """GLB-to-result post-processing shared by image and text paths.

    Expects a GLB at ``{output_root}/{filename}.glb``. Runs
    ``export_glb_to_obj`` → video render → URDFGen → PBR fidelity fixup →
    single-arg quality checks (skipped when ``checkers`` is empty) →
    ``result/`` organization.

    Args:
        args: Parsed CLI namespace; reads ``disable_decompose_convex``,
            ``keep_intermediate`` and the fields consumed by
            ``_build_asset_attrs``.
        idx: Index of the current input within the CLI lists.
        output_root: Working directory for this asset.
        filename: Asset base name (no extension).
        cfg: Supplies ``texture_size`` for the exported PNGs.
        checkers: Single-arg quality checkers; may be empty.
        log_label: Text used in the final log line.
        seg_input_pair: Raw/cond image paths handed to ``ImageSegChecker``
            instances; the text path passes ``None``.
        pre_align_rotation: Optional 3x3 rotation forwarded to
            ``export_glb_to_obj``.

    Returns:
        The result dir path.
    """
    glb_path = os.path.join(output_root, f"{filename}.glb")
    # export_glb_to_obj returns the OBJ path it wrote; reuse it instead of
    # rebuilding the same path by hand.
    mesh_obj_path = export_glb_to_obj(
        glb_path=glb_path,
        output_dir=output_root,
        asset_name=filename,
        texture_size=cfg.texture_size,
        pre_align_rotation=pre_align_rotation,
    )

    video_path = _render_color_video(
        mesh_obj_path, os.path.join(output_root, "_video"), filename
    )

    urdf_convertor = URDFGenerator(
        GPT_CLIENT,
        render_view_num=4,
        decompose_convex=not args.disable_decompose_convex,
    )
    urdf_root = f"{output_root}/URDF_{filename}"
    urdf_path = urdf_convertor(
        mesh_path=mesh_obj_path,
        output_root=urdf_root,
        **_build_asset_attrs(args, idx),
    )

    # Final mesh dir: keep URDFGen's scaled OBJ + collision, restore full
    # PBR fidelity that URDFGen's simple trimesh roundtrip strips (rescaled
    # source GLB + PBR map refs appended to material.mtl).
    final_mesh_dir = f"{urdf_root}/{urdf_convertor.output_mesh_dir}"
    _ship_scaled_pbr_artefacts(
        aligned_glb=glb_path,
        urdf_path=urdf_path,
        output_root=output_root,
        final_mesh_dir=final_mesh_dir,
        asset_name=filename,
    )

    # Quality checks: only the single-arg (BaseChecker.validate) ones go
    # here. Two-arg checkers like TextGenAlignChecker run in the caller.
    if checkers:
        # glob() yields files in arbitrary order; sort so the view grid
        # shown to the checkers is the same on every run and filesystem.
        render_image_paths = sorted(
            glob(
                f"{urdf_root}/{urdf_convertor.output_render_dir}"
                "/image_color/*.png"
            )
        )
        images_list = []
        for ch in checkers:
            if isinstance(ch, ImageSegChecker) and seg_input_pair is not None:
                images_list.append(list(seg_input_pair))
            else:
                images_list.append(combine_images_to_grid(render_image_paths))
        qa_results = BaseChecker.validate(checkers, images_list)
        urdf_convertor.add_quality_tag(urdf_path, qa_results)

    # Organize result/ (no gs.ply; video.mp4 included when render OK).
    result_dir = f"{output_root}/result"
    if os.path.exists(result_dir):
        rmtree(result_dir, ignore_errors=True)
    os.makedirs(result_dir, exist_ok=True)
    copy(urdf_path, f"{result_dir}/{os.path.basename(urdf_path)}")
    copytree(
        final_mesh_dir,
        f"{result_dir}/{urdf_convertor.output_mesh_dir}",
    )
    if video_path and os.path.exists(video_path):
        copy(video_path, f"{result_dir}/video.mp4")

    if not args.keep_intermediate:
        delete_dir(output_root, keep_subs=["result"])

    logger.info(f"Saved results for {log_label} in {result_dir}")
    return result_dir
+ """ + glb_path = os.path.join(output_root, f"{filename}.glb") + if os.path.exists(glb_path): + logger.info( + "HUNYUAN3D reusing existing GLB at %s; skipping Tencent API call.", + glb_path, + ) + return + creds = hunyuan_credentials or load_credentials() + acquire_pro_glb( + image_path=image_path, + prompt=prompt, + output_dir=output_root, + asset_name=filename, + credentials=creds, + cfg=cfg, + ) + + +def process_image( + args, + idx: int, + image_path: str, + output_root: str, + filename: str, + hunyuan_config: Optional[HunyuanConfig], + hunyuan_credentials: Optional[Tuple[str, str]], + checkers: list, +) -> None: + """HUNYUAN3D image-to-3D entry: image → GLB → export → URDF → result/.""" + cfg = hunyuan_config or HunyuanConfig() + _acquire_or_reuse_glb( + output_root, filename, cfg, hunyuan_credentials, image_path=image_path + ) + _process_glb( + args=args, + idx=idx, + output_root=output_root, + filename=filename, + cfg=cfg, + checkers=checkers, + log_label=image_path, + seg_input_pair=( + f"{output_root}/{filename}_raw.png", + f"{output_root}/{filename}_cond.png", + ), + ) + + +def process_prompt( + args, + idx: int, + prompt: str, + output_root: str, + filename: str, + hunyuan_config: Optional[HunyuanConfig], + hunyuan_credentials: Optional[Tuple[str, str]], + checkers: list, +) -> None: + """HUNYUAN3D text-to-3D entry: prompt → GLB → export → URDF → result/. + + Text path skips ``text-to-image`` entirely; ``checkers`` should only + contain single-arg (``BaseChecker.validate``-compatible) checkers. + Two-arg checkers like ``TextGenAlignChecker`` should be invoked by + the caller after this returns. + """ + cfg = hunyuan_config or HunyuanConfig() + _acquire_or_reuse_glb( + output_root, filename, cfg, hunyuan_credentials, prompt=prompt + ) + # Text endpoint sits 90° offset around the up axis vs the image + # endpoint; fold the alignment rotation into export_glb_to_obj's + # single mesh-transform pass to avoid a separate GLB roundtrip. 
IMAGE3D_MODEL = "SAM3D"  # default backend; SAM3D, TRELLIS, or HUNYUAN3D
SUPPORTED_IMAGE3D_MODELS = ("SAM3D", "TRELLIS", "HUNYUAN3D")


# Process-wide cache of already constructed local pipelines, keyed by the
# canonical (upper-case) backend name.
_PIPELINE_CACHE: dict = {}


def _build_image3d_pipeline(name: str):
    """Lazily instantiate (and cache) the local image-to-3D pipeline.

    The cache preserves the pre-refactor invariant that the local backend
    is loaded once per process: ``textto3d.py`` calls ``entrypoint`` in a
    per-node loop, and re-loading weights each call would regress runtime.

    Args:
        name: Backend name. Matched case-insensitively and with surrounding
            whitespace ignored, mirroring the normalisation ``entrypoint``
            applies to ``--image3d_model``.

    Returns:
        The cached or newly built pipeline object, or ``None`` for backends
        that have no local model (HUNYUAN3D).

    Raises:
        ValueError: If ``name`` is not one of ``SUPPORTED_IMAGE3D_MODELS``.
    """
    # Normalise here too, so direct callers get the same case-insensitive
    # contract as the CLI and the cache cannot be split by spelling.
    backend = str(name).strip().upper()
    if backend == "HUNYUAN3D":
        return None
    if backend in _PIPELINE_CACHE:
        return _PIPELINE_CACHE[backend]
    if backend == "TRELLIS":
        logger.info("Loading TRELLIS as Image3D Models...")
        # Imported lazily so the heavy dependency is only required when
        # this backend is actually selected.
        from thirdparty.TRELLIS.trellis.pipelines import (
            TrellisImageTo3DPipeline,
        )

        pipeline = TrellisImageTo3DPipeline.from_pretrained(
            "microsoft/TRELLIS-image-large"
        )
    elif backend == "SAM3D":
        logger.info("Loading SAM3D as Image3D Models...")
        from embodied_gen.models.sam3d import Sam3dInference

        pipeline = Sam3dInference()
    else:
        raise ValueError(
            f"Unsupported image3d backend {name!r}; "
            f"expected one of {SUPPORTED_IMAGE3D_MODELS}."
        )
    _PIPELINE_CACHE[backend] = pipeline
    return pipeline
+ hunyuan_credentials = load_credentials() + hunyuan_config = HunyuanConfig() + logger.info( + "HUNYUAN3D backend: action=%s host=%s result_format=%s", + hunyuan_config.image_action, + hunyuan_config.host, + hunyuan_config.result_format, + ) + + pipeline = _build_image3d_pipeline(args.image3d_model) + assert ( args.image_path or args.image_root ), "Please provide either --image_path or --image_root." @@ -151,6 +217,19 @@ def entrypoint(**kwargs): seg_image = RBG_REMOVER(image) if image.mode != "RGBA" else image seg_image.save(seg_path) + if args.image3d_model == "HUNYUAN3D": + process_image( + args=args, + idx=idx, + image_path=image_path, + output_root=output_root, + filename=filename, + hunyuan_config=hunyuan_config, + hunyuan_credentials=hunyuan_credentials, + checkers=CHECKERS, + ) + continue + seed = args.seed asset_node = "unknown" gs_model = None @@ -161,7 +240,7 @@ def entrypoint(**kwargs): f"Try: {try_idx + 1}/{args.n_retry}, Seed: {seed}, Prompt: {seg_path}" ) try: - outputs = image3d_model_infer(PIPELINE, seg_image, seed) + outputs = image3d_model_infer(pipeline, seg_image, seed) except Exception as e: logger.error( f"[Image3D Failed] process {image_path}: {e}, retry: {try_idx+1}/{args.n_retry}" diff --git a/embodied_gen/scripts/textto3d.py b/embodied_gen/scripts/textto3d.py index c5fe5f3..1427cea 100644 --- a/embodied_gen/scripts/textto3d.py +++ b/embodied_gen/scripts/textto3d.py @@ -17,6 +17,7 @@ import argparse import os import random +import types from collections import defaultdict import numpy as np @@ -25,6 +26,10 @@ from embodied_gen.models.image_comm_model import build_hf_image_pipeline from embodied_gen.models.segment_model import RembgRemover from embodied_gen.models.text_model import PROMPT_APPEND +from embodied_gen.scripts.imageto3d import ( + IMAGE3D_MODEL, + SUPPORTED_IMAGE3D_MODELS, +) from embodied_gen.scripts.imageto3d import entrypoint as imageto3d_api from embodied_gen.utils.gpt_clients import GPT_CLIENT from embodied_gen.utils.log 
# The text-to-image stack (PIPE_IMG, BG_REMOVER, SEMANTIC_CHECKER, SEG_CHECKER)
# is only used on the SAM3D / TRELLIS path. HUNYUAN3D goes directly from
# prompt to 3D, so these stay ``None`` until ``text_to_image`` first runs.
SEMANTIC_CHECKER = None
SEG_CHECKER = None
PIPE_IMG = None
BG_REMOVER = None


def _ensure_text2img_stack() -> None:
    """Construct the text-to-image pipeline + image-stage checkers once.

    Called from the SAM3D / TRELLIS path before any ``text_to_image`` run.
    Idempotent: once all four components exist, subsequent calls return
    immediately.

    The components are built into locals and published together, so a
    failure part-way through (for example while creating the background
    remover) leaves every global at its previous value and the next call
    retries, instead of treating a half-built stack as ready.
    """
    global SEMANTIC_CHECKER, SEG_CHECKER, PIPE_IMG, BG_REMOVER
    current = (SEMANTIC_CHECKER, SEG_CHECKER, PIPE_IMG, BG_REMOVER)
    if all(component is not None for component in current):
        return
    logger.info("Loading TEXT2IMG_MODEL...")
    semantic_checker = SemanticConsistChecker(GPT_CLIENT)
    seg_checker = ImageSegChecker(GPT_CLIENT)
    pipe_img = build_hf_image_pipeline(os.environ.get("TEXT_MODEL", "sd35"))
    bg_remover = RembgRemover()
    # Publish only after every constructor has succeeded.
    SEMANTIC_CHECKER, SEG_CHECKER, PIPE_IMG, BG_REMOVER = (
        semantic_checker,
        seg_checker,
        pipe_img,
        bg_remover,
    )
+ ) + + hunyuan_cfg = None + hunyuan_creds = None + process_prompt = None + if args.image3d_model == "HUNYUAN3D": + from embodied_gen.models.hunyuan3d import ( + HunyuanConfig, + load_credentials, + process_prompt, + ) + + # Fail fast on missing creds before any network I/O. + hunyuan_creds = load_credentials() + hunyuan_cfg = HunyuanConfig() + logger.info( + "HUNYUAN3D text-to-3D backend: action=%s host=%s result_format=%s", + hunyuan_cfg.image_action, + hunyuan_cfg.host, + hunyuan_cfg.result_format, + ) + if args.asset_names is None or len(args.asset_names) == 0: args.asset_names = [f"sample3d_{i}" for i in range(len(args.prompts))] - img_save_dir = os.path.join(args.output_root, "images") asset_save_dir = os.path.join(args.output_root, "asset3d") - os.makedirs(img_save_dir, exist_ok=True) os.makedirs(asset_save_dir, exist_ok=True) + # HUNYUAN3D path skips text-to-image entirely; the images/ dir only + # exists when the local SAM3D / TRELLIS pipeline produces conditioning + # images. + img_save_dir = os.path.join(args.output_root, "images") + if args.image3d_model != "HUNYUAN3D": + os.makedirs(img_save_dir, exist_ok=True) results = defaultdict(dict) - for prompt, node in zip(args.prompts, args.asset_names): + for idx, (prompt, node) in enumerate(zip(args.prompts, args.asset_names)): success_flag = False n_pipe_retry = args.n_pipe_retry seed_img = args.seed_img seed_3d = args.seed_3d + # Tencent Pro API is charged per submit; force a single attempt to + # avoid silently multiplying cost when --n_pipe_retry > 1. 
+ if args.image3d_model == "HUNYUAN3D" and n_pipe_retry > 1: + logger.warning( + "HUNYUAN3D mode: --n_pipe_retry forced to 1 (Tencent API " + "is charged per submit); user passed %d.", + n_pipe_retry, + ) + n_pipe_retry = 1 while success_flag is False and n_pipe_retry > 0: logger.info( f"GEN pipeline for node {node}\n" f"Try round: {args.n_pipe_retry-n_pipe_retry+1}/{args.n_pipe_retry}, Prompt: {prompt}" ) - # Text-to-image GEN save_node = node.replace(" ", "_") - gen_image_path = f"{img_save_dir}/{save_node}.png" - textgen_flag = text_to_image( - prompt, - gen_image_path, - args.n_image_retry, - args.img_denoise_step, - args.text_guidance_scale, - args.n_img_sample, - seed=seed_img, - ) - - # Asset 3D GEN node_save_dir = f"{asset_save_dir}/{save_node}" asset_type = node if "sample3d_" not in node else None - imageto3d_api( - image_path=[gen_image_path], - output_root=node_save_dir, - asset_type=[asset_type], - seed=random.randint(0, 100000) if seed_3d is None else seed_3d, - n_retry=args.n_asset_retry, - keep_intermediate=args.keep_intermediate, - disable_decompose_convex=args.disable_decompose_convex, - ) + + if args.image3d_model == "HUNYUAN3D": + hunyuan_args = types.SimpleNamespace( + asset_type=[asset_type], + version=None, + height_range=None, + mass_range=None, + disable_decompose_convex=args.disable_decompose_convex, + keep_intermediate=args.keep_intermediate, + ) + process_prompt( + args=hunyuan_args, + idx=0, + prompt=prompt, + output_root=node_save_dir, + filename=save_node, + hunyuan_config=hunyuan_cfg, + hunyuan_credentials=hunyuan_creds, + checkers=[], + ) + else: + # Text-to-image GEN (SAM3D / TRELLIS path). 
+ gen_image_path = f"{img_save_dir}/{save_node}.png" + text_to_image( + prompt, + gen_image_path, + args.n_image_retry, + args.img_denoise_step, + args.text_guidance_scale, + args.n_img_sample, + seed=seed_img, + ) + + # Asset 3D GEN + imageto3d_api( + image_path=[gen_image_path], + output_root=node_save_dir, + asset_type=[asset_type], + seed=( + random.randint(0, 100000) + if seed_3d is None + else seed_3d + ), + n_retry=args.n_asset_retry, + keep_intermediate=args.keep_intermediate, + disable_decompose_convex=args.disable_decompose_convex, + image3d_model=args.image3d_model, + ) mesh_path = f"{node_save_dir}/result/mesh/{save_node}.obj" image_path = render_asset3d( mesh_path, @@ -272,6 +364,18 @@ def parse_args(): ) parser.add_argument("--keep_intermediate", action="store_true") parser.add_argument("--disable_decompose_convex", action="store_true") + parser.add_argument( + "--image3d_model", + type=str, + default=IMAGE3D_MODEL, + help=( + "Image-to-3D backend selector forwarded to imageto3d. One of " + f"{', '.join(SUPPORTED_IMAGE3D_MODELS)} (case-insensitive). " + "HUNYUAN3D skips the text-to-image stage entirely and calls " + "Tencent Hunyuan3D Pro text-to-3D directly; it requires " + "TENCENT_SECRET_ID/TENCENT_SECRET_KEY in the environment." + ), + ) args, unknown = parser.parse_known_args() diff --git a/embodied_gen/validators/urdf_convertor.py b/embodied_gen/validators/urdf_convertor.py index 6c770f6..f24417d 100644 --- a/embodied_gen/validators/urdf_convertor.py +++ b/embodied_gen/validators/urdf_convertor.py @@ -75,7 +75,7 @@ 0.0 0.0 "-1" - "" + @@ -132,9 +132,7 @@ def __init__( view_desc = "This is the rendered views " if prompt_template is None: - prompt_template = ( - view_desc - + """of the 3D object asset, + prompt_template = view_desc + """of the 3D object asset, category: {category}. You are an expert in 3D object analysis and physical property estimation. 
Give the category of this object asset (within 3 words), (if category is @@ -176,7 +174,6 @@ def __init__( Assume the object is in real-world scale and estimate the approximate vertical height based on the pose estimation and how large it appears vertically in the first image. """ - ) self.prompt_template = prompt_template if attrs_name is None: