diff --git a/docs/tutorials/image_to_3d.md b/docs/tutorials/image_to_3d.md
index 0081d6b..9233ade 100644
--- a/docs/tutorials/image_to_3d.md
+++ b/docs/tutorials/image_to_3d.md
@@ -5,7 +5,13 @@ Generate **physically plausible 3D assets** from a single input image, supportin
---
## ⚡ Command-Line Usage
-Support the use of [SAM3D](https://github.com/facebookresearch/sam-3d-objects) or [TRELLIS](https://github.com/microsoft/TRELLIS) as 3D generation model, modify `IMAGE3D_MODEL` in `embodied_gen/scripts/imageto3d.py` to switch model.
+Three 3D generation backends are supported:
+
+- [`SAM3D`](https://github.com/facebookresearch/sam-3d-objects) — local model (default)
+- [`TRELLIS`](https://github.com/microsoft/TRELLIS) — local model
+- `HUNYUAN3D` — Tencent Hunyuan3D Pro cloud API (no local GPU model needed)
+
+Select the backend via `--image3d_model` (case-insensitive). Omit to use the default `SAM3D`.
```bash
img3d-cli --image_path apps/assets/example_image/sample_00.jpg \
@@ -13,6 +19,18 @@ apps/assets/example_image/sample_01.jpg \
--n_retry 2 --output_root outputs/imageto3d
```
+### Using the Hunyuan3D Cloud Backend
+
+Hunyuan3D Pro runs entirely on Tencent Cloud — useful when you don't have a local GPU. It requires Tencent Cloud Hunyuan3D `SecretId` / `SecretKey` and network access to `ai3d.tencentcloudapi.com` and the COS download host.
+
+```bash
+export TENCENT_SECRET_ID='your-secret-id'
+export TENCENT_SECRET_KEY='your-secret-key'
+img3d-cli --image3d_model HUNYUAN3D \
+ --image_path apps/assets/example_image/sample_00.jpg \
+ --output_root outputs/imageto3d_hunyuan
+```
+
You will get the following results:
diff --git a/docs/tutorials/text_to_3d.md b/docs/tutorials/text_to_3d.md
index 0c4b0dc..44e083c 100644
--- a/docs/tutorials/text_to_3d.md
+++ b/docs/tutorials/text_to_3d.md
@@ -80,6 +80,27 @@ bash embodied_gen/scripts/textto3d.sh \
> Models with more permissive licenses can be found in `embodied_gen/models/image_comm_model.py`.
+### Choosing the 3D Backend
+
+Three 3D generation backends are supported via `--image3d_model` (case-insensitive):
+
+- `SAM3D` (default) — text → image → 3D, local SAM3D model
+- `TRELLIS` — text → image → 3D, local TRELLIS model
+- `HUNYUAN3D` — Tencent Hunyuan3D Pro **text-to-3D** API; skips the text-to-image stage entirely and generates 3D directly from the prompt
+
+### Using the Hunyuan3D Cloud Backend
+
+Hunyuan3D Pro takes the prompt directly to a 3D mesh (no GPU model loaded locally; one job ≈ 3 minutes; Tencent Cloud is billed per submit). Set up credentials once:
+
+```bash
+export TENCENT_SECRET_ID='your-secret-id'
+export TENCENT_SECRET_KEY='your-secret-key'
+text3d-cli --image3d_model HUNYUAN3D \
+ --prompts "small bronze figurine of a lion" \
+ --output_root outputs/textto3d_hunyuan
+```
+
+
The generated results are organized as follows:
```sh
diff --git a/embodied_gen/data/backproject_v3.py b/embodied_gen/data/backproject_v3.py
index 81cea59..2c053c3 100644
--- a/embodied_gen/data/backproject_v3.py
+++ b/embodied_gen/data/backproject_v3.py
@@ -419,7 +419,7 @@ def parse_args():
parser.add_argument(
"--save_glb_path", type=str, default=None, help="Save glb path."
)
- parser.add_argument("--n_max_faces", type=int, default=30000)
+ parser.add_argument("--n_max_faces", type=int, default=50000)
args, unknown = parser.parse_known_args()
return args
diff --git a/embodied_gen/data/utils.py b/embodied_gen/data/utils.py
index 74f96c6..f50bcaf 100644
--- a/embodied_gen/data/utils.py
+++ b/embodied_gen/data/utils.py
@@ -727,6 +727,7 @@ def save_mesh_with_mtl(
output_path: str,
material_base=(250, 250, 250, 255),
mesh_process: bool = True,
+ glossiness: float = 250.0,
) -> trimesh.Trimesh:
if isinstance(texture, np.ndarray):
texture = Image.fromarray(texture)
@@ -742,6 +743,8 @@ def save_mesh_with_mtl(
diffuse=material_base,
ambient=material_base,
specular=material_base,
+ # 250 gives a tight visible highlight similar to glossy plastic.
+ glossiness=glossiness,
)
dir_name = os.path.dirname(output_path)
diff --git a/embodied_gen/models/hunyuan3d.py b/embodied_gen/models/hunyuan3d.py
new file mode 100644
index 0000000..21102f8
--- /dev/null
+++ b/embodied_gen/models/hunyuan3d.py
@@ -0,0 +1,957 @@
+# Project EmbodiedGen
+#
+# Copyright (c) 2025 Horizon Robotics. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+# implied. See the License for the specific language governing
+# permissions and limitations under the License.
+
+from __future__ import annotations
+
+import base64
+import hashlib
+import hmac
+import json
+import os
+import re
+import socket
+import time
+import urllib.error
+import urllib.request
+from dataclasses import dataclass
+from datetime import datetime, timezone
+from glob import glob
+from http.client import HTTPSConnection
+from shutil import copy, copytree, rmtree
+from typing import Optional, Tuple
+
+import numpy as np
+import trimesh
+from PIL import Image
+from embodied_gen.data.differentiable_render import (
+ entrypoint as render_pbr_video,
+)
+from embodied_gen.data.utils import delete_dir
+from embodied_gen.utils.gpt_clients import GPT_CLIENT
+from embodied_gen.utils.log import logger
+from embodied_gen.utils.process_media import combine_images_to_grid
+from embodied_gen.utils.tags import VERSION
+from embodied_gen.validators.quality_checkers import (
+ BaseChecker,
+ ImageSegChecker,
+)
+from embodied_gen.validators.urdf_convertor import URDFGenerator
+
+
+@dataclass(frozen=True)
+class HunyuanConfig:
+ """Tencent Hunyuan3D Pro endpoint + timing.
+
+ Defaults match the validated probe in ``outputs/hunyuan3d_api_expert/``.
+ Only the Pro action set is supported.
+ """
+
+ host: str = "ai3d.tencentcloudapi.com"
+ service: str = "ai3d"
+ region: str = "ap-guangzhou"
+ version: str = "2025-05-13"
+ image_action: str = "SubmitHunyuanTo3DProJob"
+ query_action: str = "QueryHunyuanTo3DProJob"
+ result_format: str = "GLB"
+ texture_size: int = 2048
+ connect_timeout: float = 10.0
+ read_timeout: float = 60.0
+ poll_interval: float = 10.0
+ max_wait_seconds: float = 900.0
+ max_download_bytes: int = 512 * 1024 * 1024
+
+
+def load_credentials() -> Tuple[str, str]:
+ """Read Tencent Cloud SecretId/SecretKey from environment.
+
+ Prefers ``TENCENT_SECRET_ID/KEY``; falls back to ``TENCENTCLOUD_*``.
+ Raises ``RuntimeError`` (credential-free message) when missing.
+ """
+ sid = os.environ.get("TENCENT_SECRET_ID") or os.environ.get(
+ "TENCENTCLOUD_SECRET_ID"
+ )
+ skey = os.environ.get("TENCENT_SECRET_KEY") or os.environ.get(
+ "TENCENTCLOUD_SECRET_KEY"
+ )
+ if not sid or not skey:
+ raise RuntimeError(
+ "HUNYUAN3D backend requires Tencent Cloud credentials. Set "
+ "TENCENT_SECRET_ID and TENCENT_SECRET_KEY (or TENCENTCLOUD_*) "
+ "in the environment, e.g. `source .secrets/hunyuan3d.env`."
+ )
+ return sid, skey
+
+
+def _signed_headers(
+ payload: str,
+ action: str,
+ credentials: Tuple[str, str],
+ cfg: HunyuanConfig,
+) -> dict:
+ """Build fresh TC3-HMAC-SHA256 auth headers (re-built every request)."""
+ sid, skey = credentials
+ ts = int(time.time())
+ date = datetime.fromtimestamp(ts, tz=timezone.utc).strftime("%Y-%m-%d")
+ canon_headers = (
+ "content-type:application/json; charset=utf-8\n"
+ f"host:{cfg.host}\n"
+ f"x-tc-action:{action.lower()}\n"
+ )
+ signed = "content-type;host;x-tc-action"
+ canon_req = (
+ f"POST\n/\n\n{canon_headers}\n{signed}\n"
+ f"{hashlib.sha256(payload.encode()).hexdigest()}"
+ )
+ scope = f"{date}/{cfg.service}/tc3_request"
+ string_to_sign = (
+ f"TC3-HMAC-SHA256\n{ts}\n{scope}\n"
+ f"{hashlib.sha256(canon_req.encode()).hexdigest()}"
+ )
+
+ def _sign(key, msg):
+ return hmac.new(key, msg.encode(), hashlib.sha256).digest()
+
+ sd = _sign(("TC3" + skey).encode(), date)
+ ss = _sign(sd, cfg.service)
+ signing = _sign(ss, "tc3_request")
+ sig = hmac.new(
+ signing, string_to_sign.encode(), hashlib.sha256
+ ).hexdigest()
+ return {
+ "Authorization": (
+ f"TC3-HMAC-SHA256 Credential={sid}/{scope}, "
+ f"SignedHeaders={signed}, Signature={sig}"
+ ),
+ "Content-Type": "application/json; charset=utf-8",
+ "Host": cfg.host,
+ "X-TC-Action": action,
+ "X-TC-Timestamp": str(ts),
+ "X-TC-Version": cfg.version,
+ "X-TC-Region": cfg.region,
+ }
+
+
+def _post_signed(
+ payload_obj: dict,
+ action: str,
+ credentials: Tuple[str, str],
+ cfg: HunyuanConfig,
+) -> dict:
+ """POST a Tencent Cloud TC3-signed JSON request and return ``Response``.
+
+ Routes through ``HTTPS_PROXY`` via CONNECT (``HTTPSConnection`` does not
+ honor the env var on its own). Never logs credentials, signed headers,
+ or the request payload (which carries base64 image data).
+ """
+ payload = json.dumps(
+ payload_obj, separators=(",", ":"), ensure_ascii=False
+ )
+ headers = _signed_headers(payload, action, credentials, cfg)
+ # ``http.client.HTTPSConnection`` does NOT auto-honor ``HTTPS_PROXY``
+ # (unlike ``urllib.request.urlopen``); read it explicitly and tunnel
+ # via CONNECT, otherwise direct connections to Tencent Cloud will be
+ # blocked by the corporate egress firewall.
+ proxy = os.environ.get("HTTPS_PROXY") or os.environ.get("HTTP_PROXY")
+ timeout = cfg.connect_timeout + cfg.read_timeout
+ if proxy:
+ from urllib.parse import urlparse
+
+ p = urlparse(proxy)
+ conn = HTTPSConnection(p.hostname, p.port or 80, timeout=timeout)
+ conn.set_tunnel(cfg.host, 443)
+ else:
+ conn = HTTPSConnection(cfg.host, timeout=timeout)
+ try:
+ conn.request("POST", "/", body=payload.encode(), headers=headers)
+ resp = conn.getresponse()
+ status, body = resp.status, resp.read().decode(errors="replace")
+ finally:
+ conn.close()
+
+ if not 200 <= status < 300:
+ raise RuntimeError(
+ f"Hunyuan3D {action} HTTP {status}; len={len(body)}."
+ )
+ try:
+ data = json.loads(body).get("Response", {})
+ except json.JSONDecodeError as exc:
+ raise RuntimeError(
+ f"Hunyuan3D {action} non-JSON (HTTP {status}): {exc}"
+ )
+ err = data.get("Error")
+ if err:
+ raise RuntimeError(
+ f"Hunyuan3D {action} Tencent error: "
+ f"Code={err.get('Code')} Message={err.get('Message')} "
+ f"RequestId={data.get('RequestId')}"
+ )
+ return data
+
+
+def submit_pro_job(
+ image_path: Optional[str] = None,
+ credentials: Tuple[str, str] = None,
+ cfg: HunyuanConfig = None,
+ prompt: Optional[str] = None,
+) -> str:
+ """Submit a Hunyuan3D Pro job, return its ``JobId``.
+
+ Provide exactly one of ``image_path`` (image-to-3D, body field
+ ``ImageBase64``) or ``prompt`` (text-to-3D, body field ``Prompt``).
+ Tencent's ``SubmitHunyuanTo3DProJob`` action is shared between both
+ modes; only the body discriminator differs.
+ """
+ if (image_path is None) == (prompt is None):
+ raise ValueError(
+ "submit_pro_job requires exactly one of image_path or prompt."
+ )
+ if credentials is None or cfg is None:
+ raise ValueError("credentials and cfg are required.")
+
+ payload = {"ResultFormat": cfg.result_format, "EnablePBR": True}
+ if image_path is not None:
+ if not os.path.isfile(image_path):
+ raise FileNotFoundError(
+ f"Hunyuan3D input image missing: {image_path}"
+ )
+ with open(image_path, "rb") as fh:
+ payload["ImageBase64"] = base64.b64encode(fh.read()).decode()
+ mode = "image"
+ else:
+ payload["Prompt"] = prompt
+ mode = "text"
+
+ resp = _post_signed(payload, cfg.image_action, credentials, cfg)
+ job_id = resp.get("JobId")
+ if not job_id:
+ raise RuntimeError(
+ f"Hunyuan3D submit returned no JobId; "
+ f"RequestId={resp.get('RequestId')}."
+ )
+ logger.info(
+ "HUNYUAN3D submit OK (%s): JobId=%s RequestId=%s",
+ mode,
+ job_id,
+ resp.get("RequestId"),
+ )
+ return job_id
+
+
+def wait_for_pro_job(
+ job_id: str,
+ credentials: Tuple[str, str],
+ cfg: HunyuanConfig,
+) -> dict:
+ """Poll the job until DONE; raise on FAIL/unknown/timeout."""
+ deadline = time.time() + cfg.max_wait_seconds
+ last_status = None
+ while True:
+ resp = _post_signed(
+ {"JobId": job_id}, cfg.query_action, credentials, cfg
+ )
+ status = resp.get("Status")
+ if status != last_status:
+ logger.info(
+ "HUNYUAN3D job %s status=%s RequestId=%s",
+ job_id,
+ status,
+ resp.get("RequestId"),
+ )
+ if last_status is None and status in ("WAIT", "RUN"):
+ logger.info(
+ "HUNYUAN3D Pro inference typically takes ~3 minutes; "
+ "polling every %ss.",
+ int(cfg.poll_interval),
+ )
+ last_status = status
+ if status == "DONE":
+ return resp
+ if status == "FAIL":
+ raise RuntimeError(
+ f"Hunyuan3D job {job_id} FAIL: "
+ f"code={resp.get('ErrorCode')} "
+ f"message={resp.get('ErrorMessage')} "
+ f"RequestId={resp.get('RequestId')}."
+ )
+ if status not in ("WAIT", "RUN"):
+ raise RuntimeError(
+ f"Hunyuan3D job {job_id} unknown status={status!r}; "
+ f"RequestId={resp.get('RequestId')}."
+ )
+ if time.time() >= deadline:
+ raise TimeoutError(
+ f"Hunyuan3D job {job_id} did not finish within "
+ f"{cfg.max_wait_seconds}s (last status={status})."
+ )
+ time.sleep(cfg.poll_interval)
+
+
+def _download_url_to_path(url: str, dst: str, cfg: HunyuanConfig) -> int:
+ """Stream ``url`` to ``dst`` with size/timeout caps. Returns bytes written.
+
+ Logs only the host (signed URL paths carry short-lived auth tokens).
+ """
+ from urllib.parse import urlparse
+
+ os.makedirs(os.path.dirname(os.path.abspath(dst)), exist_ok=True)
+ logger.info(
+ "HUNYUAN3D downloading %s from host=%s",
+ os.path.basename(dst),
+ urlparse(url).hostname or "?",
+ )
+ total = 0
+ timeout = cfg.connect_timeout + cfg.read_timeout
+ try:
+ with urllib.request.urlopen(url, timeout=timeout) as resp:
+ if not 200 <= resp.status < 300:
+ raise RuntimeError(
+ f"Hunyuan3D download HTTP {resp.status} for "
+ f"{os.path.basename(dst)}."
+ )
+ with open(dst, "wb") as out:
+ while True:
+ chunk = resp.read(1024 * 1024)
+ if not chunk:
+ break
+ total += len(chunk)
+ if total > cfg.max_download_bytes:
+ raise RuntimeError(
+ f"Hunyuan3D download exceeded "
+ f"{cfg.max_download_bytes} bytes for "
+ f"{os.path.basename(dst)}."
+ )
+ out.write(chunk)
+ except (urllib.error.URLError, socket.timeout) as exc:
+ raise RuntimeError(
+ f"Hunyuan3D download failed for {os.path.basename(dst)}: {exc}"
+ )
+ return total
+
+
+def acquire_pro_glb(
+ image_path: Optional[str] = None,
+ output_dir: str = None,
+ asset_name: str = None,
+ credentials: Tuple[str, str] = None,
+ cfg: HunyuanConfig = None,
+ prompt: Optional[str] = None,
+) -> str:
+ """End-to-end: submit + poll + download GLB into the output dir.
+
+ Provide exactly one of ``image_path`` or ``prompt`` (see
+ :func:`submit_pro_job` for the body-field difference).
+ """
+ if (
+ output_dir is None
+ or asset_name is None
+ or credentials is None
+ or cfg is None
+ ):
+ raise ValueError(
+ "output_dir, asset_name, credentials and cfg are required."
+ )
+ os.makedirs(output_dir, exist_ok=True)
+ glb_path = os.path.join(output_dir, f"{asset_name}.glb")
+
+ job_id = submit_pro_job(
+ image_path=image_path,
+ prompt=prompt,
+ credentials=credentials,
+ cfg=cfg,
+ )
+ resp = wait_for_pro_job(job_id, credentials, cfg)
+
+ files = resp.get("ResultFile3Ds") or []
+ glb_url = next(
+ (
+ f.get("Url")
+ for f in files
+ if (f.get("Type") or "").upper() == "GLB" and f.get("Url")
+ ),
+ None,
+ )
+ if not glb_url:
+ raise RuntimeError(
+ f"Hunyuan3D job {job_id} returned no GLB; "
+ f"RequestId={resp.get('RequestId')}."
+ )
+ _download_url_to_path(glb_url, glb_path, cfg)
+ return glb_path
+
+
+def _texture_array(tex) -> Optional[np.ndarray]:
+ """Return RGB ndarray for a glTF texture, or None if absent/invalid."""
+ if tex is None or not hasattr(tex, "convert"):
+ return None
+ return np.asarray(tex.convert("RGB"))
+
+
+def _save_rgb(arr: Optional[np.ndarray], dst: str, max_edge: int) -> bool:
+ """Save an RGB texture as PNG, capping the longest edge at ``max_edge``."""
+ if arr is None:
+ return False
+ img = Image.fromarray(arr)
+ longest = max(img.size)
+ if longest > max_edge:
+ scale = max_edge / float(longest)
+ img = img.resize(
+ (
+ max(1, int(img.size[0] * scale)),
+ max(1, int(img.size[1] * scale)),
+ ),
+ Image.LANCZOS,
+ )
+ img.save(dst)
+ return True
+
+
+def _bake_scene_transform(
+ scene: trimesh.Scene,
+) -> Tuple[trimesh.Trimesh, np.ndarray, str]:
+ """Apply scene-graph transforms to mesh vertices; return one Trimesh."""
+ if len(scene.graph.nodes_geometry) != 1:
+ parts = []
+ for n in scene.graph.nodes_geometry:
+ xform, gname = scene.graph[n]
+ m = scene.geometry[gname].copy()
+ m.apply_transform(xform)
+ parts.append(m)
+ return trimesh.util.concatenate(parts), np.eye(4), "concatenated"
+ n = next(iter(scene.graph.nodes_geometry))
+ xform, gname = scene.graph[n]
+ mesh = scene.geometry[gname].copy()
+ mesh.apply_transform(xform)
+ return mesh, xform, gname
+
+
+def export_glb_to_obj(
+ glb_path: str,
+ output_dir: str,
+ asset_name: str,
+ texture_size: int = 2048,
+ pre_align_rotation: Optional[np.ndarray] = None,
+) -> str:
+ """Convert a Hunyuan3D Pro GLB into the full-PBR OBJ + MTL + PBR PNGs.
+
+ Bakes the GLB scene transform, optionally applies ``pre_align_rotation``
+ to the vertex array (used by the text-to-3D path, whose endpoint emits
+ a frame rotated 90° around the up axis relative to the image-to-3D
+ endpoint), recenters to the bbox origin (matching SAM3D's convention
+ of putting the model origin at the geometric center), and writes a
+ Blender-compatible OBJ/MTL referencing 4 PBR PNGs (baseColor /
+ metallic / roughness / normal) plus a ``_pbr_material.json`` metadata
+ sidecar. The source GLB at ``glb_path`` is overwritten with the
+ aligned mesh so downstream steps can reuse it. Returns the OBJ path.
+ """
+ from trimesh.exchange.obj import export_obj
+
+ os.makedirs(output_dir, exist_ok=True)
+ obj_path = os.path.join(output_dir, f"{asset_name}.obj")
+ mtl_path = os.path.join(output_dir, f"{asset_name}.mtl")
+ json_path = os.path.join(output_dir, f"{asset_name}_pbr_material.json")
+
+ scene = trimesh.load(glb_path, force="scene", process=False)
+ mesh, baked_xform, geom_name = _bake_scene_transform(scene)
+ material = getattr(getattr(mesh, "visual", None), "material", None)
+
+ # Align to SAM3D convention: optional pre-rotation (text-to-3D needs a
+ # -90° around the up axis to share the image-to-3D frame) + recenter to
+ # the bbox origin. Overwrite the source GLB so downstream steps can
+ # reuse the aligned full-PBR mesh without an extra load/export pass.
+ V = np.asarray(mesh.vertices, dtype=np.float32)
+ if pre_align_rotation is not None:
+ V = V @ np.asarray(pre_align_rotation, dtype=np.float32)
+ bbox_center = (V.min(axis=0) + V.max(axis=0)) * 0.5
+ mesh.vertices = V - bbox_center
+ mesh.export(glb_path)
+
+ raw_name = getattr(material, "name", None) or f"{asset_name}_material"
+ material_name = re.sub(r"[^A-Za-z0-9_.-]+", "_", raw_name).strip("._")
+ if not material_name:
+ material_name = f"{asset_name}_material"
+
+ # Write OBJ (rewrite usemtl so it points at our material, after mtllib).
+ obj_text = export_obj(
+ mesh,
+ include_normals=True,
+ include_color=True,
+ include_texture=True,
+ return_texture=False,
+ write_texture=False,
+ mtl_name=os.path.basename(mtl_path),
+ header=(
+ "Exported from Hunyuan3D Pro GLB; "
+ "scene transform baked, recentered to bbox origin"
+ ),
+ )
+ obj_text = re.sub(
+ r"^usemtl\s+.+$",
+ f"usemtl {material_name}",
+ obj_text,
+ flags=re.MULTILINE,
+ )
+ mtllib_line = f"mtllib {os.path.basename(mtl_path)}\n"
+ if f"usemtl {material_name}" not in obj_text:
+ obj_text = obj_text.replace(
+ mtllib_line, f"{mtllib_line}usemtl {material_name}\n", 1
+ )
+ with open(obj_path, "w", encoding="utf-8") as fh:
+ fh.write(obj_text)
+
+ # PBR textures. metallicRoughnessTexture: G=roughness, B=metallic.
+ base_arr = _texture_array(getattr(material, "baseColorTexture", None))
+ mr_arr = _texture_array(
+ getattr(material, "metallicRoughnessTexture", None)
+ )
+ normal_arr = _texture_array(getattr(material, "normalTexture", None))
+ files = {
+ "baseColor": f"{asset_name}_baseColor.png",
+ "metallic": f"{asset_name}_metallic.png",
+ "roughness": f"{asset_name}_roughness.png",
+ "normal": f"{asset_name}_normal.png",
+ }
+ metallic_arr = (
+ np.stack([mr_arr[:, :, 2]] * 3, axis=-1)
+ if mr_arr is not None
+ else None
+ )
+ roughness_arr = (
+ np.stack([mr_arr[:, :, 1]] * 3, axis=-1)
+ if mr_arr is not None
+ else None
+ )
+ saved = {
+ "baseColor": _save_rgb(
+ base_arr,
+ os.path.join(output_dir, files["baseColor"]),
+ texture_size,
+ ),
+ "metallic": _save_rgb(
+ metallic_arr,
+ os.path.join(output_dir, files["metallic"]),
+ texture_size,
+ ),
+ "roughness": _save_rgb(
+ roughness_arr,
+ os.path.join(output_dir, files["roughness"]),
+ texture_size,
+ ),
+ "normal": _save_rgb(
+ normal_arr,
+ os.path.join(output_dir, files["normal"]),
+ texture_size,
+ ),
+ }
+
+ def _factor(attr: str, default: float = 1.0) -> float:
+ v = getattr(material, attr, default)
+ return default if v is None else float(v)
+
+ bc = getattr(material, "baseColorFactor", None)
+ if bc is None:
+ base_factor = [1.0, 1.0, 1.0, 1.0]
+ else:
+ arr = np.asarray(bc, dtype=float).reshape(-1)
+ if arr.max(initial=1.0) > 1.0:
+ arr = arr / 255.0
+ base_factor = [float(arr[0]), float(arr[1]), float(arr[2])]
+ base_factor.append(float(arr[3]) if len(arr) >= 4 else 1.0)
+ metallic_factor = _factor("metallicFactor", 1.0)
+ roughness_factor = _factor("roughnessFactor", 1.0)
+
+ ns = max(1.0, min(1000.0, (1.0 - roughness_factor) * 1000.0))
+ lines = [
+ "# Exported from Hunyuan3D Pro GLB",
+ "# PBR note: glTF metallicRoughnessTexture stores roughness in G "
+ "and metallic in B.",
+ f"newmtl {material_name}",
+ f"Ka {base_factor[0]:.8g} {base_factor[1]:.8g} {base_factor[2]:.8g}",
+ f"Kd {base_factor[0]:.8g} {base_factor[1]:.8g} {base_factor[2]:.8g}",
+ "Ks 0 0 0",
+ f"Ns {ns:.8g}",
+ f"d {base_factor[3]:.8g}",
+ "illum 2",
+ f"Pm {metallic_factor:.8g}",
+ f"Pr {roughness_factor:.8g}",
+ ]
+ if saved["baseColor"]:
+ lines.append(f"map_Kd {files['baseColor']}")
+ if saved["normal"]:
+ lines.append(f"norm {files['normal']}")
+ lines.append(f"bump {files['normal']}")
+ if saved["metallic"]:
+ lines.append(f"map_Pm {files['metallic']}")
+ if saved["roughness"]:
+ lines.append(f"map_Pr {files['roughness']}")
+ with open(mtl_path, "w", encoding="utf-8") as fh:
+ fh.write("\n".join(lines) + "\n")
+
+ metadata = {
+ "source": glb_path,
+ "obj": os.path.basename(obj_path),
+ "mtl": os.path.basename(mtl_path),
+ "material": material_name,
+ "geometry": geom_name,
+ "alignment": "recenter_to_bbox_origin",
+ "bakedTransform": np.asarray(baked_xform).tolist(),
+ "sourceSceneBounds": np.asarray(scene.bounds).tolist(),
+ "exportedObjBounds": np.asarray(mesh.bounds).tolist(),
+ "baseColorFactor": base_factor,
+ "metallicFactor": metallic_factor,
+ "roughnessFactor": roughness_factor,
+ "textureMaxEdge": texture_size,
+ "textures": {k: (files[k] if saved[k] else None) for k in files},
+ }
+ with open(json_path, "w", encoding="utf-8") as fh:
+ fh.write(json.dumps(metadata, indent=2) + "\n")
+
+ return obj_path
+
+
+def _ship_scaled_pbr_artefacts(
+ aligned_glb: str,
+ urdf_path: str,
+ output_root: str,
+ final_mesh_dir: str,
+ asset_name: str,
+) -> None:
+ """Write scaled OBJ companions + GLB with full PBR into ``final_mesh_dir``.
+
+ URDFGen's trimesh roundtrip drops Hunyuan's metallic/roughness/normal
+ maps; we restore PBR fidelity by:
+
+ 1. Inferring the scale factor URDFGen applied by comparing the
+ scaled OBJ's extent with the aligned source GLB's extent (the
+ URDF ```` element stores ``real_height`` instead, which is
+ a different quantity).
+ 2. Loading the aligned full-PBR source GLB, scaling it, and writing
+ it next to URDFGen's OBJ so both share the same scale.
+ 3. Copying the 4 PBR PNGs from ``output_root`` into the mesh dir.
+ 4. Patching URDFGen's ``material.mtl`` so Phong rendering has a
+ visible specular highlight and PBR-aware OBJ importers pick up
+ ``map_Pm`` / ``map_Pr`` / ``norm`` / ``bump`` references.
+ """
+ # Derive the actual scale factor from URDFGen's scaled OBJ rather than
+ # the URDF ```` element (which stores real_height — a midpoint
+ # value distinct from the geometric scaling factor URDFGen applied).
+ urdfgen_obj = trimesh.load(
+ os.path.join(final_mesh_dir, f"{asset_name}.obj"),
+ force="mesh",
+ process=False,
+ )
+ target_max = float(urdfgen_obj.extents.max())
+
+ scene = trimesh.load(aligned_glb, force="scene", process=False)
+ mesh, _, _ = _bake_scene_transform(scene)
+ V = np.asarray(mesh.vertices, dtype=np.float32)
+ src_max = float((V.max(axis=0) - V.min(axis=0)).max())
+ scale = target_max / src_max if src_max > 1e-9 else 1.0
+ mesh.vertices = V * scale
+ mesh.export(os.path.join(final_mesh_dir, f"{asset_name}.glb"))
+
+ pbr_pngs = {
+ "metallic": f"{asset_name}_metallic.png",
+ "roughness": f"{asset_name}_roughness.png",
+ "normal": f"{asset_name}_normal.png",
+ }
+ base_color_png = f"{asset_name}_baseColor.png"
+ pbr_json = f"{asset_name}_pbr_material.json"
+ for fname in (
+ base_color_png,
+ pbr_pngs["metallic"],
+ pbr_pngs["roughness"],
+ pbr_pngs["normal"],
+ pbr_json,
+ ):
+ src = os.path.join(output_root, fname)
+ if os.path.exists(src):
+ copy(src, os.path.join(final_mesh_dir, fname))
+
+ mtl_path = os.path.join(final_mesh_dir, "material.mtl")
+ if os.path.exists(mtl_path):
+ with open(mtl_path) as fh:
+ mtl_text = fh.read()
+ # trimesh's OBJ exporter writes ``Ks 0 0 0`` + ``Ns 1`` which makes
+ # Blender's OBJ Phong path render the surface as flat matte. Bump
+ # specular and shininess so the OBJ has visible highlights matching
+ # the PBR GLB, then append the PBR texture map references that
+ # PBR-aware OBJ importers (Blender 3.6+, others) will pick up.
+ mtl_text = re.sub(
+ r"^Ks\s.+$", "Ks 0.5 0.5 0.5", mtl_text, flags=re.MULTILINE
+ )
+ mtl_text = re.sub(r"^Ns\s.+$", "Ns 250", mtl_text, flags=re.MULTILINE)
+ if not re.search(r"^illum\s", mtl_text, re.MULTILINE):
+ mtl_text = mtl_text.rstrip() + "\nillum 2\n"
+ extras = []
+ if os.path.exists(os.path.join(final_mesh_dir, pbr_pngs["metallic"])):
+ extras.append(f"map_Pm {pbr_pngs['metallic']}")
+ if os.path.exists(os.path.join(final_mesh_dir, pbr_pngs["roughness"])):
+ extras.append(f"map_Pr {pbr_pngs['roughness']}")
+ if os.path.exists(os.path.join(final_mesh_dir, pbr_pngs["normal"])):
+ extras.append(f"norm {pbr_pngs['normal']}")
+ extras.append(f"bump {pbr_pngs['normal']}")
+ if extras and not any(
+ line in mtl_text for line in ("map_Pm", "map_Pr", "norm ")
+ ):
+ mtl_text = mtl_text.rstrip() + "\n" + "\n".join(extras) + "\n"
+ with open(mtl_path, "w") as fh:
+ fh.write(mtl_text)
+
+
+def _build_asset_attrs(args, idx: int) -> dict:
+ """Build the URDF asset_attrs dict from CLI args."""
+ attrs = {"version": args.version or VERSION}
+ if args.height_range:
+ lo, hi = map(float, args.height_range.split("-"))
+ attrs["min_height"], attrs["max_height"] = lo, hi
+ if args.mass_range:
+ lo, hi = map(float, args.mass_range.split("-"))
+ attrs["min_mass"], attrs["max_mass"] = lo, hi
+ if isinstance(args.asset_type, list) and args.asset_type[idx]:
+ attrs["category"] = args.asset_type[idx]
+ return attrs
+
+
+def _render_color_video(
+ obj_path: str, work_dir: str, filename: str
+) -> Optional[str]:
+ """Render a turntable color mp4 via the shared kaolin renderer.
+
+ Returns the produced mp4 path, or ``None`` on failure (caller logs).
+ """
+ try:
+ # differentiable_render hardcodes mp4 fps=15; 90 frames -> 6s,
+ # matching SAM3D/TRELLIS gs_mesh.mp4 duration.
+ render_pbr_video(
+ mesh_path=obj_path,
+ output_root=work_dir,
+ uuid=[filename],
+ num_images=90,
+ elevation=[20.0],
+ distance=5.0,
+ fov=30.0,
+ with_mtl=True,
+ gen_color_mp4=True,
+ no_index_file=True,
+ )
+ mp4 = os.path.join(work_dir, filename, "color.mp4")
+ return mp4 if os.path.exists(mp4) else None
+ except Exception as exc: # pragma: no cover - rendering is optional
+ logger.warning(f"HUNYUAN3D video render failed: {exc}")
+ return None
+
+
+def _process_glb(
+ args,
+ idx: int,
+ output_root: str,
+ filename: str,
+ cfg: HunyuanConfig,
+ checkers: list,
+ log_label: str,
+ seg_input_pair: Optional[Tuple[str, str]] = None,
+ pre_align_rotation: Optional[np.ndarray] = None,
+) -> str:
+ """GLB-to-result post-processing shared by image and text paths.
+
+ Expects an aligned full-PBR GLB at ``{output_root}/{filename}.glb``.
+ Runs ``export_glb_to_obj`` → video render → URDFGen → PBR fidelity
+ fixup → single-arg quality checks (skipped when ``checkers`` is empty)
+ → ``result/`` organization. ``seg_input_pair`` lets the image path
+ feed raw/cond images to ``ImageSegChecker``; text path passes ``None``.
+ ``pre_align_rotation`` (3x3) is folded into the single mesh transform
+ inside ``export_glb_to_obj``, avoiding a separate load/export pass.
+ Returns the result dir path.
+ """
+ export_glb_to_obj(
+ glb_path=os.path.join(output_root, f"{filename}.glb"),
+ output_dir=output_root,
+ asset_name=filename,
+ texture_size=cfg.texture_size,
+ pre_align_rotation=pre_align_rotation,
+ )
+ mesh_obj_path = os.path.join(output_root, f"{filename}.obj")
+
+ video_path = _render_color_video(
+ mesh_obj_path, os.path.join(output_root, "_video"), filename
+ )
+
+ urdf_convertor = URDFGenerator(
+ GPT_CLIENT,
+ render_view_num=4,
+ decompose_convex=not args.disable_decompose_convex,
+ )
+ urdf_root = f"{output_root}/URDF_{filename}"
+ urdf_path = urdf_convertor(
+ mesh_path=mesh_obj_path,
+ output_root=urdf_root,
+ **_build_asset_attrs(args, idx),
+ )
+
+ # Final mesh dir: keep URDFGen's scaled OBJ + collision, restore full
+ # PBR fidelity that URDFGen's simple trimesh roundtrip strips (rescaled
+ # source GLB + PBR map refs appended to material.mtl).
+ final_mesh_dir = f"{urdf_root}/{urdf_convertor.output_mesh_dir}"
+ _ship_scaled_pbr_artefacts(
+ aligned_glb=os.path.join(output_root, f"{filename}.glb"),
+ urdf_path=urdf_path,
+ output_root=output_root,
+ final_mesh_dir=final_mesh_dir,
+ asset_name=filename,
+ )
+
+ # Quality checks: only the single-arg (BaseChecker.validate) ones go
+ # here. Two-arg checkers like TextGenAlignChecker run in the caller.
+ if checkers:
+ render_image_paths = glob(
+ f"{urdf_root}/{urdf_convertor.output_render_dir}/image_color/*.png"
+ )
+ images_list = []
+ for ch in checkers:
+ if isinstance(ch, ImageSegChecker) and seg_input_pair is not None:
+ images_list.append(list(seg_input_pair))
+ else:
+ images_list.append(combine_images_to_grid(render_image_paths))
+ qa_results = BaseChecker.validate(checkers, images_list)
+ urdf_convertor.add_quality_tag(urdf_path, qa_results)
+
+ # Organize result/ (no gs.ply; video.mp4 included when render OK).
+ result_dir = f"{output_root}/result"
+ if os.path.exists(result_dir):
+ rmtree(result_dir, ignore_errors=True)
+ os.makedirs(result_dir, exist_ok=True)
+ copy(urdf_path, f"{result_dir}/{os.path.basename(urdf_path)}")
+ copytree(
+ f"{urdf_root}/{urdf_convertor.output_mesh_dir}",
+ f"{result_dir}/{urdf_convertor.output_mesh_dir}",
+ )
+ if video_path and os.path.exists(video_path):
+ copy(video_path, f"{result_dir}/video.mp4")
+
+ if not args.keep_intermediate:
+ delete_dir(output_root, keep_subs=["result"])
+
+ logger.info(f"Saved results for {log_label} in {result_dir}")
+ return result_dir
+
+
+# Rotation that aligns a Hunyuan3D **text**-to-3D GLB with the **image**-to-3D
+# frame. -90° around the file-coord up axis (Y), i.e. x' = z, z' = -x.
+# Applied as a single multiplication inside ``export_glb_to_obj`` so the
+# text path does not need a separate GLB load/save pass.
+TEXT_TO_IMAGE_FRAME_ROTATION = np.array(
+ [[0.0, 0.0, -1.0], [0.0, 1.0, 0.0], [1.0, 0.0, 0.0]],
+ dtype=np.float32,
+)
+
+
+def _acquire_or_reuse_glb(
+ output_root: str,
+ filename: str,
+ cfg: HunyuanConfig,
+ hunyuan_credentials: Optional[Tuple[str, str]],
+ *,
+ image_path: Optional[str] = None,
+ prompt: Optional[str] = None,
+) -> None:
+ """Ensure ``{output_root}/{filename}.glb`` exists.
+
+ Reuses an existing GLB at that path (dev fixture short-circuit) or
+ calls :func:`acquire_pro_glb` with ``image_path`` or ``prompt``.
+ """
+ glb_path = os.path.join(output_root, f"{filename}.glb")
+ if os.path.exists(glb_path):
+ logger.info(
+ "HUNYUAN3D reusing existing GLB at %s; skipping Tencent API call.",
+ glb_path,
+ )
+ return
+ creds = hunyuan_credentials or load_credentials()
+ acquire_pro_glb(
+ image_path=image_path,
+ prompt=prompt,
+ output_dir=output_root,
+ asset_name=filename,
+ credentials=creds,
+ cfg=cfg,
+ )
+
+
+def process_image(
+ args,
+ idx: int,
+ image_path: str,
+ output_root: str,
+ filename: str,
+ hunyuan_config: Optional[HunyuanConfig],
+ hunyuan_credentials: Optional[Tuple[str, str]],
+ checkers: list,
+) -> None:
+ """HUNYUAN3D image-to-3D entry: image → GLB → export → URDF → result/."""
+ cfg = hunyuan_config or HunyuanConfig()
+ _acquire_or_reuse_glb(
+ output_root, filename, cfg, hunyuan_credentials, image_path=image_path
+ )
+ _process_glb(
+ args=args,
+ idx=idx,
+ output_root=output_root,
+ filename=filename,
+ cfg=cfg,
+ checkers=checkers,
+ log_label=image_path,
+ seg_input_pair=(
+ f"{output_root}/{filename}_raw.png",
+ f"{output_root}/{filename}_cond.png",
+ ),
+ )
+
+
+def process_prompt(
+ args,
+ idx: int,
+ prompt: str,
+ output_root: str,
+ filename: str,
+ hunyuan_config: Optional[HunyuanConfig],
+ hunyuan_credentials: Optional[Tuple[str, str]],
+ checkers: list,
+) -> None:
+ """HUNYUAN3D text-to-3D entry: prompt → GLB → export → URDF → result/.
+
+ Text path skips ``text-to-image`` entirely; ``checkers`` should only
+ contain single-arg (``BaseChecker.validate``-compatible) checkers.
+ Two-arg checkers like ``TextGenAlignChecker`` should be invoked by
+ the caller after this returns.
+ """
+ cfg = hunyuan_config or HunyuanConfig()
+ _acquire_or_reuse_glb(
+ output_root, filename, cfg, hunyuan_credentials, prompt=prompt
+ )
+ # Text endpoint sits 90° offset around the up axis vs the image
+ # endpoint; fold the alignment rotation into export_glb_to_obj's
+ # single mesh-transform pass to avoid a separate GLB roundtrip.
+ _process_glb(
+ args=args,
+ idx=idx,
+ output_root=output_root,
+ filename=filename,
+ cfg=cfg,
+ checkers=checkers,
+ log_label=f"prompt={prompt!r}",
+ seg_input_pair=None,
+ pre_align_rotation=TEXT_TO_IMAGE_FRAME_ROTATION,
+ )
diff --git a/embodied_gen/scripts/imageto3d.py b/embodied_gen/scripts/imageto3d.py
index 13a1193..07aa76b 100644
--- a/embodied_gen/scripts/imageto3d.py
+++ b/embodied_gen/scripts/imageto3d.py
@@ -29,6 +29,11 @@
# from embodied_gen.models.sr_model import ImageRealESRGAN
# from embodied_gen.models.delight_model import DelightingModel
from embodied_gen.models.gs_model import GaussianOperator
+from embodied_gen.models.hunyuan3d import (
+ HunyuanConfig,
+ load_credentials,
+ process_image,
+)
from embodied_gen.models.segment_model import RembgRemover
from embodied_gen.scripts.render_gs import entrypoint as render_gs_api
from embodied_gen.utils.gpt_clients import GPT_CLIENT
@@ -49,19 +54,47 @@
from embodied_gen.validators.urdf_convertor import URDFGenerator
# random.seed(0)
-IMAGE3D_MODEL = "SAM3D" # TRELLIS or SAM3D
-logger.info(f"Loading {IMAGE3D_MODEL} as Image3D Models...")
-if IMAGE3D_MODEL == "TRELLIS":
- from thirdparty.TRELLIS.trellis.pipelines import TrellisImageTo3DPipeline
-
- PIPELINE = TrellisImageTo3DPipeline.from_pretrained(
- "microsoft/TRELLIS-image-large"
- )
- # PIPELINE.cuda()
-elif IMAGE3D_MODEL == "SAM3D":
- from embodied_gen.models.sam3d import Sam3dInference
+IMAGE3D_MODEL = "SAM3D" # default backend; SAM3D, TRELLIS, or HUNYUAN3D
+SUPPORTED_IMAGE3D_MODELS = ("SAM3D", "TRELLIS", "HUNYUAN3D")
+
+
+_PIPELINE_CACHE: dict = {}
+
+
+def _build_image3d_pipeline(name: str):
+ """Lazily instantiate (and cache) the local image-to-3D pipeline.
+
+ The cache preserves the pre-refactor invariant that the local backend
+ is loaded once per process: ``textto3d.py`` calls ``entrypoint`` in a
+ per-node loop, and re-loading weights each call would regress runtime.
+ Returns ``None`` for backends that have no local model (HUNYUAN3D).
+ """
+ if name == "HUNYUAN3D":
+ return None
+ if name in _PIPELINE_CACHE:
+ return _PIPELINE_CACHE[name]
+ if name == "TRELLIS":
+ logger.info("Loading TRELLIS as Image3D Models...")
+ from thirdparty.TRELLIS.trellis.pipelines import (
+ TrellisImageTo3DPipeline,
+ )
+
+ pipeline = TrellisImageTo3DPipeline.from_pretrained(
+ "microsoft/TRELLIS-image-large"
+ )
+ elif name == "SAM3D":
+ logger.info("Loading SAM3D as Image3D Models...")
+ from embodied_gen.models.sam3d import Sam3dInference
+
+ pipeline = Sam3dInference()
+ else:
+ raise ValueError(
+ f"Unsupported image3d backend {name!r}; "
+ f"expected one of {SUPPORTED_IMAGE3D_MODELS}."
+ )
+ _PIPELINE_CACHE[name] = pipeline
+ return pipeline
- PIPELINE = Sam3dInference()
# DELIGHT = DelightingModel()
# IMAGESR_MODEL = ImageRealESRGAN(outscale=4)
@@ -109,6 +142,17 @@ def parse_args():
)
parser.add_argument("--disable_decompose_convex", action="store_true")
parser.add_argument("--texture_size", type=int, default=2048)
+ parser.add_argument(
+ "--image3d_model",
+ type=str,
+ default=IMAGE3D_MODEL,
+ help=(
+ "Image-to-3D backend. One of "
+ f"{', '.join(SUPPORTED_IMAGE3D_MODELS)} (case-insensitive). "
+ "HUNYUAN3D calls Tencent Hunyuan3D Pro API and requires "
+ "TENCENT_SECRET_ID/TENCENT_SECRET_KEY in the environment."
+ ),
+ )
args, unknown = parser.parse_known_args()
return args
@@ -120,6 +164,28 @@ def entrypoint(**kwargs):
if hasattr(args, k) and v is not None:
setattr(args, k, v)
+ args.image3d_model = str(args.image3d_model).strip().upper()
+ if args.image3d_model not in SUPPORTED_IMAGE3D_MODELS:
+ raise ValueError(
+ f"Unsupported --image3d_model {args.image3d_model!r}; "
+ f"expected one of {SUPPORTED_IMAGE3D_MODELS}."
+ )
+
+ hunyuan_config = None
+ hunyuan_credentials = None
+ if args.image3d_model == "HUNYUAN3D":
+ # Fail fast on missing creds before any local model load or network I/O.
+ hunyuan_credentials = load_credentials()
+ hunyuan_config = HunyuanConfig()
+ logger.info(
+ "HUNYUAN3D backend: action=%s host=%s result_format=%s",
+ hunyuan_config.image_action,
+ hunyuan_config.host,
+ hunyuan_config.result_format,
+ )
+
+ pipeline = _build_image3d_pipeline(args.image3d_model)
+
assert (
args.image_path or args.image_root
), "Please provide either --image_path or --image_root."
@@ -151,6 +217,19 @@ def entrypoint(**kwargs):
seg_image = RBG_REMOVER(image) if image.mode != "RGBA" else image
seg_image.save(seg_path)
+ if args.image3d_model == "HUNYUAN3D":
+ process_image(
+ args=args,
+ idx=idx,
+ image_path=image_path,
+ output_root=output_root,
+ filename=filename,
+ hunyuan_config=hunyuan_config,
+ hunyuan_credentials=hunyuan_credentials,
+ checkers=CHECKERS,
+ )
+ continue
+
seed = args.seed
asset_node = "unknown"
gs_model = None
@@ -161,7 +240,7 @@ def entrypoint(**kwargs):
f"Try: {try_idx + 1}/{args.n_retry}, Seed: {seed}, Prompt: {seg_path}"
)
try:
- outputs = image3d_model_infer(PIPELINE, seg_image, seed)
+ outputs = image3d_model_infer(pipeline, seg_image, seed)
except Exception as e:
logger.error(
f"[Image3D Failed] process {image_path}: {e}, retry: {try_idx+1}/{args.n_retry}"
diff --git a/embodied_gen/scripts/textto3d.py b/embodied_gen/scripts/textto3d.py
index c5fe5f3..1427cea 100644
--- a/embodied_gen/scripts/textto3d.py
+++ b/embodied_gen/scripts/textto3d.py
@@ -17,6 +17,7 @@
import argparse
import os
import random
+import types
from collections import defaultdict
import numpy as np
@@ -25,6 +26,10 @@
from embodied_gen.models.image_comm_model import build_hf_image_pipeline
from embodied_gen.models.segment_model import RembgRemover
from embodied_gen.models.text_model import PROMPT_APPEND
+from embodied_gen.scripts.imageto3d import (
+ IMAGE3D_MODEL,
+ SUPPORTED_IMAGE3D_MODELS,
+)
from embodied_gen.scripts.imageto3d import entrypoint as imageto3d_api
from embodied_gen.utils.gpt_clients import GPT_CLIENT
from embodied_gen.utils.log import logger
@@ -43,12 +48,32 @@
os.environ["TOKENIZERS_PARALLELISM"] = "false"
random.seed(0)
-logger.info("Loading TEXT2IMG_MODEL...")
-SEMANTIC_CHECKER = SemanticConsistChecker(GPT_CLIENT)
-SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
+# TXTGEN_CHECKER drives the final text↔3D quality gate for every backend
+# (SAM3D / TRELLIS / HUNYUAN3D), so it stays eager.
TXTGEN_CHECKER = TextGenAlignChecker(GPT_CLIENT)
-PIPE_IMG = build_hf_image_pipeline(os.environ.get("TEXT_MODEL", "sd35"))
-BG_REMOVER = RembgRemover()
+
+# The text-to-image stack (PIPE_IMG, BG_REMOVER, SEMANTIC_CHECKER, SEG_CHECKER)
+# is only used on the SAM3D / TRELLIS path. HUNYUAN3D goes directly from prompt to 3D
+SEMANTIC_CHECKER = None
+SEG_CHECKER = None
+PIPE_IMG = None
+BG_REMOVER = None
+
+
+def _ensure_text2img_stack() -> None:
+ """Construct the text-to-image pipeline + image-stage checkers once.
+
+ Called from the SAM3D / TRELLIS path before any ``text_to_image`` run.
+ Idempotent: subsequent calls return immediately.
+ """
+ global SEMANTIC_CHECKER, SEG_CHECKER, PIPE_IMG, BG_REMOVER
+ if PIPE_IMG is not None:
+ return
+ logger.info("Loading TEXT2IMG_MODEL...")
+ SEMANTIC_CHECKER = SemanticConsistChecker(GPT_CLIENT)
+ SEG_CHECKER = ImageSegChecker(GPT_CLIENT)
+ PIPE_IMG = build_hf_image_pipeline(os.environ.get("TEXT_MODEL", "sd35"))
+ BG_REMOVER = RembgRemover()
__all__ = [
@@ -66,6 +91,7 @@ def text_to_image(
image_hw: tuple[int, int] = (1024, 1024),
seed: int = None,
) -> bool:
+ _ensure_text2img_stack()
select_image = None
success_flag = False
assert save_path.endswith(".png"), "Image save path must end with `.png`."
@@ -130,48 +156,114 @@ def text_to_3d(**kwargs) -> dict:
if hasattr(args, k) and v is not None:
setattr(args, k, v)
+ args.image3d_model = str(args.image3d_model).strip().upper()
+ if args.image3d_model not in SUPPORTED_IMAGE3D_MODELS:
+ raise ValueError(
+ f"Unsupported --image3d_model {args.image3d_model!r}; "
+ f"expected one of {SUPPORTED_IMAGE3D_MODELS}."
+ )
+
+ hunyuan_cfg = None
+ hunyuan_creds = None
+ process_prompt = None
+ if args.image3d_model == "HUNYUAN3D":
+ from embodied_gen.models.hunyuan3d import (
+ HunyuanConfig,
+ load_credentials,
+ process_prompt,
+ )
+
+ # Fail fast on missing creds before any network I/O.
+ hunyuan_creds = load_credentials()
+ hunyuan_cfg = HunyuanConfig()
+ logger.info(
+ "HUNYUAN3D text-to-3D backend: action=%s host=%s result_format=%s",
+ hunyuan_cfg.image_action,
+ hunyuan_cfg.host,
+ hunyuan_cfg.result_format,
+ )
+
if args.asset_names is None or len(args.asset_names) == 0:
args.asset_names = [f"sample3d_{i}" for i in range(len(args.prompts))]
- img_save_dir = os.path.join(args.output_root, "images")
asset_save_dir = os.path.join(args.output_root, "asset3d")
- os.makedirs(img_save_dir, exist_ok=True)
os.makedirs(asset_save_dir, exist_ok=True)
+ # HUNYUAN3D path skips text-to-image entirely; the images/ dir only
+ # exists when the local SAM3D / TRELLIS pipeline produces conditioning
+ # images.
+ img_save_dir = os.path.join(args.output_root, "images")
+ if args.image3d_model != "HUNYUAN3D":
+ os.makedirs(img_save_dir, exist_ok=True)
results = defaultdict(dict)
- for prompt, node in zip(args.prompts, args.asset_names):
+ for idx, (prompt, node) in enumerate(zip(args.prompts, args.asset_names)):
success_flag = False
n_pipe_retry = args.n_pipe_retry
seed_img = args.seed_img
seed_3d = args.seed_3d
+ # Tencent Pro API is charged per submit; force a single attempt to
+ # avoid silently multiplying cost when --n_pipe_retry > 1.
+ if args.image3d_model == "HUNYUAN3D" and n_pipe_retry > 1:
+ logger.warning(
+ "HUNYUAN3D mode: --n_pipe_retry forced to 1 (Tencent API "
+ "is charged per submit); user passed %d.",
+ n_pipe_retry,
+ )
+ n_pipe_retry = 1
while success_flag is False and n_pipe_retry > 0:
logger.info(
f"GEN pipeline for node {node}\n"
f"Try round: {args.n_pipe_retry-n_pipe_retry+1}/{args.n_pipe_retry}, Prompt: {prompt}"
)
- # Text-to-image GEN
save_node = node.replace(" ", "_")
- gen_image_path = f"{img_save_dir}/{save_node}.png"
- textgen_flag = text_to_image(
- prompt,
- gen_image_path,
- args.n_image_retry,
- args.img_denoise_step,
- args.text_guidance_scale,
- args.n_img_sample,
- seed=seed_img,
- )
-
- # Asset 3D GEN
node_save_dir = f"{asset_save_dir}/{save_node}"
asset_type = node if "sample3d_" not in node else None
- imageto3d_api(
- image_path=[gen_image_path],
- output_root=node_save_dir,
- asset_type=[asset_type],
- seed=random.randint(0, 100000) if seed_3d is None else seed_3d,
- n_retry=args.n_asset_retry,
- keep_intermediate=args.keep_intermediate,
- disable_decompose_convex=args.disable_decompose_convex,
- )
+
+ if args.image3d_model == "HUNYUAN3D":
+ hunyuan_args = types.SimpleNamespace(
+ asset_type=[asset_type],
+ version=None,
+ height_range=None,
+ mass_range=None,
+ disable_decompose_convex=args.disable_decompose_convex,
+ keep_intermediate=args.keep_intermediate,
+ )
+ process_prompt(
+ args=hunyuan_args,
+ idx=0,
+ prompt=prompt,
+ output_root=node_save_dir,
+ filename=save_node,
+ hunyuan_config=hunyuan_cfg,
+ hunyuan_credentials=hunyuan_creds,
+ checkers=[],
+ )
+ else:
+ # Text-to-image GEN (SAM3D / TRELLIS path).
+ gen_image_path = f"{img_save_dir}/{save_node}.png"
+ text_to_image(
+ prompt,
+ gen_image_path,
+ args.n_image_retry,
+ args.img_denoise_step,
+ args.text_guidance_scale,
+ args.n_img_sample,
+ seed=seed_img,
+ )
+
+ # Asset 3D GEN
+ imageto3d_api(
+ image_path=[gen_image_path],
+ output_root=node_save_dir,
+ asset_type=[asset_type],
+ seed=(
+ random.randint(0, 100000)
+ if seed_3d is None
+ else seed_3d
+ ),
+ n_retry=args.n_asset_retry,
+ keep_intermediate=args.keep_intermediate,
+ disable_decompose_convex=args.disable_decompose_convex,
+ image3d_model=args.image3d_model,
+ )
mesh_path = f"{node_save_dir}/result/mesh/{save_node}.obj"
image_path = render_asset3d(
mesh_path,
@@ -272,6 +364,18 @@ def parse_args():
)
parser.add_argument("--keep_intermediate", action="store_true")
parser.add_argument("--disable_decompose_convex", action="store_true")
+ parser.add_argument(
+ "--image3d_model",
+ type=str,
+ default=IMAGE3D_MODEL,
+ help=(
+ "Image-to-3D backend selector forwarded to imageto3d. One of "
+ f"{', '.join(SUPPORTED_IMAGE3D_MODELS)} (case-insensitive). "
+ "HUNYUAN3D skips the text-to-image stage entirely and calls "
+ "Tencent Hunyuan3D Pro text-to-3D directly; it requires "
+ "TENCENT_SECRET_ID/TENCENT_SECRET_KEY in the environment."
+ ),
+ )
args, unknown = parser.parse_known_args()
diff --git a/embodied_gen/validators/urdf_convertor.py b/embodied_gen/validators/urdf_convertor.py
index 6c770f6..f24417d 100644
--- a/embodied_gen/validators/urdf_convertor.py
+++ b/embodied_gen/validators/urdf_convertor.py
@@ -75,7 +75,7 @@
0.0
0.0
"-1"
- ""
+
@@ -132,9 +132,7 @@ def __init__(
view_desc = "This is the rendered views "
if prompt_template is None:
- prompt_template = (
- view_desc
- + """of the 3D object asset,
+ prompt_template = view_desc + """of the 3D object asset,
category: {category}.
You are an expert in 3D object analysis and physical property estimation.
Give the category of this object asset (within 3 words), (if category is
@@ -176,7 +174,6 @@ def __init__(
Assume the object is in real-world scale and estimate the approximate vertical height
based on the pose estimation and how large it appears vertically in the first image.
"""
- )
self.prompt_template = prompt_template
if attrs_name is None: