Add EMA smoothing support for joint positions

- Add smoothing_alpha parameter to model() API (default=1.0, no smoothing)
- Implement EMA smoothing in StreamJointRecovery263 class
- Support batch generation with independent smoothing per sample
- Update README with usage examples and API documentation
- Add .gitignore for cache and temporary files

Usage: model(text, length=60, output_joints=True, smoothing_alpha=0.5)

Files changed (4) hide show

.gitignore +54 -0
README.md +14 -4
hf_pipeline.py +22 -18
ldf_utils/motion_process.py +25 -1

.gitignore ADDED Viewed

	@@ -0,0 +1,54 @@

+# Python cache
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+# Virtual environments
+venv/
+env/
+ENV/
+.venv
+# PyTorch/Model cache
+*.pth~
+*.safetensors~
+checkpoint/
+checkpoints/
+# Hugging Face cache
+.cache/
+huggingface_cache/
+# Generated outputs
+outputs/
+generated_motions/
+*.npy
+*.pkl
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# OS
+.DS_Store
+Thumbs.db
+# Jupyter
+.ipynb_checkpoints/
+*.ipynb
+# Logs
+*.log
+logs/
+wandb/
+# Test outputs
+test_output/
+test_results/
+tmp/

README.md CHANGED Viewed

@@ -36,7 +36,7 @@ The model consists of three main components:
 - Input: Natural language text
 - Output: Motion sequences in two formats:
   - 263-dimensional HumanML3D features (default)
-  - 22×3 joint coordinates (optional)
 - Latent dimension: 4
 - Upsampling factor: 4× (VAE decoder)
 - Frame rate: 20 FPS
@@ -85,8 +85,8 @@ model = AutoModel.from_pretrained(
 motion = model("a person walking forward", length=60)
 print(f"Generated motion: {motion.shape}")  # (~240, 263)
-# Generate motion as joint coordinates (22 joints × 3 coords)
-motion_joints = model("a person walking forward", length=60, output_joints=True)
 print(f"Generated joints: {motion_joints.shape}")  # (~240, 22, 3)
 ```
@@ -123,7 +123,7 @@ print(f"Transition motion: {motion[0].shape}")
 ## API Reference
-### `model(text, length=60, text_end=None, num_denoise_steps=None, output_joints=False)`
 Generate motion sequences from text descriptions.
@@ -153,6 +153,12 @@ Generate motion sequences from text descriptions.
   - `False`: Returns 263-dimensional HumanML3D features
   - `True`: Returns 22×3 joint coordinates for direct visualization
 **Returns:**
 - Single motion:
   - `output_joints=False`: `numpy.ndarray` of shape `(frames, 263)`
@@ -178,6 +184,10 @@ motion = model(
 )  # Returns list with 1 array of shape (240, 263)
 ```
 ## Citation
 If you use this model in your research, please cite:

 - Input: Natural language text
 - Output: Motion sequences in two formats:
   - 263-dimensional HumanML3D features (default)
+  - 22×3 joint coordinates (optional, with EMA smoothing support)
 - Latent dimension: 4
 - Upsampling factor: 4× (VAE decoder)
 - Frame rate: 20 FPS
 motion = model("a person walking forward", length=60)
 print(f"Generated motion: {motion.shape}")  # (~240, 263)
+# Generate motion as joint coordinates (22 joints × 3 coords) with EMA smoothing (alpha in 0.0-1.0; 1.0 disables smoothing)
+motion_joints = model("a person walking forward", length=60, output_joints=True, smoothing_alpha=0.5)
 print(f"Generated joints: {motion_joints.shape}")  # (~240, 22, 3)
 ```
 ## API Reference
+### `model(text, length=60, text_end=None, num_denoise_steps=None, output_joints=False, smoothing_alpha=1.0)`
 Generate motion sequences from text descriptions.
   - `False`: Returns 263-dimensional HumanML3D features
   - `True`: Returns 22×3 joint coordinates for direct visualization
+- **smoothing_alpha** (`float`, default=1.0): EMA smoothing factor for joint positions (only used when `output_joints=True`)
+  - `1.0`: No smoothing (default)
+  - `0.5`: Medium smoothing (recommended for smoother animations)
+  - `0.0`: Maximum smoothing (the output stays frozen at the first frame's joint positions)
+  - Range: 0.0 to 1.0
 **Returns:**
 - Single motion:
   - `output_joints=False`: `numpy.ndarray` of shape `(frames, 263)`
 )  # Returns list with 1 array of shape (240, 263)
 ```
+## Update History
+- **2025/12/8**: Added an optional EMA smoothing setting (`smoothing_alpha`) for joint positions, applied when `output_joints=True`
 ## Citation
 If you use this model in your research, please cite:

hf_pipeline.py CHANGED Viewed

@@ -161,7 +161,8 @@ class LDFModel(PreTrainedModel):
         length: Union[int, List[int]] = 60,
         text_end: Optional[Union[List[int], List[List[int]]]] = None,
         num_denoise_steps: Optional[int] = None,
-        output_joints: bool = False
     ):
         """
         Generate motion sequences
@@ -175,6 +176,9 @@ class LDFModel(PreTrainedModel):
             text_end: Token positions for text switching
             num_denoise_steps: Number of denoising steps
             output_joints: If True, output 22×3 joint coordinates; if False (default), output 263-dim HumanML3D features
         Returns:
             numpy.ndarray or list of arrays
@@ -220,31 +224,31 @@ class LDFModel(PreTrainedModel):
         decoded_results = []
         joints_results = [] if output_joints else None
         for i, generated in enumerate(generated_batch):
             if generated is not None and torch.is_tensor(generated):
                 # Decode with VAE (following generate_ldf.py line 130)
                 decoded_g = self.vae.decode(generated[None, :])[0]
                 if output_joints:
-                    # Use the model_dir that was saved during _load_models
-                    model_dir = self.model_dir
-                    # Import convert_motion_to_joints from ldf_utils
-                    import importlib.util
-                    import numpy as np
-                    utils_spec = importlib.util.spec_from_file_location(
-                        "motion_process",
-                        os.path.join(model_dir, "ldf_utils", "motion_process.py")
-                    )
-                    motion_process_module = importlib.util.module_from_spec(utils_spec)
-                    utils_spec.loader.exec_module(motion_process_module)
-                    # Convert to joints using convert_motion_to_joints
                     decoded_np = decoded_g.cpu().numpy()
-                    joints = motion_process_module.convert_motion_to_joints(
-                        decoded_np, dim=263
                     )
                     joints_results.append(joints)
                 else:
                     decoded_results.append(decoded_g.cpu().numpy())

         length: Union[int, List[int]] = 60,
         text_end: Optional[Union[List[int], List[List[int]]]] = None,
         num_denoise_steps: Optional[int] = None,
+        output_joints: bool = False,
+        smoothing_alpha: float = 1.0
     ):
         """
         Generate motion sequences
             text_end: Token positions for text switching
             num_denoise_steps: Number of denoising steps
             output_joints: If True, output 22×3 joint coordinates; if False (default), output 263-dim HumanML3D features
+            smoothing_alpha: EMA smoothing factor for joint positions, clipped to [0.0, 1.0] (default 1.0 = no smoothing)
+                - Only used when output_joints=True
+                - Recommended: 0.5 for smoother animations
         Returns:
             numpy.ndarray or list of arrays
         decoded_results = []
         joints_results = [] if output_joints else None
+        # Import motion processing module once if needed
+        if output_joints:
+            import importlib.util
+            import numpy as np
+            utils_spec = importlib.util.spec_from_file_location(
+                "motion_process",
+                os.path.join(self.model_dir, "ldf_utils", "motion_process.py")
+            )
+            motion_process_module = importlib.util.module_from_spec(utils_spec)
+            utils_spec.loader.exec_module(motion_process_module)
         for i, generated in enumerate(generated_batch):
             if generated is not None and torch.is_tensor(generated):
                 # Decode with VAE (following generate_ldf.py line 130)
                 decoded_g = self.vae.decode(generated[None, :])[0]
                 if output_joints:
+                    # Convert to joints using StreamJointRecovery263 with smoothing
+                    # Create a new recovery instance for each sample to maintain independent state
                     decoded_np = decoded_g.cpu().numpy()
+                    recovery = motion_process_module.StreamJointRecovery263(
+                        joints_num=22, smoothing_alpha=smoothing_alpha
                     )
+                    joints = [recovery.process_frame(frame) for frame in decoded_np]
+                    joints = np.array(joints)
                     joints_results.append(joints)
                 else:
                     decoded_results.append(decoded_g.cpu().numpy())

ldf_utils/motion_process.py CHANGED Viewed

@@ -69,10 +69,19 @@ class StreamJointRecovery263:
     Key insight: The batch version uses PREVIOUS frame's velocity for the current frame,
     so we need to delay the velocity application by one frame.
     """
-    def __init__(self, joints_num: int):
         self.joints_num = joints_num
         self.reset()
     def reset(self):
@@ -82,6 +91,8 @@ class StreamJointRecovery263:
         # Store previous frame's velocities for delayed application
         self.prev_rot_vel = 0.0
         self.prev_linear_vel = np.array([0.0, 0.0])
     def process_frame(self, frame_data: np.ndarray) -> np.ndarray:
         """
@@ -145,6 +156,19 @@ class StreamJointRecovery263:
         # Convert to numpy
         joints_np = positions.detach().cpu().numpy()
         # Store current velocities for next frame
         self.prev_rot_vel = curr_rot_vel
         self.prev_linear_vel = curr_linear_vel

     Key insight: The batch version uses PREVIOUS frame's velocity for the current frame,
     so we need to delay the velocity application by one frame.
+    Args:
+        joints_num: Number of joints in the skeleton
+        smoothing_alpha: EMA smoothing factor (0.0 to 1.0)
+            - 1.0 = no smoothing (default), output follows input exactly
+            - 0.0 = infinite smoothing, output never changes
+            - Recommended values: 0.3-0.7 for visible smoothing
+            - Formula: smoothed = alpha * current + (1 - alpha) * previous
     """
+    def __init__(self, joints_num: int, smoothing_alpha: float = 1.0):
         self.joints_num = joints_num
+        self.smoothing_alpha = np.clip(smoothing_alpha, 0.0, 1.0)
         self.reset()
     def reset(self):
         # Store previous frame's velocities for delayed application
         self.prev_rot_vel = 0.0
         self.prev_linear_vel = np.array([0.0, 0.0])
+        # Store previous smoothed joints for EMA
+        self.prev_smoothed_joints = None
     def process_frame(self, frame_data: np.ndarray) -> np.ndarray:
         """
         # Convert to numpy
         joints_np = positions.detach().cpu().numpy()
+        # Apply EMA smoothing if enabled
+        if self.smoothing_alpha < 1.0:
+            if self.prev_smoothed_joints is None:
+                # First frame, no smoothing possible
+                self.prev_smoothed_joints = joints_np.copy()
+            else:
+                # EMA: smoothed = alpha * current + (1 - alpha) * previous
+                joints_np = (
+                    self.smoothing_alpha * joints_np
+                    + (1.0 - self.smoothing_alpha) * self.prev_smoothed_joints
+                )
+                self.prev_smoothed_joints = joints_np.copy()
         # Store current velocities for next frame
         self.prev_rot_vel = curr_rot_vel
         self.prev_linear_vel = curr_linear_vel