herrscher0 committed
Commit 82d5f99 · 1 Parent(s): 3a98c3d

Add EMA smoothing support for joint positions


- Add smoothing_alpha parameter to model() API (default=1.0, no smoothing)
- Implement EMA smoothing in StreamJointRecovery263 class
- Support batch generation with independent smoothing per sample
- Update README with usage examples and API documentation
- Add .gitignore for cache and temporary files

Usage: model(text, length=60, output_joints=True, smoothing_alpha=0.5)
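In practice the new parameter is opt-in: `smoothing_alpha` defaults to 1.0 and only takes effect when `output_joints=True`. A minimal sketch of the call (the repo ID and loading kwargs below are placeholders, not part of this commit):

```python
from transformers import AutoModel

# Placeholder repo ID; see the README's from_pretrained call for the real arguments.
model = AutoModel.from_pretrained("<repo-id>", trust_remote_code=True)

# alpha=1.0 reproduces the old, unsmoothed behavior; lower values smooth more.
joints = model(
    "a person walking forward",
    length=60,
    output_joints=True,
    smoothing_alpha=0.5,
)
print(joints.shape)  # expected: (~240, 22, 3)
```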

Files changed (4)
  1. .gitignore +54 -0
  2. README.md +14 -4
  3. hf_pipeline.py +22 -18
  4. ldf_utils/motion_process.py +25 -1
.gitignore ADDED
@@ -0,0 +1,54 @@
+ # Python cache
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+
+ # Virtual environments
+ venv/
+ env/
+ ENV/
+ .venv
+
+ # PyTorch/Model cache
+ *.pth~
+ *.safetensors~
+ checkpoint/
+ checkpoints/
+
+ # Hugging Face cache
+ .cache/
+ huggingface_cache/
+
+ # Generated outputs
+ outputs/
+ generated_motions/
+ *.npy
+ *.pkl
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # OS
+ .DS_Store
+ Thumbs.db
+
+ # Jupyter
+ .ipynb_checkpoints/
+ *.ipynb
+
+ # Logs
+ *.log
+ logs/
+ wandb/
+
+ # Test outputs
+ test_output/
+ test_results/
+ tmp/
+
README.md CHANGED
@@ -36,7 +36,7 @@ The model consists of three main components:
  - Input: Natural language text
  - Output: Motion sequences in two formats:
    - 263-dimensional HumanML3D features (default)
-   - 22×3 joint coordinates (optional)
+   - 22×3 joint coordinates (optional, with EMA smoothing support)
  - Latent dimension: 4
  - Upsampling factor: 4× (VAE decoder)
  - Frame rate: 20 FPS
@@ -85,8 +85,8 @@ model = AutoModel.from_pretrained(
  motion = model("a person walking forward", length=60)
  print(f"Generated motion: {motion.shape}")  # (~240, 263)

- # Generate motion as joint coordinates (22 joints × 3 coords)
- motion_joints = model("a person walking forward", length=60, output_joints=True)
+ # Generate motion as joint coordinates (22 joints × 3 coords) with EMA smoothing (alpha: 0.0-1.0)
+ motion_joints = model("a person walking forward", length=60, output_joints=True, smoothing_alpha=0.5)
  print(f"Generated joints: {motion_joints.shape}")  # (~240, 22, 3)
  ```
@@ -123,7 +123,7 @@ print(f"Transition motion: {motion[0].shape}")

  ## API Reference

- ### `model(text, length=60, text_end=None, num_denoise_steps=None, output_joints=False)`
+ ### `model(text, length=60, text_end=None, num_denoise_steps=None, output_joints=False, smoothing_alpha=1.0)`

  Generate motion sequences from text descriptions.

@@ -153,6 +153,12 @@ Generate motion sequences from text descriptions.
    - `False`: Returns 263-dimensional HumanML3D features
    - `True`: Returns 22×3 joint coordinates for direct visualization

+ - **smoothing_alpha** (`float`, default=1.0): EMA smoothing factor for joint positions (only used when `output_joints=True`)
+   - `1.0`: No smoothing (default)
+   - `0.5`: Medium smoothing (recommended for smoother animations)
+   - `0.0`: Maximum smoothing (output stays frozen at the first frame)
+   - Range: 0.0 to 1.0
+
  **Returns:**
  - Single motion:
    - `output_joints=False`: `numpy.ndarray` of shape `(frames, 263)`
@@ -178,6 +184,10 @@ motion = model(
  ) # Returns list with 1 array of shape (240, 263)
  ```

+ ## Update History
+
+ - **2025/12/8**: Added EMA smoothing option for joint positions during rendering
+
  ## Citation

  If you use this model in your research, please cite:
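The `smoothing_alpha` semantics documented above are a plain exponential moving average over joint positions. A self-contained sketch of the formula (NumPy only; the data below is illustrative, not model output):

```python
import numpy as np

def ema_smooth(frames: np.ndarray, alpha: float) -> np.ndarray:
    """smoothed[t] = alpha * frames[t] + (1 - alpha) * smoothed[t - 1]"""
    out = np.empty_like(frames)
    out[0] = frames[0]  # the first frame passes through unchanged
    for t in range(1, len(frames)):
        out[t] = alpha * frames[t] + (1.0 - alpha) * out[t - 1]
    return out

rng = np.random.default_rng(0)
joints = rng.normal(size=(240, 22, 3))  # stand-in for noisy joint positions
raw = np.abs(np.diff(joints, axis=0)).mean()
smoothed = np.abs(np.diff(ema_smooth(joints, 0.5), axis=0)).mean()
print(raw, smoothed)  # the smoothed sequence has less frame-to-frame jitter
```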
hf_pipeline.py CHANGED
@@ -161,7 +161,8 @@ class LDFModel(PreTrainedModel):
         length: Union[int, List[int]] = 60,
         text_end: Optional[Union[List[int], List[List[int]]]] = None,
         num_denoise_steps: Optional[int] = None,
-        output_joints: bool = False
+        output_joints: bool = False,
+        smoothing_alpha: float = 1.0
     ):
         """
         Generate motion sequences
@@ -175,6 +176,9 @@ class LDFModel(PreTrainedModel):
             text_end: Token positions for text switching
             num_denoise_steps: Number of denoising steps
             output_joints: If True, output 22×3 joint coordinates; if False (default), output 263-dim HumanML3D features
+            smoothing_alpha: EMA smoothing factor for joint positions (0.0-1.0, default=1.0, i.e. no smoothing)
+                - Only used when output_joints=True
+                - Recommended: 0.5 for smoother animations

         Returns:
             numpy.ndarray or list of arrays
@@ -220,31 +224,31 @@ class LDFModel(PreTrainedModel):
         decoded_results = []
         joints_results = [] if output_joints else None

+        # Import motion processing module once if needed
+        if output_joints:
+            import importlib.util
+            import numpy as np
+            utils_spec = importlib.util.spec_from_file_location(
+                "motion_process",
+                os.path.join(self.model_dir, "ldf_utils", "motion_process.py")
+            )
+            motion_process_module = importlib.util.module_from_spec(utils_spec)
+            utils_spec.loader.exec_module(motion_process_module)
+
         for i, generated in enumerate(generated_batch):
             if generated is not None and torch.is_tensor(generated):
                 # Decode with VAE (following generate_ldf.py line 130)
                 decoded_g = self.vae.decode(generated[None, :])[0]

                 if output_joints:
-                    # Use the model_dir that was saved during _load_models
-                    model_dir = self.model_dir
-
-                    # Import convert_motion_to_joints from ldf_utils
-                    import importlib.util
-                    import numpy as np
-                    utils_spec = importlib.util.spec_from_file_location(
-                        "motion_process",
-                        os.path.join(model_dir, "ldf_utils", "motion_process.py")
-                    )
-                    motion_process_module = importlib.util.module_from_spec(utils_spec)
-                    utils_spec.loader.exec_module(motion_process_module)
-
-                    # Convert to joints using convert_motion_to_joints
+                    # Convert to joints using StreamJointRecovery263 with smoothing
+                    # Create a new recovery instance for each sample to maintain independent state
                     decoded_np = decoded_g.cpu().numpy()
-
-                    joints = motion_process_module.convert_motion_to_joints(
-                        decoded_np, dim=263
+                    recovery = motion_process_module.StreamJointRecovery263(
+                        joints_num=22, smoothing_alpha=smoothing_alpha
                     )
+                    joints = [recovery.process_frame(frame) for frame in decoded_np]
+                    joints = np.array(joints)
                     joints_results.append(joints)
                 else:
                     decoded_results.append(decoded_g.cpu().numpy())
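The hunk above builds a fresh `StreamJointRecovery263` per sample, so smoothing state cannot leak across a batch. A hedged sketch of a batch call (list-valued `text`/`length` inferred from the type hints; `model` assumed loaded as in the README):

```python
# Each prompt gets its own recovery instance, hence independent smoothing.
batch = model(
    ["a person walking forward", "a person jumping"],
    length=[60, 40],            # one target length per prompt
    output_joints=True,
    smoothing_alpha=0.5,        # applied per sample, not shared
)
for joints in batch:            # list of (frames, 22, 3) arrays
    print(joints.shape)
```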
ldf_utils/motion_process.py CHANGED
@@ -69,10 +69,19 @@ class StreamJointRecovery263:

     Key insight: The batch version uses PREVIOUS frame's velocity for the current frame,
     so we need to delay the velocity application by one frame.
+
+    Args:
+        joints_num: Number of joints in the skeleton
+        smoothing_alpha: EMA smoothing factor (0.0 to 1.0)
+            - 1.0 = no smoothing (default), output follows input exactly
+            - 0.0 = infinite smoothing, output never changes
+            - Recommended values: 0.3-0.7 for visible smoothing
+            - Formula: smoothed = alpha * current + (1 - alpha) * previous
     """

-    def __init__(self, joints_num: int):
+    def __init__(self, joints_num: int, smoothing_alpha: float = 1.0):
         self.joints_num = joints_num
+        self.smoothing_alpha = np.clip(smoothing_alpha, 0.0, 1.0)
         self.reset()

     def reset(self):
@@ -82,6 +91,8 @@ class StreamJointRecovery263:
         # Store previous frame's velocities for delayed application
         self.prev_rot_vel = 0.0
         self.prev_linear_vel = np.array([0.0, 0.0])
+        # Store previous smoothed joints for EMA
+        self.prev_smoothed_joints = None

     def process_frame(self, frame_data: np.ndarray) -> np.ndarray:
         """
@@ -145,6 +156,19 @@ class StreamJointRecovery263:
         # Convert to numpy
         joints_np = positions.detach().cpu().numpy()

+        # Apply EMA smoothing if enabled
+        if self.smoothing_alpha < 1.0:
+            if self.prev_smoothed_joints is None:
+                # First frame, no smoothing possible
+                self.prev_smoothed_joints = joints_np.copy()
+            else:
+                # EMA: smoothed = alpha * current + (1 - alpha) * previous
+                joints_np = (
+                    self.smoothing_alpha * joints_np
+                    + (1.0 - self.smoothing_alpha) * self.prev_smoothed_joints
+                )
+                self.prev_smoothed_joints = joints_np.copy()
+
         # Store current velocities for next frame
         self.prev_rot_vel = curr_rot_vel
         self.prev_linear_vel = curr_linear_vel
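For streaming use outside the pipeline, the class can also be driven frame by frame. A minimal sketch using only the API visible in this diff (the import path assumes the repo layout; input frames are dummies):

```python
import numpy as np
from ldf_utils.motion_process import StreamJointRecovery263

recovery = StreamJointRecovery263(joints_num=22, smoothing_alpha=0.5)

# Feed decoded 263-dim HumanML3D frames one at a time, e.g. from a live decoder.
for frame in np.zeros((240, 263), dtype=np.float32):  # dummy frames
    joints = recovery.process_frame(frame)            # one (22, 3) array per frame

recovery.reset()  # clears velocity state and prev_smoothed_joints before the next clip
```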