xizaoqu committed
Commit 0cb2a53 · Parent(s): 3b61a0b
Files changed (2)
  1. app.py +18 -7
  2. configurations/huggingface.yaml +1 -1
app.py CHANGED
@@ -241,7 +241,7 @@ def set_memory_length(memory_length, sampling_memory_length_state):
 
 def generate(keys):
     # print("algo frame:", len(worldmem.frames))
-    actions = parse_input_to_tensor(keys)
+    input_actions = parse_input_to_tensor(keys)
     global input_history
     global memory_frames
     global memory_curr_frame
@@ -251,8 +251,19 @@ def generate(keys):
     global self_memory_c2w
     global self_frame_idx
 
+    if self_frames is None:
+        new_frame, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx = run_interactive(memory_frames[0],
+                                          actions[0],
+                                          poses[0],
+                                          device=device,
+                                          self_frames=self_frames,
+                                          self_actions=self_actions,
+                                          self_poses=self_poses,
+                                          self_memory_c2w=self_memory_c2w,
+                                          self_frame_idx=self_frame_idx)
+
     new_frame, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx = run_interactive(memory_frames[0],
-                                      actions,
+                                      input_actions,
                                       None,
                                       device=device,
                                       self_frames=self_frames,
@@ -422,12 +433,12 @@ with gr.Blocks(css=css) as demo:
         - D: turn right
         - Q: move forward
         - E: move backward
-        - N: no-op (do nothing)
-        - 1: switch to hotbar 1
+        - N: no-op (do nothing)
         - U: use item
-        5. Higher denoising steps produce more detailed results but take longer. **20 steps** is a good balance between quality and speed. The same applies to context and memory length.
-        6. If you find this project interesting or useful, please consider giving it a ⭐️ on [GitHub]()!
-        7. For feedback or suggestions, feel free to open a GitHub issue or contact me directly at **zeqixiao1@gmail.com**.
+        5. Higher denoising steps produce more detailed results but take longer. 20 steps is a good balance between quality and speed. The same applies to context and memory length.
+        6. For faster performance, we recommend running the demo locally (~1s/frame on H100 vs ~5s on Spaces).
+        7. If you find this project interesting or useful, please consider giving it a ⭐️ on [GitHub]()!
+        8. For feedback or suggestions, feel free to open a GitHub issue or contact me directly at **zeqixiao1@gmail.com**.
         """
     )
     # input_box.submit(update_image_and_log, inputs=[input_box], outputs=[image_display, video_display, log_output])
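The functional change in `generate` is a lazy warm-up of the interactive state: on the first call, `self_frames` is still `None`, so the new branch runs `run_interactive` once on the stored initial frame, action, and pose before sampling with the user's parsed keys. Below is a minimal sketch of that control flow; `run_interactive` and `parse_input_to_tensor` are stand-in stubs added purely so the example runs standalone, not the real WorldMem functions.

```python
# Sketch of the lazy warm-up pattern added in this commit.
# The stubs below only mimic the shape of the real functions;
# only the control flow mirrors the diff.

def parse_input_to_tensor(keys):
    return list(keys)  # stub: one "action" per pressed key

def run_interactive(first_frame, action, pose, state):
    state = (state or []) + [(action, pose)]  # stub: grow the memory state
    new_frame = f"frame_{len(state)}"
    return new_frame, state

def generate(keys, memory_frames, actions, poses, state):
    input_actions = parse_input_to_tensor(keys)
    if state is None:
        # Warm-up: seed memory from the stored first action/pose
        # before consuming any user input.
        _, state = run_interactive(memory_frames[0], actions[0], poses[0], state)
    # Normal step: condition on the user's parsed actions, no pose given.
    new_frame, state = run_interactive(memory_frames[0], input_actions, None, state)
    return new_frame, state

state = None
frame, state = generate("WQ", ["f0"], ["noop"], ["pose0"], state)  # warm-up + step
frame, state = generate("D", ["f0"], ["noop"], ["pose0"], state)   # step only
print(frame, len(state))
```

In the app itself this state lives in module-level globals rather than being threaded through as a parameter; the sketch passes it explicitly only to stay self-contained.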
configurations/huggingface.yaml CHANGED
@@ -1,4 +1,4 @@
-n_tokens: 8
+n_tokens: 3
 pose_cond_dim: 5
 use_plucker: true
 focal_length: 0.35
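The config change drops `n_tokens` from 8 to 3; the commit doesn't state why, though it is consistent with the lighter hosted-demo settings mentioned in the updated instructions. A quick sanity check of the loaded values, assuming the file is parsed with a standard YAML loader (the app's actual config-loading path is not shown in this diff):

```python
# Illustrative check only: read the updated config with PyYAML and
# confirm the new value. Key names come from the diff above; how the
# app actually consumes this file is not shown in the commit.
import yaml

with open("configurations/huggingface.yaml") as f:
    cfg = yaml.safe_load(f)

assert cfg["n_tokens"] == 3          # was 8 before this commit
print(cfg["pose_cond_dim"], cfg["use_plucker"], cfg["focal_length"])
# -> 5 True 0.35
```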