xizaoqu committed
Commit 0cb2a53 · Parent(s): 3b61a0b
Files changed (2)
  1. app.py +18 -7
  2. configurations/huggingface.yaml +1 -1
app.py CHANGED
@@ -241,7 +241,7 @@ def set_memory_length(memory_length, sampling_memory_length_state):
 
 def generate(keys):
     # print("algo frame:", len(worldmem.frames))
-    actions = parse_input_to_tensor(keys)
+    input_actions = parse_input_to_tensor(keys)
     global input_history
     global memory_frames
     global memory_curr_frame
@@ -251,8 +251,19 @@ def generate(keys):
     global self_memory_c2w
     global self_frame_idx
 
+    if self_frames is None:
+        new_frame, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx = run_interactive(memory_frames[0],
+                                          actions[0],
+                                          poses[0],
+                                          device=device,
+                                          self_frames=self_frames,
+                                          self_actions=self_actions,
+                                          self_poses=self_poses,
+                                          self_memory_c2w=self_memory_c2w,
+                                          self_frame_idx=self_frame_idx)
+
     new_frame, self_frames, self_actions, self_poses, self_memory_c2w, self_frame_idx = run_interactive(memory_frames[0],
-                                      actions,
+                                      input_actions,
                                       None,
                                       device=device,
                                       self_frames=self_frames,
@@ -422,12 +433,12 @@ with gr.Blocks(css=css) as demo:
         - D: turn right
         - Q: move forward
         - E: move backward
-        - N: no-op (do nothing)
-        - 1: switch to hotbar 1
+        - N: no-op (do nothing)
         - U: use item
-        5. Higher denoising steps produce more detailed results but take longer. **20 steps** is a good balance between quality and speed. The same applies to context and memory length.
-        6. If you find this project interesting or useful, please consider giving it a ⭐️ on [GitHub]()!
-        7. For feedback or suggestions, feel free to open a GitHub issue or contact me directly at **zeqixiao1@gmail.com**.
+        5. Higher denoising steps produce more detailed results but take longer. 20 steps is a good balance between quality and speed. The same applies to context and memory length.
+        6. For faster performance, we recommend running the demo locally (~1s/frame on H100 vs ~5s on Spaces).
+        7. If you find this project interesting or useful, please consider giving it a ⭐️ on [GitHub]()!
+        8. For feedback or suggestions, feel free to open a GitHub issue or contact me directly at **zeqixiao1@gmail.com**.
         """
     )
     # input_box.submit(update_image_and_log, inputs=[input_box], outputs=[image_display, video_display, log_output])
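The functional change in `generate` is a lazy warm-up of the interactive state: on the first call, `self_frames` is still `None`, so the new branch runs `run_interactive` once on the stored initial frame, action, and pose before sampling with the user's parsed keys. Below is a minimal sketch of that control flow; `run_interactive` and `parse_input_to_tensor` are stand-in stubs added purely so the example runs standalone, not the real WorldMem functions.

```python
# Sketch of the lazy warm-up pattern added in this commit.
# The stubs below only mimic the shape of the real functions;
# only the control flow mirrors the diff.

def parse_input_to_tensor(keys):
    return list(keys)  # stub: one "action" per pressed key

def run_interactive(first_frame, action, pose, state):
    state = (state or []) + [(action, pose)]  # stub: grow the memory state
    new_frame = f"frame_{len(state)}"
    return new_frame, state

def generate(keys, memory_frames, actions, poses, state):
    input_actions = parse_input_to_tensor(keys)
    if state is None:
        # Warm-up: seed memory from the stored first action/pose
        # before consuming any user input.
        _, state = run_interactive(memory_frames[0], actions[0], poses[0], state)
    # Normal step: condition on the user's parsed actions, no pose given.
    new_frame, state = run_interactive(memory_frames[0], input_actions, None, state)
    return new_frame, state

state = None
frame, state = generate("WQ", ["f0"], ["noop"], ["pose0"], state)  # warm-up + step
frame, state = generate("D", ["f0"], ["noop"], ["pose0"], state)   # step only
print(frame, len(state))
```

In the app itself this state lives in module-level globals rather than being threaded through as a parameter; the sketch passes it explicitly only to stay self-contained.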
configurations/huggingface.yaml CHANGED
@@ -1,4 +1,4 @@
-n_tokens: 8
+n_tokens: 3
 pose_cond_dim: 5
 use_plucker: true
 focal_length: 0.35
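The config change drops `n_tokens` from 8 to 3; the commit doesn't state why, though it is consistent with the lighter hosted-demo settings mentioned in the updated instructions. A quick sanity check of the loaded values, assuming the file is parsed with a standard YAML loader (the app's actual config-loading path is not shown in this diff):

```python
# Illustrative check only: read the updated config with PyYAML and
# confirm the new value. Key names come from the diff above; how the
# app actually consumes this file is not shown in the commit.
import yaml

with open("configurations/huggingface.yaml") as f:
    cfg = yaml.safe_load(f)

assert cfg["n_tokens"] == 3          # was 8 before this commit
print(cfg["pose_cond_dim"], cfg["use_plucker"], cfg["focal_length"])
# -> 5 True 0.35
```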