Spaces: Running on Zero

Upload folder using huggingface_hub

Files changed:
- .gitattributes +2 -0
- .gitignore +175 -0
- LICENSE +32 -0
- README.md +60 -9
- app.py +210 -0
- examples/example_01.mp4 +3 -0
- examples/example_02.mp4 +3 -0
- examples/example_03.mp4 +3 -0
- examples/example_04.mp4 +3 -0
- examples/example_05.mp4 +3 -0
- examples/example_06.mp4 +3 -0
- normalcrafter/__init__.py +0 -0
- normalcrafter/normal_crafter_ppl.py +494 -0
- normalcrafter/unet.py +368 -0
- normalcrafter/utils.py +64 -0
- requirements.txt +11 -0
- run.py +174 -0
.gitattributes
CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.gif filter=lfs diff=lfs merge=lfs -text
+*.mp4 filter=lfs diff=lfs merge=lfs -text
.gitignore
ADDED
@@ -0,0 +1,175 @@
### Python template
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

#
.gradio
.github
demo_output
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

/logs
/gin-config
*.json
/eval/*csv
*__pycache__
scripts/
eval/
*.DS_Store
benchmark/datasets
LICENSE
ADDED
@@ -0,0 +1,32 @@
Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved. The below software in this distribution may have been modified by THL A29 Limited ("Tencent Modifications").

License Terms of the inference code of NormalCrafter:
--------------------------------------------------------------------

Permission is hereby granted, free of charge, to any person obtaining a copy of this Software and associated documentation files, to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sublicense copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

- You agree to use the NormalCrafter only for academic, research and education purposes, and refrain from using it for any commercial or production purposes under any circumstances.

- The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

For avoidance of doubts, “Software” means the NormalCrafter model inference code and weights made available under this license excluding any pre-trained data and other AI components.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.


Other dependencies and licenses:

Open Source Software Licensed under the MIT License:
--------------------------------------------------------------------
1. Stability AI - Code
Copyright (c) 2023 Stability AI

Terms of the MIT License:
--------------------------------------------------------------------
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

**You may find the code license of Stability AI at the following links: https://github.com/Stability-AI/generative-models/blob/main/LICENSE-CODE
README.md
CHANGED
@@ -1,14 +1,65 @@
 ---
 title: NormalCrafter
-emoji: 📉
-colorFrom: green
-colorTo: pink
-sdk: gradio
-sdk_version: 5.23.1
 app_file: app.py
-
-
-short_description: NormalCrafter
+sdk: gradio
+sdk_version: 5.23.2
 ---
+## ___***NormalCrafter: Learning Temporally Consistent Video Normal from Video Diffusion Priors***___
+
+_**[Yanrui Bin<sup>1</sup>](https://scholar.google.com/citations?user=_9fN3mEAAAAJ&hl=zh-CN), [Wenbo Hu<sup>2*</sup>](https://wbhu.github.io),
+[Haoyuan Wang<sup>3</sup>](https://www.whyy.site/),
+[Xinya Chen<sup>3</sup>](https://xinyachen21.github.io/),
+[Bing Wang<sup>2 †</sup>](https://bingcs.github.io/)**_
+<br><br>
+<sup>1</sup>Spatial Intelligence Group, The Hong Kong Polytechnic University
+<sup>2</sup>Tencent AI Lab
+<sup>3</sup>City University of Hong Kong
+<sup>4</sup>Huazhong University of Science and Technology
+</div>
+
+## 🔆 Notice
+We recommend that everyone use English to communicate on issues, as this helps developers from around the world discuss, share experiences, and answer questions together.
+
+For business licensing and other related inquiries, don't hesitate to contact `binyanrui@gmail.com`.
+
+## 🔆 Introduction
+🤗 If you find NormalCrafter useful, **please help ⭐ this repo**, which is important to open-source projects. Thanks!
+
+🔥 NormalCrafter can generate temporally consistent normal sequences
+with fine-grained details from open-world videos of arbitrary length.
+
+- `[24-04-01]` 🔥🔥🔥 **NormalCrafter** is released now, have fun!
+## 🚀 Quick Start
+
+### 🤖 Gradio Demo
+- Online demo: [NormalCrafter](https://huggingface.co/spaces/Yanrui95/NormalCrafter)
+- Local demo:
+```bash
+gradio app.py
+```
+
+### 🛠️ Installation
+1. Clone this repo:
+```bash
+git clone git@github.com:Binyr/NormalCrafter.git
+```
+2. Install dependencies (please refer to [requirements.txt](requirements.txt)):
+```bash
+pip install -r requirements.txt
+```
+
+### 🤗 Model Zoo
+[NormalCrafter](https://huggingface.co/Yanrui95/NormalCrafter) is available in the Hugging Face Model Hub.
+
+### 🏃‍♂️ Inference
+#### 1. High-resolution inference, requires a GPU with ~20GB memory for 1024x576 resolution:
+```bash
+python run.py --video-path examples/example_01.mp4
+```
 
-
+#### 2. Low-resolution inference, requires a GPU with ~6GB memory for 512x256 resolution:
+```bash
+python run.py --video-path examples/example_01.mp4 --max-res 512
+```
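For reference, the app.py added in this commit wires the released weights into the pipeline directly. The following is a condensed sketch of that setup (model IDs and helper functions are taken from the files in this commit, not an additional official entry point), useful if you want to run inference from Python without run.py or the Gradio demo:

```python
# Minimal sketch of programmatic inference, condensed from app.py in this commit.
# Assumes the normalcrafter package from this repo and a CUDA GPU are available.
import os
import torch
from diffusers import AutoencoderKLTemporalDecoder

from normalcrafter.normal_crafter_ppl import NormalCrafterPipeline
from normalcrafter.unet import DiffusersUNetSpatioTemporalConditionModelNormalCrafter
from normalcrafter.utils import read_video_frames, vis_sequence_normal, save_video

weight_dtype = torch.float16
unet = DiffusersUNetSpatioTemporalConditionModelNormalCrafter.from_pretrained(
    "Yanrui95/NormalCrafter", subfolder="unet", low_cpu_mem_usage=True
).to(dtype=weight_dtype)
vae = AutoencoderKLTemporalDecoder.from_pretrained(
    "Yanrui95/NormalCrafter", subfolder="vae"
).to(dtype=weight_dtype)

# The NormalCrafter UNet/VAE are dropped into the Stable Video Diffusion pipeline skeleton.
pipe = NormalCrafterPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    unet=unet, vae=vae, torch_dtype=weight_dtype, variant="fp16",
).to("cuda")

# Decode the input video, run windowed inference, and save a colored normal-map video.
frames, fps = read_video_frames("examples/example_01.mp4", -1, -1, 1024)
with torch.inference_mode():
    normals = pipe(frames, decode_chunk_size=7, time_step_size=10, window_size=14).frames[0]
os.makedirs("demo_output", exist_ok=True)
save_video(vis_sequence_normal(normals), "demo_output/example_01_vis.mp4", fps=fps)
```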
app.py
ADDED
@@ -0,0 +1,210 @@
import gc
import os

import numpy as np
import spaces
import gradio as gr
import torch
from diffusers.training_utils import set_seed
from diffusers import AutoencoderKLTemporalDecoder

from normalcrafter.normal_crafter_ppl import NormalCrafterPipeline
from normalcrafter.unet import DiffusersUNetSpatioTemporalConditionModelNormalCrafter

import uuid
import random
from huggingface_hub import hf_hub_download

from normalcrafter.utils import read_video_frames, vis_sequence_normal, save_video

examples = [
    ["examples/example_01.mp4", 1024, -1, -1],
    ["examples/example_02.mp4", 1024, -1, -1],
    ["examples/example_03.mp4", 1024, -1, -1],
    ["examples/example_04.mp4", 1024, -1, -1],
    ["examples/example_05.mp4", 1024, -1, -1],
    ["examples/example_06.mp4", 1024, -1, -1],
]

pretrained_model_name_or_path = "Yanrui95/NormalCrafter"
weight_dtype = torch.float16
unet = DiffusersUNetSpatioTemporalConditionModelNormalCrafter.from_pretrained(
    pretrained_model_name_or_path,
    subfolder="unet",
    low_cpu_mem_usage=True,
)
vae = AutoencoderKLTemporalDecoder.from_pretrained(
    pretrained_model_name_or_path, subfolder="vae")

vae.to(dtype=weight_dtype)
unet.to(dtype=weight_dtype)

pipe = NormalCrafterPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    unet=unet,
    vae=vae,
    torch_dtype=weight_dtype,
    variant="fp16",
)
pipe.to("cuda")


@spaces.GPU(duration=120)
def infer_depth(
    video: str,
    max_res: int = 1024,
    process_length: int = -1,
    target_fps: int = -1,
    #
    save_folder: str = "./demo_output",
    window_size: int = 14,
    time_step_size: int = 10,
    decode_chunk_size: int = 7,
    seed: int = 42,
    save_npz: bool = False,
):
    set_seed(seed)
    pipe.enable_xformers_memory_efficient_attention()

    frames, target_fps = read_video_frames(video, process_length, target_fps, max_res)

    # inference the depth map using the DepthCrafter pipeline
    with torch.inference_mode():
        res = pipe(
            frames,
            decode_chunk_size=decode_chunk_size,
            time_step_size=time_step_size,
            window_size=window_size,
        ).frames[0]

    # visualize the depth map and save the results
    vis = vis_sequence_normal(res)
    # save the depth map and visualization with the target FPS
    save_path = os.path.join(save_folder, os.path.splitext(os.path.basename(video))[0])
    print(f"==> saving results to {save_path}")
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    if save_npz:
        np.savez_compressed(save_path + ".npz", normal=res)
    save_video(vis, save_path + "_vis.mp4", fps=target_fps)
    save_video(frames, save_path + "_input.mp4", fps=target_fps)

    # clear the cache for the next video
    gc.collect()
    torch.cuda.empty_cache()

    return [
        save_path + "_input.mp4",
        save_path + "_vis.mp4",
    ]


def construct_demo():
    with gr.Blocks(analytics_enabled=False) as depthcrafter_iface:
        gr.Markdown(
            """
            <div align='center'> <h1> NormalCrafter: Learning Temporally Consistent Video Normal from Video Diffusion Priors </span> </h1> \
            <a style='font-size:18px;color: #000000'>If you find NormalCrafter useful, please help ⭐ the </a>\
            <a style='font-size:18px;color: #FF5DB0' href='https://github.com/Binyr/NormalCrafter'>[Github Repo]</a>\
            <a style='font-size:18px;color: #000000'>, which is important to Open-Source projects. Thanks!</a>\
            <a style='font-size:18px;color: #000000' href='https://arxiv.org/abs/2409.02095'> [ArXiv] </a>\
            <a style='font-size:18px;color: #000000' href='https://normalcrafter.github.io/'> [Project Page] </a> </div>
            """
        )

        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                input_video = gr.Video(label="Input Video")

            # with gr.Tab(label="Output"):
            with gr.Column(scale=2):
                with gr.Row(equal_height=True):
                    output_video_1 = gr.Video(
                        label="Preprocessed video",
                        interactive=False,
                        autoplay=True,
                        loop=True,
                        show_share_button=True,
                        scale=5,
                    )
                    output_video_2 = gr.Video(
                        label="Generated Depth Video",
                        interactive=False,
                        autoplay=True,
                        loop=True,
                        show_share_button=True,
                        scale=5,
                    )

        with gr.Row(equal_height=True):
            with gr.Column(scale=1):
                with gr.Row(equal_height=False):
                    with gr.Accordion("Advanced Settings", open=False):
                        max_res = gr.Slider(
                            label="max resolution",
                            minimum=512,
                            maximum=1024,
                            value=1024,
                            step=64,
                        )
                        process_length = gr.Slider(
                            label="process length",
                            minimum=-1,
                            maximum=280,
                            value=60,
                            step=1,
                        )
                        process_target_fps = gr.Slider(
                            label="target FPS",
                            minimum=-1,
                            maximum=30,
                            value=15,
                            step=1,
                        )
                    generate_btn = gr.Button("Generate")
            with gr.Column(scale=2):
                pass

        gr.Examples(
            examples=examples,
            inputs=[
                input_video,
                max_res,
                process_length,
                process_target_fps,
            ],
            outputs=[output_video_1, output_video_2],
            fn=infer_depth,
            cache_examples="lazy",
        )
        # gr.Markdown(
        #     """
        #     <span style='font-size:18px;color: #E7CCCC'>Note:
        #     For time quota consideration, we set the default parameters to be more efficient here,
        #     with a trade-off of shorter video length and slightly lower quality.
        #     You may adjust the parameters according to our
        #     <a style='font-size:18px;color: #FF5DB0' href='https://github.com/Tencent/DepthCrafter'>[Github Repo]</a>
        #     for better results if you have enough time quota.
        #     </span>
        #     """
        # )

        generate_btn.click(
            fn=infer_depth,
            inputs=[
                input_video,
                max_res,
                process_length,
                process_target_fps,
            ],
            outputs=[output_video_1, output_video_2],
        )

    return depthcrafter_iface


if __name__ == "__main__":
    demo = construct_demo()
    demo.queue()
    # demo.launch(server_name="0.0.0.0", server_port=12345, debug=True, share=False)
    demo.launch(share=True)
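Outside the Gradio UI, the same infer_depth function can be exercised headlessly. A hedged sketch (the spaces.GPU decorator should be a no-op outside a ZeroGPU Space; the returned file names follow the save logic in the function above):

```python
# Hypothetical headless call to the demo's infer_depth defined above.
# It writes <video-stem>_input.mp4 and <video-stem>_vis.mp4 under ./demo_output
# and returns their paths in that order.
input_path, vis_path = infer_depth(
    "examples/example_02.mp4",
    max_res=1024,
    process_length=60,   # cap on processed frames, matching the UI default
    target_fps=15,
)
print("preprocessed video:", input_path)
print("normal visualization:", vis_path)
```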
examples/example_01.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3eb7fefd157bd9b403cf0b524c7c4f3cb6d9f82b9d6a48eba2146412fc9e64a2
size 5727137

examples/example_02.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ea3c4e4c8cd9682d92c25170d8df333fead210118802fbe22198dde478dc5489
size 3150525

examples/example_03.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5d332877a98bb41ff86a639139a03e383e91880bca722bba7e2518878fca54f6
size 3013435

examples/example_04.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b2aa4962216adce71b1c47f395be435b23105df35f3892646e237b935ac1c74f
size 3591374

examples/example_05.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e8d2319060f9a1d3cfcb9de317e4a5b138657fd741c530ed3983f6565c2eda44
size 3553683

examples/example_06.mp4
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e3a2619b029129f34884c761cc278b6842620bfed96d4bb52c8aa07bc1d82a8b
size 5596872
normalcrafter/__init__.py
ADDED
File without changes
normalcrafter/normal_crafter_ppl.py
ADDED
@@ -0,0 +1,494 @@
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional, Union

import numpy as np
import PIL.Image
import torch
import torch.nn.functional as F
from einops import rearrange
import math

from diffusers.utils import BaseOutput, logging
from diffusers.utils.torch_utils import is_compiled_module, randn_tensor
from diffusers import DiffusionPipeline
from diffusers.pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import StableVideoDiffusionPipelineOutput, StableVideoDiffusionPipeline
from PIL import Image
import cv2

logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

class NormalCrafterPipeline(StableVideoDiffusionPipeline):

    def _encode_image(self, image, device, num_videos_per_prompt, do_classifier_free_guidance, scale=1, image_size=None):
        dtype = next(self.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            image = self.video_processor.pil_to_numpy(image)  # (0, 255) -> (0, 1)
            image = self.video_processor.numpy_to_pt(image)  # (n, h, w, c) -> (n, c, h, w)

        # We normalize the image before resizing to match with the original implementation.
        # Then we unnormalize it after resizing.
        pixel_values = image
        B, C, H, W = pixel_values.shape
        patches = [pixel_values]
        # patches = []
        for i in range(1, scale):
            num_patches_HW_this_level = i + 1
            patch_H = H // num_patches_HW_this_level + 1
            patch_W = W // num_patches_HW_this_level + 1
            for j in range(num_patches_HW_this_level):
                for k in range(num_patches_HW_this_level):
                    patches.append(pixel_values[:, :, j*patch_H:(j+1)*patch_H, k*patch_W:(k+1)*patch_W])

        def encode_image(image):
            image = image * 2.0 - 1.0
            if image_size is not None:
                image = _resize_with_antialiasing(image, image_size)
            else:
                image = _resize_with_antialiasing(image, (224, 224))
            image = (image + 1.0) / 2.0

            # Normalize the image with for CLIP input
            image = self.feature_extractor(
                images=image,
                do_normalize=True,
                do_center_crop=False,
                do_resize=False,
                do_rescale=False,
                return_tensors="pt",
            ).pixel_values

            image = image.to(device=device, dtype=dtype)
            image_embeddings = self.image_encoder(image).image_embeds
            if len(image_embeddings.shape) < 3:
                image_embeddings = image_embeddings.unsqueeze(1)
            return image_embeddings

        image_embeddings = []
        for patch in patches:
            image_embeddings.append(encode_image(patch))
        image_embeddings = torch.cat(image_embeddings, dim=1)

        # duplicate image embeddings for each generation per prompt, using mps friendly method
        # import pdb
        # pdb.set_trace()
        bs_embed, seq_len, _ = image_embeddings.shape
        image_embeddings = image_embeddings.repeat(1, num_videos_per_prompt, 1)
        image_embeddings = image_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)

        if do_classifier_free_guidance:
            negative_image_embeddings = torch.zeros_like(image_embeddings)

            # For classifier free guidance, we need to do two forward passes.
            # Here we concatenate the unconditional and text embeddings into a single batch
            # to avoid doing two forward passes
            image_embeddings = torch.cat([negative_image_embeddings, image_embeddings])

        return image_embeddings

    def ecnode_video_vae(self, images, chunk_size: int = 14):
        if isinstance(images, list):
            width, height = images[0].size
        else:
            height, width = images[0].shape[:2]
        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
        if needs_upcasting:
            self.vae.to(dtype=torch.float32)

        device = self._execution_device
        images = self.video_processor.preprocess_video(images, height=height, width=width).to(device, self.vae.dtype)  # torch type in range(-1, 1) with (1,3,h,w)
        images = images.squeeze(0)  # from (1, c, t, h, w) -> (c, t, h, w)
        images = images.permute(1,0,2,3)  # c, t, h, w -> (t, c, h, w)

        video_latents = []
        # chunk_size = 14
        for i in range(0, images.shape[0], chunk_size):
            video_latents.append(self.vae.encode(images[i : i + chunk_size]).latent_dist.mode())
        image_latents = torch.cat(video_latents)

        # cast back to fp16 if needed
        if needs_upcasting:
            self.vae.to(dtype=torch.float16)

        return image_latents

    def pad_image(self, images, scale=64):
        def get_pad(newW, W):
            pad_W = (newW - W) // 2
            if W % 2 == 1:
                pad_Ws = [pad_W, pad_W + 1]
            else:
                pad_Ws = [pad_W, pad_W]
            return pad_Ws

        if type(images[0]) is np.ndarray:
            H, W = images[0].shape[:2]
        else:
            W, H = images[0].size

        if W % scale == 0 and H % scale == 0:
            return images, None
        newW = int(np.ceil(W / scale) * scale)
        newH = int(np.ceil(H / scale) * scale)

        pad_Ws = get_pad(newW, W)
        pad_Hs = get_pad(newH, H)

        new_images = []
        for image in images:
            if type(image) is np.ndarray:
                image = cv2.copyMakeBorder(image, *pad_Hs, *pad_Ws, cv2.BORDER_CONSTANT, value=(1.,1.,1.))
                new_images.append(image)
            else:
                image = np.array(image)
                image = cv2.copyMakeBorder(image, *pad_Hs, *pad_Ws, cv2.BORDER_CONSTANT, value=(255,255,255))
                new_images.append(Image.fromarray(image))
        return new_images, pad_Hs+pad_Ws

    def unpad_image(self, v, pad_HWs):
        t, b, l, r = pad_HWs
        if t > 0 or b > 0:
            v = v[:, :, t:-b]
        if l > 0 or r > 0:
            v = v[:, :, :, l:-r]
        return v

    @torch.no_grad()
    def __call__(
        self,
        images: Union[PIL.Image.Image, List[PIL.Image.Image], torch.FloatTensor],
        decode_chunk_size: Optional[int] = None,
        time_step_size: Optional[int] = 1,
        window_size: Optional[int] = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        return_dict: bool = True
    ):
        images, pad_HWs = self.pad_image(images)

        # 0. Default height and width to unet
        width, height = images[0].size
        num_frames = len(images)

        # 1. Check inputs. Raise error if not correct
        self.check_inputs(images, height, width)

        # 2. Define call parameters
        batch_size = 1
        device = self._execution_device
        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        self._guidance_scale = 1.0
        num_videos_per_prompt = 1
        do_classifier_free_guidance = False
        num_inference_steps = 1
        fps = 7
        motion_bucket_id = 127
        noise_aug_strength = 0.
        num_videos_per_prompt = 1
        output_type = "np"
        data_keys = ["normal"]
        use_linear_merge = True
        determineTrain = True
        encode_image_scale = 1
        encode_image_WH = None

        decode_chunk_size = decode_chunk_size if decode_chunk_size is not None else 7

        # 3. Encode input image using using clip. (num_image * num_videos_per_prompt, 1, 1024)
        image_embeddings = self._encode_image(images, device, num_videos_per_prompt, do_classifier_free_guidance=do_classifier_free_guidance, scale=encode_image_scale, image_size=encode_image_WH)
        # 4. Encode input image using VAE
        image_latents = self.ecnode_video_vae(images, chunk_size=decode_chunk_size).to(image_embeddings.dtype)

        # image_latents [num_frames, channels, height, width] ->[1, num_frames, channels, height, width]
        image_latents = image_latents.unsqueeze(0)

        # 5. Get Added Time IDs
        added_time_ids = self._get_add_time_ids(
            fps,
            motion_bucket_id,
            noise_aug_strength,
            image_embeddings.dtype,
            batch_size,
            num_videos_per_prompt,
            do_classifier_free_guidance,
        )
        added_time_ids = added_time_ids.to(device)

        # get Start and End frame idx for each window
        def get_ses(num_frames):
            ses = []
            for i in range(0, num_frames, time_step_size):
                ses.append([i, i+window_size])
            num_to_remain = 0
            for se in ses:
                if se[1] > num_frames:
                    continue
                num_to_remain += 1
            ses = ses[:num_to_remain]

            if ses[-1][-1] < num_frames:
                ses.append([num_frames - window_size, num_frames])
            return ses
        ses = get_ses(num_frames)

        pred = None
        for i, se in enumerate(ses):
            window_num_frames = window_size
            window_image_embeddings = image_embeddings[se[0]:se[1]]
            window_image_latents = image_latents[:, se[0]:se[1]]
            window_added_time_ids = added_time_ids
            # import pdb
            # pdb.set_trace()
            if i == 0 or time_step_size == window_size:
                to_replace_latents = None
            else:
                last_se = ses[i-1]
                num_to_replace_latents = last_se[1] - se[0]
                to_replace_latents = pred[:, -num_to_replace_latents:]

            latents = self.generate(
                num_inference_steps,
                device,
                batch_size,
                num_videos_per_prompt,
                window_num_frames,
                height,
                width,
                window_image_embeddings,
                generator,
                determineTrain,
                to_replace_latents,
                do_classifier_free_guidance,
                window_image_latents,
                window_added_time_ids
            )

            # merge last_latents and current latents in overlap window
            if to_replace_latents is not None and use_linear_merge:
                num_img_condition = to_replace_latents.shape[1]
                weight = torch.linspace(1., 0., num_img_condition+2)[1:-1].to(device)
                weight = weight[None, :, None, None, None]
                latents[:, :num_img_condition] = to_replace_latents * weight + latents[:, :num_img_condition] * (1 - weight)

            if pred is None:
                pred = latents
            else:
                pred = torch.cat([pred[:, :se[0]], latents], dim=1)

        if not output_type == "latent":
            # cast back to fp16 if needed
            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
            if needs_upcasting:
                self.vae.to(dtype=torch.float16)
            # latents has shape (1, num_frames, 12, h, w)

            def decode_latents(latents, num_frames, decode_chunk_size):
                frames = self.decode_latents(latents, num_frames, decode_chunk_size)  # in range(-1, 1)
                frames = self.video_processor.postprocess_video(video=frames, output_type="np")
                frames = frames * 2 - 1  # from range(0, 1) -> range(-1, 1)
                return frames

            frames = decode_latents(pred, num_frames, decode_chunk_size)
            if pad_HWs is not None:
                frames = self.unpad_image(frames, pad_HWs)
        else:
            frames = pred

        self.maybe_free_model_hooks()

        if not return_dict:
            return frames

        return StableVideoDiffusionPipelineOutput(frames=frames)


    def generate(
        self,
        num_inference_steps,
        device,
        batch_size,
        num_videos_per_prompt,
        num_frames,
        height,
        width,
        image_embeddings,
        generator,
        determineTrain,
        to_replace_latents,
        do_classifier_free_guidance,
        image_latents,
        added_time_ids,
        latents=None,
    ):
        # 6. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps

        # 7. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
        latents = self.prepare_latents(
            batch_size * num_videos_per_prompt,
            num_frames,
            num_channels_latents,
            height,
            width,
            image_embeddings.dtype,
            device,
            generator,
            latents,
        )
        if determineTrain:
            latents[...] = 0.

        # 8. Denoising loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
        self._num_timesteps = len(timesteps)
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # replace part of latents with conditons. ToDo: t embedding should also replace
                if to_replace_latents is not None:
                    num_img_condition = to_replace_latents.shape[1]
                    if not determineTrain:
                        _noise = randn_tensor(to_replace_latents.shape, generator=generator, device=device, dtype=image_embeddings.dtype)
                        noisy_to_replace_latents = self.scheduler.add_noise(to_replace_latents, _noise, t.unsqueeze(0))
                        latents[:, :num_img_condition] = noisy_to_replace_latents
                    else:
                        latents[:, :num_img_condition] = to_replace_latents

                # expand the latents if we are doing classifier free guidance
                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
                timestep = t
                # Concatenate image_latents over channels dimention
                latent_model_input = torch.cat([latent_model_input, image_latents], dim=2)
                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input,
                    timestep,
                    encoder_hidden_states=image_embeddings,
                    added_time_ids=added_time_ids,
                    return_dict=False,
                )[0]

                # perform guidance
                if do_classifier_free_guidance:
                    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
                    noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)

                # compute the previous noisy sample x_t -> x_t-1
                scheduler_output = self.scheduler.step(noise_pred, t, latents)
                latents = scheduler_output.prev_sample

                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

        return latents
# resizing utils
# TODO: clean up later
def _resize_with_antialiasing(input, size, interpolation="bicubic", align_corners=True):
    h, w = input.shape[-2:]
    factors = (h / size[0], w / size[1])

    # First, we have to determine sigma
    # Taken from skimage: https://github.com/scikit-image/scikit-image/blob/v0.19.2/skimage/transform/_warps.py#L171
    sigmas = (
        max((factors[0] - 1.0) / 2.0, 0.001),
        max((factors[1] - 1.0) / 2.0, 0.001),
    )

    # Now kernel size. Good results are for 3 sigma, but that is kind of slow. Pillow uses 1 sigma
    # https://github.com/python-pillow/Pillow/blob/master/src/libImaging/Resample.c#L206
    # But they do it in the 2 passes, which gives better results. Let's try 2 sigmas for now
    ks = int(max(2.0 * 2 * sigmas[0], 3)), int(max(2.0 * 2 * sigmas[1], 3))

    # Make sure it is odd
    if (ks[0] % 2) == 0:
        ks = ks[0] + 1, ks[1]

    if (ks[1] % 2) == 0:
        ks = ks[0], ks[1] + 1

    input = _gaussian_blur2d(input, ks, sigmas)

    output = torch.nn.functional.interpolate(input, size=size, mode=interpolation, align_corners=align_corners)
    return output


def _compute_padding(kernel_size):
    """Compute padding tuple."""
    # 4 or 6 ints: (padding_left, padding_right,padding_top,padding_bottom)
    # https://pytorch.org/docs/stable/nn.html#torch.nn.functional.pad
    if len(kernel_size) < 2:
        raise AssertionError(kernel_size)
    computed = [k - 1 for k in kernel_size]

    # for even kernels we need to do asymmetric padding :(
    out_padding = 2 * len(kernel_size) * [0]

    for i in range(len(kernel_size)):
        computed_tmp = computed[-(i + 1)]

        pad_front = computed_tmp // 2
        pad_rear = computed_tmp - pad_front

        out_padding[2 * i + 0] = pad_front
        out_padding[2 * i + 1] = pad_rear

    return out_padding


def _filter2d(input, kernel):
    # prepare kernel
    b, c, h, w = input.shape
    tmp_kernel = kernel[:, None, ...].to(device=input.device, dtype=input.dtype)

    tmp_kernel = tmp_kernel.expand(-1, c, -1, -1)

    height, width = tmp_kernel.shape[-2:]

    padding_shape: list[int] = _compute_padding([height, width])
    input = torch.nn.functional.pad(input, padding_shape, mode="reflect")

    # kernel and input tensor reshape to align element-wise or batch-wise params
    tmp_kernel = tmp_kernel.reshape(-1, 1, height, width)
    input = input.view(-1, tmp_kernel.size(0), input.size(-2), input.size(-1))

    # convolve the tensor with the kernel.
    output = torch.nn.functional.conv2d(input, tmp_kernel, groups=tmp_kernel.size(0), padding=0, stride=1)

    out = output.view(b, c, h, w)
    return out


def _gaussian(window_size: int, sigma):
    if isinstance(sigma, float):
        sigma = torch.tensor([[sigma]])

    batch_size = sigma.shape[0]

    x = (torch.arange(window_size, device=sigma.device, dtype=sigma.dtype) - window_size // 2).expand(batch_size, -1)

    if window_size % 2 == 0:
        x = x + 0.5

    gauss = torch.exp(-x.pow(2.0) / (2 * sigma.pow(2.0)))

    return gauss / gauss.sum(-1, keepdim=True)


def _gaussian_blur2d(input, kernel_size, sigma):
    if isinstance(sigma, tuple):
        sigma = torch.tensor([sigma], dtype=input.dtype)
    else:
        sigma = sigma.to(dtype=input.dtype)

    ky, kx = int(kernel_size[0]), int(kernel_size[1])
    bs = sigma.shape[0]
    kernel_x = _gaussian(kx, sigma[:, 1].view(bs, 1))
    kernel_y = _gaussian(ky, sigma[:, 0].view(bs, 1))
    out_x = _filter2d(input, kernel_x[..., None, :])
    out = _filter2d(out_x, kernel_y[..., None])

    return out
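To make the windowed inference in __call__ above concrete, here is a small standalone illustration (not part of the repository) of the schedule produced by get_ses and the linear weights used to blend latents in the overlap between consecutive windows:

```python
# Standalone illustration of the window schedule and linear-merge weights used
# by NormalCrafterPipeline.__call__ above; logic is rewritten equivalently here.
import torch

def get_ses(num_frames, window_size=14, time_step_size=10):
    # Windows start every time_step_size frames; windows running past the end are
    # dropped, and a final window is appended so the tail frames are still covered.
    ses = [[i, i + window_size] for i in range(0, num_frames, time_step_size)]
    ses = [se for se in ses if se[1] <= num_frames]
    if ses[-1][-1] < num_frames:
        ses.append([num_frames - window_size, num_frames])
    return ses

print(get_ses(30))  # [[0, 14], [10, 24], [16, 30]]

# Consecutive regular windows overlap by window_size - time_step_size frames (4 here);
# in __call__ the actual per-pair overlap is computed as last_se[1] - se[0].
overlap = 4
weight = torch.linspace(1.0, 0.0, overlap + 2)[1:-1]
print(weight)  # tensor([0.8000, 0.6000, 0.4000, 0.2000])
# The previous window's latents are multiplied by these weights and the new window's
# by (1 - weight), so the old prediction dominates at the start of the overlap and
# the new one takes over toward its end.
```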
normalcrafter/unet.py
ADDED
@@ -0,0 +1,368 @@
from diffusers import UNetSpatioTemporalConditionModel
from diffusers.models.unets.unet_spatio_temporal_condition import UNetSpatioTemporalConditionOutput
from diffusers.utils import is_torch_version
import torch
from typing import Any, Dict, Optional, Tuple, Union

def create_custom_forward(module, return_dict=None):
    def custom_forward(*inputs):
        if return_dict is not None:
            return module(*inputs, return_dict=return_dict)
        else:
            return module(*inputs)

    return custom_forward
CKPT_KWARGS = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}


class DiffusersUNetSpatioTemporalConditionModelNormalCrafter(UNetSpatioTemporalConditionModel):

    @staticmethod
    def forward_crossattn_down_block_dino(
        module,
        hidden_states: torch.Tensor,
        temb: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        image_only_indicator: Optional[torch.Tensor] = None,
        dino_down_block_res_samples = None,
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]:
        output_states = ()
        self = module
        blocks = list(zip(self.resnets, self.attentions))
        for resnet, attn in blocks:
            if self.training and self.gradient_checkpointing:  # TODO
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(resnet),
                    hidden_states,
                    temb,
                    image_only_indicator,
                    **CKPT_KWARGS,
                )

                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(attn),
                    hidden_states,
                    encoder_hidden_states,
                    image_only_indicator,
                    False,
                    **CKPT_KWARGS,
                )[0]
            else:
                hidden_states = resnet(
                    hidden_states,
                    temb,
                    image_only_indicator=image_only_indicator,
                )
                hidden_states = attn(
                    hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    image_only_indicator=image_only_indicator,
                    return_dict=False,
                )[0]

            if dino_down_block_res_samples is not None:
                hidden_states += dino_down_block_res_samples.pop(0)

            output_states = output_states + (hidden_states,)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)
            if dino_down_block_res_samples is not None:
                hidden_states += dino_down_block_res_samples.pop(0)

            output_states = output_states + (hidden_states,)

        return hidden_states, output_states
    @staticmethod
    def forward_down_block_dino(
        module,
        hidden_states: torch.Tensor,
        temb: Optional[torch.Tensor] = None,
        image_only_indicator: Optional[torch.Tensor] = None,
        dino_down_block_res_samples = None,
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, ...]]:
        self = module
        output_states = ()
        for resnet in self.resnets:
            if self.training and self.gradient_checkpointing:
                if is_torch_version(">=", "1.11.0"):
                    hidden_states = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(resnet),
                        hidden_states,
                        temb,
                        image_only_indicator,
                        use_reentrant=False,
                    )
                else:
                    hidden_states = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(resnet),
                        hidden_states,
                        temb,
                        image_only_indicator,
                    )
            else:
                hidden_states = resnet(
                    hidden_states,
                    temb,
                    image_only_indicator=image_only_indicator,
                )
            if dino_down_block_res_samples is not None:
                hidden_states += dino_down_block_res_samples.pop(0)
            output_states = output_states + (hidden_states,)

        if self.downsamplers is not None:
            for downsampler in self.downsamplers:
                hidden_states = downsampler(hidden_states)
            if dino_down_block_res_samples is not None:
                hidden_states += dino_down_block_res_samples.pop(0)
            output_states = output_states + (hidden_states,)

        return hidden_states, output_states


    def forward(
        self,
        sample: torch.FloatTensor,
        timestep: Union[torch.Tensor, float, int],
        encoder_hidden_states: torch.Tensor,
        added_time_ids: torch.Tensor,
        return_dict: bool = True,
        image_controlnet_down_block_res_samples = None,
        image_controlnet_mid_block_res_sample = None,
        dino_down_block_res_samples = None,

    ) -> Union[UNetSpatioTemporalConditionOutput, Tuple]:
        r"""
        The [`UNetSpatioTemporalConditionModel`] forward method.

        Args:
            sample (`torch.FloatTensor`):
                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.FloatTensor`):
                The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
            added_time_ids: (`torch.FloatTensor`):
                The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal
                embeddings and added to the time embeddings.
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] instead of a plain
                tuple.
        Returns:
            [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] or `tuple`:
                If `return_dict` is True, an [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] is returned, otherwise
                a `tuple` is returned where the first element is the sample tensor.
        """
        if not hasattr(self, "custom_gradient_checkpointing"):
            self.custom_gradient_checkpointing = False

        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
            if isinstance(timestep, float):
                dtype = torch.float32 if is_mps else torch.float64
            else:
                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        batch_size, num_frames = sample.shape[:2]
        if len(timesteps.shape) == 1:
            timesteps = timesteps.expand(batch_size)
        else:
            timesteps = timesteps.reshape(batch_size * num_frames)
        t_emb = self.time_proj(timesteps)  # (B, C)

        # `Timesteps` does not contain any weights and will always return f32 tensors
        # but time_embedding might actually be running in fp16. so we need to cast here.
        # there might be better ways to encapsulate this.
        t_emb = t_emb.to(dtype=sample.dtype)

        emb = self.time_embedding(t_emb)  # (B, C)

        time_embeds = self.add_time_proj(added_time_ids.flatten())
        time_embeds = time_embeds.reshape((batch_size, -1))
        time_embeds = time_embeds.to(emb.dtype)
        aug_emb = self.add_embedding(time_embeds)
        if emb.shape[0] == 1:
            emb = emb + aug_emb
            # Repeat the embeddings num_video_frames times
            # emb: [batch, channels] -> [batch * frames, channels]
            emb = emb.repeat_interleave(num_frames, dim=0)
        else:
            aug_emb = aug_emb.repeat_interleave(num_frames, dim=0)
            emb = emb + aug_emb

        # Flatten the batch and frames dimensions
        # sample: [batch, frames, channels, height, width] -> [batch * frames, channels, height, width]
        sample = sample.flatten(0, 1)

        # encoder_hidden_states: [batch, 1, channels] -> [batch * frames, 1, channels]
        # here, our encoder_hidden_states is [batch * frames, 1, channels]

        if not sample.shape[0] == encoder_hidden_states.shape[0]:
            encoder_hidden_states = encoder_hidden_states.repeat_interleave(num_frames, dim=0)
        # 2. pre-process
        sample = self.conv_in(sample)

        image_only_indicator = torch.zeros(batch_size, num_frames, dtype=sample.dtype, device=sample.device)

        if dino_down_block_res_samples is not None:
            dino_down_block_res_samples = [x for x in dino_down_block_res_samples]
            sample += dino_down_block_res_samples.pop(0)

        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
            if dino_down_block_res_samples is None:
                if self.custom_gradient_checkpointing:
                    if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                        sample, res_samples = torch.utils.checkpoint.checkpoint(
                            create_custom_forward(downsample_block),
                            sample,
                            emb,
                            encoder_hidden_states,
                            image_only_indicator,
                            **CKPT_KWARGS,
                        )
                    else:
                        sample, res_samples = torch.utils.checkpoint.checkpoint(
                            create_custom_forward(downsample_block),
                            sample,
                            emb,
                            image_only_indicator,
                            **CKPT_KWARGS,
                        )
                else:
                    if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                        sample, res_samples = downsample_block(
                            hidden_states=sample,
                            temb=emb,
                            encoder_hidden_states=encoder_hidden_states,
                            image_only_indicator=image_only_indicator,
                        )
                    else:
                        sample, res_samples = downsample_block(
                            hidden_states=sample,
                            temb=emb,
                            image_only_indicator=image_only_indicator,
                        )
            else:
                if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                    sample, res_samples = self.forward_crossattn_down_block_dino(
                        downsample_block,
                        sample,
                        emb,
                        encoder_hidden_states,
                        image_only_indicator,
                        dino_down_block_res_samples,
                    )
                else:
                    sample, res_samples = self.forward_down_block_dino(
                        downsample_block,
                        sample,
                        emb,
                        image_only_indicator,
                        dino_down_block_res_samples,
                    )
            down_block_res_samples += res_samples

        if image_controlnet_down_block_res_samples is not None:
            new_down_block_res_samples = ()

            for down_block_res_sample, image_controlnet_down_block_res_sample in zip(
                down_block_res_samples, image_controlnet_down_block_res_samples
            ):
                down_block_res_sample = (down_block_res_sample + image_controlnet_down_block_res_sample) / 2
                new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,)

            down_block_res_samples = new_down_block_res_samples

        # 4. mid
        if self.custom_gradient_checkpointing:
            sample = torch.utils.checkpoint.checkpoint(
                create_custom_forward(self.mid_block),
                sample,
                emb,
                encoder_hidden_states,
                image_only_indicator,
                **CKPT_KWARGS,
            )
        else:
            sample = self.mid_block(
                hidden_states=sample,
                temb=emb,
                encoder_hidden_states=encoder_hidden_states,
                image_only_indicator=image_only_indicator,
            )

        if image_controlnet_mid_block_res_sample is not None:
            sample = (sample + image_controlnet_mid_block_res_sample) / 2

        # 5. up
        mid_up_block_out_samples = [sample, ]
        down_block_out_sampels = []
        for i, upsample_block in enumerate(self.up_blocks):
            res_samples = down_block_res_samples[-len(upsample_block.resnets):]
            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
            down_block_out_sampels.append(res_samples[-1])
            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
                if self.custom_gradient_checkpointing:
                    sample = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(upsample_block),
                        sample,
                        res_samples,
                        emb,
                        encoder_hidden_states,
                        image_only_indicator,
                        **CKPT_KWARGS
                    )
                else:
                    sample = upsample_block(
                        hidden_states=sample,
                        temb=emb,
                        res_hidden_states_tuple=res_samples,
                        encoder_hidden_states=encoder_hidden_states,
                        image_only_indicator=image_only_indicator,
                    )
            else:
                if self.custom_gradient_checkpointing:
                    sample = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(upsample_block),
                        sample,
                        res_samples,
                        emb,
                        image_only_indicator,
                        **CKPT_KWARGS
                    )
                else:
                    sample = upsample_block(
|
344 |
+
hidden_states=sample,
|
345 |
+
temb=emb,
|
346 |
+
res_hidden_states_tuple=res_samples,
|
347 |
+
image_only_indicator=image_only_indicator,
|
348 |
+
)
|
349 |
+
mid_up_block_out_samples.append(sample)
|
350 |
+
# 6. post-process
|
351 |
+
sample = self.conv_norm_out(sample)
|
352 |
+
sample = self.conv_act(sample)
|
353 |
+
if self.custom_gradient_checkpointing:
|
354 |
+
sample = torch.utils.checkpoint.checkpoint(
|
355 |
+
create_custom_forward(self.conv_out),
|
356 |
+
sample,
|
357 |
+
**CKPT_KWARGS
|
358 |
+
)
|
359 |
+
else:
|
360 |
+
sample = self.conv_out(sample)
|
361 |
+
|
362 |
+
# 7. Reshape back to original shape
|
363 |
+
sample = sample.reshape(batch_size, num_frames, *sample.shape[1:])
|
364 |
+
|
365 |
+
if not return_dict:
|
366 |
+
return (sample, down_block_out_sampels[::-1], mid_up_block_out_samples)
|
367 |
+
|
368 |
+
return UNetSpatioTemporalConditionOutput(sample=sample)
|
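For orientation, the shape handling in the forward pass above can be summarized with a small, self-contained sketch. This is an illustrative note rather than repo code; the channel and embedding sizes below are assumed typical Stable Video Diffusion values, not values taken from this diff.

import torch

# Hypothetical shapes for one forward call (sizes are assumptions for illustration only)
batch_size, num_frames = 1, 14
sample = torch.randn(batch_size, num_frames, 8, 64, 64)    # [B, F, C, H, W] latent + conditioning channels
encoder_hidden_states = torch.randn(batch_size, 1, 1024)   # one image embedding token per clip
added_time_ids = torch.randn(batch_size, 3)                 # fps / motion-bucket / noise-aug conditioning

# What the forward pass above does with these tensors:
#   sample.flatten(0, 1)        -> [B * F, C, H, W]
#   emb / aug_emb               -> repeat_interleave'd to [B * F, channels]
#   encoder_hidden_states       -> repeat_interleave'd to [B * F, 1, 1024] if batch sizes differ
#   final sample.reshape(...)   -> back to [B, F, C_out, H, W]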
normalcrafter/utils.py
ADDED
@@ -0,0 +1,64 @@
from typing import Union, List
import tempfile
import numpy as np
import PIL.Image
import matplotlib.cm as cm
import mediapy
import torch
from decord import VideoReader, cpu


def read_video_frames(video_path, process_length, target_fps, max_res):
    print("==> processing video: ", video_path)
    vid = VideoReader(video_path, ctx=cpu(0))
    print("==> original video shape: ", (len(vid), *vid.get_batch([0]).shape[1:]))
    original_height, original_width = vid.get_batch([0]).shape[1:3]

    if max(original_height, original_width) > max_res:
        scale = max_res / max(original_height, original_width)
        height = round(original_height * scale)
        width = round(original_width * scale)
    else:
        height = original_height
        width = original_width

    vid = VideoReader(video_path, ctx=cpu(0), width=width, height=height)

    fps = vid.get_avg_fps() if target_fps == -1 else target_fps
    stride = round(vid.get_avg_fps() / fps)
    stride = max(stride, 1)
    frames_idx = list(range(0, len(vid), stride))
    print(
        f"==> downsampled shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}, with stride: {stride}"
    )
    if process_length != -1 and process_length < len(frames_idx):
        frames_idx = frames_idx[:process_length]
    print(
        f"==> final processing shape: {len(frames_idx), *vid.get_batch([0]).shape[1:]}"
    )
    frames = vid.get_batch(frames_idx).asnumpy().astype(np.uint8)
    frames = [PIL.Image.fromarray(x) for x in frames]

    return frames, fps

def save_video(
    video_frames: Union[List[np.ndarray], List[PIL.Image.Image]],
    output_video_path: str = None,
    fps: int = 10,
    crf: int = 18,
) -> str:
    if output_video_path is None:
        output_video_path = tempfile.NamedTemporaryFile(suffix=".mp4").name

    if isinstance(video_frames[0], np.ndarray):
        video_frames = [(frame * 255).astype(np.uint8) for frame in video_frames]

    elif isinstance(video_frames[0], PIL.Image.Image):
        video_frames = [np.array(frame) for frame in video_frames]
    mediapy.write_video(output_video_path, video_frames, fps=fps, crf=crf)
    return output_video_path

def vis_sequence_normal(normals: np.ndarray):
    normals = normals.clip(-1., 1.)
    normals = normals * 0.5 + 0.5
    return normals
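A minimal usage sketch of these helpers follows. The video path is a placeholder and the `normals` array is synthetic; only the function signatures come from the file above.

import numpy as np
from normalcrafter.utils import read_video_frames, save_video, vis_sequence_normal

# Decode a clip, capping the long side at 1024 px and keeping the source FPS (target_fps=-1).
frames, fps = read_video_frames("path/to/video.mp4", process_length=60, target_fps=-1, max_res=1024)

# Stand-in for pipeline output: per-frame normals in [-1, 1], shaped [T, H, W, 3].
normals = np.random.uniform(-1.0, 1.0, size=(len(frames), frames[0].height, frames[0].width, 3))

vis = vis_sequence_normal(normals)               # rescaled to [0, 1] for visualization
out_path = save_video(vis, "normals_vis.mp4", fps=round(fps))
print(out_path)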
requirements.txt
ADDED
@@ -0,0 +1,11 @@
torch==2.0.1
diffusers==0.29.1
numpy==1.26.4
matplotlib==3.8.4
transformers==4.41.2
accelerate==0.30.1
xformers==0.0.20
mediapy==1.2.0
fire==0.6.0
decord==0.6.0
OpenEXR==3.2.4
run.py
ADDED
@@ -0,0 +1,174 @@
import gc
import os
import numpy as np
import torch

from diffusers.training_utils import set_seed
from diffusers import AutoencoderKLTemporalDecoder
from fire import Fire

from normalcrafter.normal_crafter_ppl import NormalCrafterPipeline
from normalcrafter.unet import DiffusersUNetSpatioTemporalConditionModelNormalCrafter
from normalcrafter.utils import vis_sequence_normal, save_video, read_video_frames


class DepthCrafterDemo:
    def __init__(
        self,
        unet_path: str,
        pre_train_path: str,
        cpu_offload: str = "model",
    ):
        unet = DiffusersUNetSpatioTemporalConditionModelNormalCrafter.from_pretrained(
            unet_path,
            subfolder="unet",
            low_cpu_mem_usage=True,
        )
        vae = AutoencoderKLTemporalDecoder.from_pretrained(
            unet_path, subfolder="vae"
        )
        weight_dtype = torch.float16
        vae.to(dtype=weight_dtype)
        unet.to(dtype=weight_dtype)
        # load weights of other components from the provided checkpoint
        self.pipe = NormalCrafterPipeline.from_pretrained(
            pre_train_path,
            unet=unet,
            vae=vae,
            torch_dtype=weight_dtype,
            variant="fp16",
        )

        # to save memory, we can offload the model to CPU, or even run it sequentially to save even more
        if cpu_offload is not None:
            if cpu_offload == "sequential":
                # This is slower, but saves more memory
                self.pipe.enable_sequential_cpu_offload()
            elif cpu_offload == "model":
                self.pipe.enable_model_cpu_offload()
            else:
                raise ValueError(f"Unknown cpu offload option: {cpu_offload}")
        else:
            self.pipe.to("cuda")
        # enable xformers memory-efficient attention (attention slicing is left disabled below)
        try:
            self.pipe.enable_xformers_memory_efficient_attention()
        except Exception as e:
            print(e)
            print("Xformers is not enabled")
        # self.pipe.enable_attention_slicing()

    def infer(
        self,
        video: str,
        save_folder: str = "./demo_output",
        window_size: int = 14,
        time_step_size: int = 10,
        process_length: int = 195,
        decode_chunk_size: int = 7,
        max_res: int = 1024,
        dataset: str = "open",
        target_fps: int = 15,
        seed: int = 42,
        save_npz: bool = False,
    ):
        set_seed(seed)

        frames, target_fps = read_video_frames(
            video,
            process_length,
            target_fps,
            max_res,
        )
        # run inference with the NormalCrafter pipeline
        with torch.inference_mode():
            res = self.pipe(
                frames,
                decode_chunk_size=decode_chunk_size,
                time_step_size=time_step_size,
                window_size=window_size,
            ).frames[0]
        # visualize the normal maps and save the results
        vis = vis_sequence_normal(res)
        # save the input frames and normal visualization at the target FPS
        save_path = os.path.join(
            save_folder, os.path.splitext(os.path.basename(video))[0]
        )
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        save_video(vis, save_path + "_vis.mp4", fps=target_fps)
        save_video(frames, save_path + "_input.mp4", fps=target_fps)
        if save_npz:
            np.savez_compressed(save_path + ".npz", depth=res)

        return [
            save_path + "_input.mp4",
            save_path + "_vis.mp4",
        ]

    def run(
        self,
        input_video,
        num_denoising_steps,
        guidance_scale,
        max_res=1024,
        process_length=195,
    ):
        res_path = self.infer(
            input_video,
            num_denoising_steps,
            guidance_scale,
            max_res=max_res,
            process_length=process_length,
        )
        # clear the cache for the next video
        gc.collect()
        torch.cuda.empty_cache()
        return res_path[:2]


def main(
    video_path: str,
    save_folder: str = "./demo_output",
    unet_path: str = "Yanrui95/NormalCrafter",
    pre_train_path: str = "stabilityai/stable-video-diffusion-img2vid-xt",
    process_length: int = -1,
    cpu_offload: str = "model",
    target_fps: int = -1,
    seed: int = 42,
    window_size: int = 14,
    time_step_size: int = 10,
    max_res: int = 1024,
    dataset: str = "open",
    save_npz: bool = False
):
    depthcrafter_demo = DepthCrafterDemo(
        unet_path=unet_path,
        pre_train_path=pre_train_path,
        cpu_offload=cpu_offload,
    )
    # process the videos; the video paths are separated by commas
    video_paths = video_path.split(",")
    for video in video_paths:
        depthcrafter_demo.infer(
            video,
            save_folder=save_folder,
            window_size=window_size,
            process_length=process_length,
            time_step_size=time_step_size,
            max_res=max_res,
            dataset=dataset,
            target_fps=target_fps,
            seed=seed,
            save_npz=save_npz,
        )
        # clear the cache for the next video
        gc.collect()
        torch.cuda.empty_cache()


if __name__ == "__main__":
    # running configs
    # the most important arguments for memory saving are `cpu_offload`, `max_res`, and `window_size`
    # the most important arguments for the quality/speed trade-off are
    # `window_size`, `time_step_size`, and `max_res`
    Fire(main)
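For reference, a programmatic call equivalent to the Fire CLI above might look like the following sketch; the video path is a placeholder and the remaining values simply restate the defaults of `main`.

# Roughly equivalent to: python run.py --video_path path/to/video.mp4 --save_folder ./demo_output
from run import main

main(
    video_path="path/to/video.mp4",   # placeholder; pass several videos as a comma-separated string
    save_folder="./demo_output",
    cpu_offload="model",              # "model", "sequential", or None to keep everything on GPU
    window_size=14,
    time_step_size=10,
    max_res=1024,
)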