Spaces:

uw-insight-lab
/

Probing-Vis-Literacy-of-VLMs

Paused

App Files Files Community

AustingDong committed on Mar 4

Commit

1ca9e3b

1 Parent(s): d95dc04

init

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitignore +421 -0
.gradio/certificate.pem +31 -0
Dockerfile +41 -0
LICENSE-CODE +21 -0
LICENSE-MODEL +91 -0
Makefile +99 -0
app.py +398 -0
demo/Janus_colab_demo.ipynb +0 -0
demo/app.py +224 -0
demo/app_janusflow.py +247 -0
demo/app_januspro.py +294 -0
demo/app_vqa.py +333 -0
demo/cam.py +486 -0
demo/demo.ipynb +0 -0
demo/demo_attn.ipynb +0 -0
demo/fastapi_app.py +178 -0
demo/fastapi_client.py +78 -0
demo/model_utils.py +208 -0
demo/modify_llama.py +11 -0
demo/visualize_architecture.ipynb +1715 -0
images/AreaChart.png +0 -0
images/BarChart.png +0 -0
images/BubbleChart.png +0 -0
images/Choropleth_New.png +0 -0
images/Histogram.png +0 -0
images/LineChart.png +0 -0
images/PieChart.png +0 -0
images/Scatterplot.png +0 -0
images/Stacked100.png +0 -0
images/StackedArea.png +0 -0
images/StackedBar.png +0 -0
images/TreeMap.png +0 -0
images/badge.svg +1 -0
images/cat_dog.png +0 -0
images/doge.png +0 -0
images/equation.png +0 -0
images/logo.png +0 -0
images/logo.svg +22 -0
images/pie_chart.png +0 -0
images/ve.png +0 -0
janus/__init__.py +31 -0
janus/janusflow/__init__.py +31 -0
janus/janusflow/models/__init__.py +28 -0
janus/janusflow/models/clip_encoder.py +122 -0
janus/janusflow/models/image_processing_vlm.py +208 -0
janus/janusflow/models/modeling_vlm.py +226 -0
janus/janusflow/models/processing_vlm.py +455 -0
janus/janusflow/models/siglip_vit.py +691 -0
janus/janusflow/models/uvit.py +714 -0
janus/models/__init__.py +28 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,421 @@

+##### Python.gitignore #####
+# Byte-compiled / optimized / DLL files
+**/__pycache__/
+*.pyc
+*.pyo
+*.pyd
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+wheelhouse/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+*.whl
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+docs/source/_build/
+_autosummary/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+.python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# ruff
+.ruff_cache/
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+##### macOS.gitignore #####
+# General
+.DS_Store
+.AppleDouble
+.LSOverride
+# Icon must end with two \r
+Icon
+# Thumbnails
+._*
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+.com.apple.timemachine.donotpresent
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+##### Linux.gitignore #####
+*~
+# Temporary files which can be created if a process still has a handle open of a deleted file
+.fuse_hidden*
+# KDE directory preferences
+.directory
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+# .nfs files are created when an open file is removed but is still being accessed
+.nfs*
+##### Windows.gitignore #####
+# Windows thumbnail cache files
+Thumbs.db
+Thumbs.db:encryptable
+ehthumbs.db
+ehthumbs_vista.db
+# Dump file
+*.stackdump
+# Folder config file
+[Dd]esktop.ini
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+# Windows Installer files
+*.cab
+*.msi
+*.msix
+*.msm
+*.msp
+# Windows shortcuts
+*.lnk
+##### Archives.gitignore #####
+# It's better to unpack these files and commit the raw source because
+# git has its own built in compression methods.
+*.7z
+*.jar
+*.rar
+*.zip
+*.gz
+*.gzip
+*.tgz
+*.bzip
+*.bzip2
+*.bz2
+*.xz
+*.lzma
+*.cab
+*.xar
+# Packing-only formats
+*.iso
+*.tar
+# Package management formats
+*.dmg
+*.xpi
+*.gem
+*.egg
+*.deb
+*.rpm
+*.msi
+*.msm
+*.msp
+*.txz
+##### Xcode.gitignore #####
+# Xcode
+#
+# gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore
+## User settings
+xcuserdata/
+## Compatibility with Xcode 8 and earlier (ignoring not required starting Xcode 9)
+*.xcscmblueprint
+*.xccheckout
+## Compatibility with Xcode 3 and earlier (ignoring not required starting Xcode 4)
+build/
+DerivedData/
+*.moved-aside
+*.pbxuser
+!default.pbxuser
+*.mode1v3
+!default.mode1v3
+*.mode2v3
+!default.mode2v3
+*.perspectivev3
+!default.perspectivev3
+## Gcc Patch
+/*.gcno
+##### JetBrains.gitignore #####
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+# User settings
+.idea/*
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+# Generated files
+.idea/**/contentModel.xml
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn. Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+# CMake
+cmake-build-*/
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+# File-based project format
+*.iws
+# IntelliJ
+out/
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+# JIRA plugin
+atlassian-ide-plugin.xml
+# Cursive Clojure plugin
+.idea/replstate.xml
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+# Editor-based Rest Client
+.idea/httpRequests
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+##### VisualStudioCode.gitignore #####
+.vscode/*
+# !.vscode/settings.json
+# !.vscode/tasks.json
+# !.vscode/launch.json
+!.vscode/extensions.json
+*.code-workspace
+# Local History for Visual Studio Code
+.history/
+##### Vim.gitignore #####
+# Swap
+.*.s[a-v][a-z]
+# Keep vector files un-ignored; remove the next line if you don't need them.
+# (gitignore has no trailing comments, so this note must sit on its own line,
+# otherwise it becomes part of the pattern and the negation never matches.)
+!*.svg
+.*.sw[a-p]
+.s[a-rt-v][a-z]
+.ss[a-gi-z]
+.sw[a-p]
+# Session
+Session.vim
+Sessionx.vim
+# Temporary
+.netrwhist
+*~
+# Auto-generated tag files
+tags
+# Persistent undo
+[._]*.un~
+.vscode
+.github
+generated_samples/

.gradio/certificate.pem ADDED Viewed

	@@ -0,0 +1,31 @@

+-----BEGIN CERTIFICATE-----
+MIIFazCCA1OgAwIBAgIRAIIQz7DSQONZRGPgu2OCiwAwDQYJKoZIhvcNAQELBQAw
+TzELMAkGA1UEBhMCVVMxKTAnBgNVBAoTIEludGVybmV0IFNlY3VyaXR5IFJlc2Vh
+cmNoIEdyb3VwMRUwEwYDVQQDEwxJU1JHIFJvb3QgWDEwHhcNMTUwNjA0MTEwNDM4
+WhcNMzUwNjA0MTEwNDM4WjBPMQswCQYDVQQGEwJVUzEpMCcGA1UEChMgSW50ZXJu
+ZXQgU2VjdXJpdHkgUmVzZWFyY2ggR3JvdXAxFTATBgNVBAMTDElTUkcgUm9vdCBY
+MTCCAiIwDQYJKoZIhvcNAQEBBQADggIPADCCAgoCggIBAK3oJHP0FDfzm54rVygc
+h77ct984kIxuPOZXoHj3dcKi/vVqbvYATyjb3miGbESTtrFj/RQSa78f0uoxmyF+
+0TM8ukj13Xnfs7j/EvEhmkvBioZxaUpmZmyPfjxwv60pIgbz5MDmgK7iS4+3mX6U
+A5/TR5d8mUgjU+g4rk8Kb4Mu0UlXjIB0ttov0DiNewNwIRt18jA8+o+u3dpjq+sW
+T8KOEUt+zwvo/7V3LvSye0rgTBIlDHCNAymg4VMk7BPZ7hm/ELNKjD+Jo2FR3qyH
+B5T0Y3HsLuJvW5iB4YlcNHlsdu87kGJ55tukmi8mxdAQ4Q7e2RCOFvu396j3x+UC
+B5iPNgiV5+I3lg02dZ77DnKxHZu8A/lJBdiB3QW0KtZB6awBdpUKD9jf1b0SHzUv
+KBds0pjBqAlkd25HN7rOrFleaJ1/ctaJxQZBKT5ZPt0m9STJEadao0xAH0ahmbWn
+OlFuhjuefXKnEgV4We0+UXgVCwOPjdAvBbI+e0ocS3MFEvzG6uBQE3xDk3SzynTn
+jh8BCNAw1FtxNrQHusEwMFxIt4I7mKZ9YIqioymCzLq9gwQbooMDQaHWBfEbwrbw
+qHyGO0aoSCqI3Haadr8faqU9GY/rOPNk3sgrDQoo//fb4hVC1CLQJ13hef4Y53CI
+rU7m2Ys6xt0nUW7/vGT1M0NPAgMBAAGjQjBAMA4GA1UdDwEB/wQEAwIBBjAPBgNV
+HRMBAf8EBTADAQH/MB0GA1UdDgQWBBR5tFnme7bl5AFzgAiIyBpY9umbbjANBgkq
+hkiG9w0BAQsFAAOCAgEAVR9YqbyyqFDQDLHYGmkgJykIrGF1XIpu+ILlaS/V9lZL
+ubhzEFnTIZd+50xx+7LSYK05qAvqFyFWhfFQDlnrzuBZ6brJFe+GnY+EgPbk6ZGQ
+3BebYhtF8GaV0nxvwuo77x/Py9auJ/GpsMiu/X1+mvoiBOv/2X/qkSsisRcOj/KK
+NFtY2PwByVS5uCbMiogziUwthDyC3+6WVwW6LLv3xLfHTjuCvjHIInNzktHCgKQ5
+ORAzI4JMPJ+GslWYHb4phowim57iaztXOoJwTdwJx4nLCgdNbOhdjsnvzqvHu7Ur
+TkXWStAmzOVyyghqpZXjFaH3pO3JLF+l+/+sKAIuvtd7u+Nxe5AW0wdeRlN8NwdC
+jNPElpzVmbUq4JUagEiuTDkHzsxHpFKVK7q4+63SM1N95R1NbdWhscdCb+ZAJzVc
+oyi3B43njTOQ5yOf+1CceWxG1bQVs5ZufpsMljq4Ui0/1lvh+wjChP4kqKOJ2qxq
+4RgqsahDYVvTH9w7jXbyLeiNdd8XM2w9U/t7y0Ff/9yi0GE44Za4rF2LN9d11TPA
+mRGunUHBcnWEvgJBQl9nJEiU0Zsnvgc/ubhPgXRR4Xq37Z0j4r7g1SgEEzwxA57d
+emyPxgcYxn/eR44/KJ4EBs+lVDR3veyJm+kXQ99b21/+jh5Xos1AnX5iItreGCc=
+-----END CERTIFICATE-----

Dockerfile ADDED Viewed

	@@ -0,0 +1,41 @@

+# Image for the Gradio demo: installs the project and runs app.py as an unprivileged user.
+FROM python:3.10
+COPY ./requirements-gradio.txt /code/requirements-gradio.txt
+# Create the unprivileged runtime user (the original RUN also invoked apt-get
+# with an empty package list, which installed nothing)
+RUN useradd -m -u 1000 user
+# Install OpenGL and other dependencies required for OpenCV
+# (libgl1 supplies libGL.so.1; the older transitional package libgl1-mesa-glx
+# is no longer installable on current Debian-based python images)
+RUN apt-get update && apt-get install -y \
+    libgl1 \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+# Switch to "user" before installing dependencies
+USER user
+# HOME gets its own ENV instruction: variables are substituted with the value
+# they had *before* the current instruction, so $HOME below would otherwise
+# expand to nothing and PYTHONPATH would end up as "/app".
+ENV HOME=/home/user
+ENV PATH=/home/user/.local/bin:$PATH \
+    PYTHONPATH=$HOME/app \
+    PYTHONUNBUFFERED=1 \
+    GRADIO_ALLOW_FLAGGING=never \
+    GRADIO_NUM_PORTS=1 \
+    GRADIO_SERVER_NAME=0.0.0.0 \
+    GRADIO_THEME=huggingface \
+    SYSTEM=spaces
+WORKDIR $HOME/app
+# Copy project files as "user" before installing dependencies
+COPY --chown=user . $HOME/app
+COPY --chown=user ./images /home/user/app/images
+# Install dependencies as "user"
+RUN pip install --no-cache-dir --user -e .
+RUN pip install --no-cache-dir --user opencv-python
+RUN pip install --no-cache-dir --user -r /code/requirements-gradio.txt
+# Build-time sanity check: list the bundled example images
+RUN ls -l /home/user/app/images/
+CMD ["python", "app.py"]

LICENSE-CODE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2023 DeepSeek
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

LICENSE-MODEL ADDED Viewed

	@@ -0,0 +1,91 @@

+DEEPSEEK LICENSE AGREEMENT
+Version 1.0, 23 October 2023
+Copyright (c) 2023 DeepSeek
+Section I: PREAMBLE
+Large generative models are being widely adopted and used, and have the potential to transform the way individuals conceive and benefit from AI or ML technologies.
+Notwithstanding the current and potential benefits that these artifacts can bring to society at large, there are also concerns about potential misuses of them, either due to their technical limitations or ethical considerations.
+In short, this license strives for both the open and responsible downstream use of the accompanying model. When it comes to the open character, we took inspiration from open source permissive licenses regarding the grant of IP rights. Referring to the downstream responsible use, we added use-based restrictions not permitting the use of the model in very specific scenarios, in order for the licensor to be able to enforce the license in case potential misuses of the Model may occur. At the same time, we strive to promote open and responsible research on generative models for content generation.
+Even though downstream derivative versions of the model could be released under different licensing terms, the latter will always have to include - at minimum - the same use-based restrictions as the ones in the original license (this license). We believe in the intersection between open and responsible AI development; thus, this agreement aims to strike a balance between both in order to enable responsible open-science in the field of AI.
+This License governs the use of the model (and its derivatives) and is informed by the model card associated with the model.
+NOW THEREFORE, You and DeepSeek agree as follows:
+1. Definitions
+"License" means the terms and conditions for use, reproduction, and Distribution as defined in this document.
+"Data" means a collection of information and/or content extracted from the dataset used with the Model, including to train, pretrain, or otherwise evaluate the Model. The Data is not licensed under this License.
+"Output" means the results of operating a Model as embodied in informational content resulting therefrom.
+"Model" means any accompanying machine-learning based assemblies (including checkpoints), consisting of learnt weights, parameters (including optimizer states), corresponding to the model architecture as embodied in the Complementary Material, that have been trained or tuned, in whole or in part on the Data, using the Complementary Material.
+"Derivatives of the Model" means all modifications to the Model, works based on the Model, or any other model which is created or initialized by transfer of patterns of the weights, parameters, activations or output of the Model, to the other model, in order to cause the other model to perform similarly to the Model, including - but not limited to - distillation methods entailing the use of intermediate data representations or methods based on the generation of synthetic data by the Model for training the other model.
+"Complementary Material" means the accompanying source code and scripts used to define, run, load, benchmark or evaluate the Model, and used to prepare data for training or evaluation, if any. This includes any accompanying documentation, tutorials, examples, etc, if any.
+"Distribution" means any transmission, reproduction, publication or other sharing of the Model or Derivatives of the Model to a third party, including providing the Model as a hosted service made available by electronic or other remote means - e.g. API-based or web access.
+"DeepSeek" (or "we") means Beijing DeepSeek Artificial Intelligence Fundamental Technology Research Co., Ltd., Hangzhou DeepSeek Artificial Intelligence Fundamental Technology Research Co., Ltd. and/or any of their affiliates.
+"You" (or "Your") means an individual or Legal Entity exercising permissions granted by this License and/or making use of the Model for whichever purpose and in any field of use, including usage of the Model in an end-use application - e.g. chatbot, translator, etc.
+"Third Parties" means individuals or legal entities that are not under common control with DeepSeek or You.
+Section II: INTELLECTUAL PROPERTY RIGHTS
+Both copyright and patent grants apply to the Model, Derivatives of the Model and Complementary Material. The Model and Derivatives of the Model are subject to additional terms as described in Section III.
+2. Grant of Copyright License. Subject to the terms and conditions of this License, DeepSeek hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare, publicly display, publicly perform, sublicense, and distribute the Complementary Material, the Model, and Derivatives of the Model.
+3. Grant of Patent License. Subject to the terms and conditions of this License and where and as applicable, DeepSeek hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this paragraph) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Model and the Complementary Material, where such license applies only to those patent claims licensable by DeepSeek that are necessarily infringed by its contribution(s). If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Model and/or Complementary Material constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for the Model and/or works shall terminate as of the date such litigation is asserted or filed.
+Section III: CONDITIONS OF USAGE, DISTRIBUTION AND REDISTRIBUTION
+4. Distribution and Redistribution. You may host for Third Party remote access purposes (e.g. software-as-a-service), reproduce and distribute copies of the Model or Derivatives of the Model thereof in any medium, with or without modifications, provided that You meet the following conditions:
+a. Use-based restrictions as referenced in paragraph 5 MUST be included as an enforceable provision by You in any type of legal agreement (e.g. a license) governing the use and/or distribution of the Model or Derivatives of the Model, and You shall give notice to subsequent users You Distribute to, that the Model or Derivatives of the Model are subject to paragraph 5. This provision does not apply to the use of Complementary Material.
+b. You must give any Third Party recipients of the Model or Derivatives of the Model a copy of this License;
+c. You must cause any modified files to carry prominent notices stating that You changed the files;
+d. You must retain all copyright, patent, trademark, and attribution notices excluding those notices that do not pertain to any part of the Model, Derivatives of the Model.
+e. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions - respecting paragraph 4.a. – for use, reproduction, or Distribution of Your modifications, or for any such Derivatives of the Model as a whole, provided Your use, reproduction, and Distribution of the Model otherwise complies with the conditions stated in this License.
+5. Use-based restrictions. The restrictions set forth in Attachment A are considered Use-based restrictions. Therefore You cannot use the Model and the Derivatives of the Model for the specified restricted uses. You may use the Model subject to this License, including only for lawful purposes and in accordance with the License. Use may include creating any content with, finetuning, updating, running, training, evaluating and/or reparametrizing the Model. You shall require all of Your users who use the Model or a Derivative of the Model to comply with the terms of this paragraph (paragraph 5).
+6. The Output You Generate. Except as set forth herein, DeepSeek claims no rights in the Output You generate using the Model. You are accountable for the Output you generate and its subsequent uses. No use of the output can contravene any provision as stated in the License.
+Section IV: OTHER PROVISIONS
+7. Updates and Runtime Restrictions. To the maximum extent permitted by law, DeepSeek reserves the right to restrict (remotely or otherwise) usage of the Model in violation of this License.
+8. Trademarks and related. Nothing in this License permits You to make use of DeepSeek’ trademarks, trade names, logos or to otherwise suggest endorsement or misrepresent the relationship between the parties; and any rights not expressly granted herein are reserved by DeepSeek.
+9. Personal information, IP rights and related. This Model may contain personal information and works with IP rights. You commit to complying with applicable laws and regulations in the handling of personal information and the use of such works. Please note that DeepSeek's license granted to you to use the Model does not imply that you have obtained a legitimate basis for processing the related information or works. As an independent personal information processor and IP rights user, you need to ensure full compliance with relevant legal and regulatory requirements when handling personal information and works with IP rights that may be contained in the Model, and are willing to assume solely any risks and consequences that may arise from that.
+10. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, DeepSeek provides the Model and the Complementary Material on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Model, Derivatives of the Model, and the Complementary Material and assume any risks associated with Your exercise of permissions under this License.
+11. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall DeepSeek be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Model and the Complementary Material (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if DeepSeek has been advised of the possibility of such damages.
+12. Accepting Warranty or Additional Liability. While redistributing the Model, Derivatives of the Model and the Complementary Material thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of DeepSeek, and only if You agree to indemnify, defend, and hold DeepSeek harmless for any liability incurred by, or claims asserted against, DeepSeek by reason of your accepting any such warranty or additional liability.
+13. If any provision of this License is held to be invalid, illegal or unenforceable, the remaining provisions shall be unaffected thereby and remain valid as if such provision had not been set forth herein.
+14. Governing Law and Jurisdiction. This agreement will be governed and construed under PRC laws without regard to choice of law principles, and the UN Convention on Contracts for the International Sale of Goods does not apply to this agreement. The courts located in the domicile of Hangzhou DeepSeek Artificial Intelligence Fundamental Technology Research Co., Ltd. shall have exclusive jurisdiction of any dispute arising out of this agreement.
+END OF TERMS AND CONDITIONS
+Attachment A
+Use Restrictions
+You agree not to use the Model or Derivatives of the Model:
+-	In any way that violates any applicable national or international law or regulation or infringes upon the lawful rights and interests of any third party;
+-	For military use in any way;
+-	For the purpose of exploiting, harming or attempting to exploit or harm minors in any way;
+-	To generate or disseminate verifiably false information and/or content with the purpose of harming others;
+-	To generate or disseminate inappropriate content subject to applicable regulatory requirements;
+-	To generate or disseminate personal identifiable information without due authorization or for unreasonable use;
+-	To defame, disparage or otherwise harass others;
+-	For fully automated decision making that adversely impacts an individual’s legal rights or otherwise creates or modifies a binding, enforceable obligation;
+-	For any use intended to or which has the effect of discriminating against or harming individuals or groups based on online or offline social behavior or known or predicted personal or personality characteristics;
+-	To exploit any of the vulnerabilities of a specific group of persons based on their age, social, physical or mental characteristics, in order to materially distort the behavior of a person pertaining to that group in a manner that causes or is likely to cause that person or another person physical or psychological harm;
+-	For any use intended to or which has the effect of discriminating against individuals or groups based on legally protected characteristics or categories.

Makefile ADDED Viewed

	@@ -0,0 +1,99 @@

+# Developer tooling: installs linters/formatters on demand and exposes lint, format and clean goals.
+# Debug helper: `make print-VAR` echoes the value of VAR.
+print-%  : ; @echo $* = $($*)
+PROJECT_NAME   = Janus
+COPYRIGHT      = "DeepSeek."
+PROJECT_PATH   = janus
+SHELL          = /bin/bash
+SOURCE_FOLDERS = janus
+# The -name tests are grouped so that "-type f" applies to both patterns; without the
+# parentheses find evaluates "(-type f -a -name *.py) -o -name *.pyi".
+PYTHON_FILES   = $(shell find $(SOURCE_FOLDERS) -type f \( -name "*.py" -o -name "*.pyi" \))  inference.py
+COMMIT_HASH    = $(shell git log -1 --format=%h)
+PATH           := $(HOME)/go/bin:$(PATH)
+PYTHON         ?= $(shell command -v python3 || command -v python)
+PYTESTOPTS     ?=
+# None of these goals produce a file of the same name, so mark them phony to keep a
+# stray file or directory (e.g. "clean" or "format") from silently disabling them.
+.PHONY: default install \
+    pylint-install flake8-install py-format-install ruff-install mypy-install \
+    pre-commit-install go-install addlicense-install addlicense \
+    pylint flake8 py-format black-format ruff ruff-fix mypy pre-commit \
+    lint format clean-py clean
+default: install
+# NOTE(review): "default" depended on an "install" goal that was never defined, so a
+# bare `make` failed with "No rule to make target 'install'". An editable install of
+# this package is assumed to be the intent - confirm.
+install:
+	$(PYTHON) -m pip install -e .
+# Tools Installation
+# $(call check_pip_install,PKG): install/upgrade PKG only when `pip show PKG` fails.
+check_pip_install = $(PYTHON) -m pip show $(1) &>/dev/null || (cd && $(PYTHON) -m pip install $(1) --upgrade)
+# $(call check_pip_install_extra,PKG,SPEC): probe for PKG but install SPEC (e.g. PKG[extra]).
+check_pip_install_extra = $(PYTHON) -m pip show $(1) &>/dev/null || (cd && $(PYTHON) -m pip install $(2) --upgrade)
+pylint-install:
+	$(call check_pip_install_extra,pylint,pylint[spelling])
+	$(call check_pip_install,pyenchant)
+flake8-install:
+	$(call check_pip_install,flake8)
+	$(call check_pip_install,flake8-bugbear)
+	$(call check_pip_install,flake8-comprehensions)
+	$(call check_pip_install,flake8-docstrings)
+	$(call check_pip_install,flake8-pyi)
+	$(call check_pip_install,flake8-simplify)
+py-format-install:
+	$(call check_pip_install,isort)
+	$(call check_pip_install_extra,black,black[jupyter])
+ruff-install:
+	$(call check_pip_install,ruff)
+mypy-install:
+	$(call check_pip_install,mypy)
+pre-commit-install:
+	$(call check_pip_install,pre-commit)
+	$(PYTHON) -m pre_commit install --install-hooks
+go-install:
+	# requires go >= 1.16
+	command -v go || (sudo apt-get install -y golang && sudo ln -sf /usr/lib/go/bin/go /usr/bin/go)
+addlicense-install: go-install
+	command -v addlicense || go install github.com/google/addlicense@latest
+# Check-only run: verifies licence headers without modifying files (see "format" for the writing run).
+addlicense: addlicense-install
+	addlicense -c $(COPYRIGHT) -ignore tests/coverage.xml -l mit -y 2023-$(shell date +"%Y") -check $(SOURCE_FOLDERS)
+# Python linters
+pylint: pylint-install
+	$(PYTHON) -m pylint $(PROJECT_PATH)
+flake8: flake8-install
+	$(PYTHON) -m flake8 --count --show-source --statistics
+py-format: py-format-install
+	$(PYTHON) -m isort --project $(PROJECT_PATH) --check $(PYTHON_FILES) && \
+	$(PYTHON) -m black --check $(PYTHON_FILES)
+black-format: py-format-install
+	$(PYTHON) -m black --check $(PYTHON_FILES)
+ruff: ruff-install
+	$(PYTHON) -m ruff check .
+ruff-fix: ruff-install
+	$(PYTHON) -m ruff check . --fix --exit-non-zero-on-fix
+mypy: mypy-install
+	$(PYTHON) -m mypy $(PROJECT_PATH) --install-types --non-interactive
+pre-commit: pre-commit-install
+	$(PYTHON) -m pre_commit run --all-files
+# Utility functions
+lint: ruff flake8 py-format mypy pylint addlicense
+# Rewrites files in place: import order, code style and licence headers.
+format: py-format-install ruff-install addlicense-install
+	$(PYTHON) -m isort --project $(PROJECT_PATH) $(PYTHON_FILES)
+	$(PYTHON) -m black $(PYTHON_FILES)
+	addlicense -c $(COPYRIGHT) -ignore tests/coverage.xml -l mit -y 2023-$(shell date +"%Y") $(SOURCE_FOLDERS)  inference.py
+# Remove byte-code files and tool caches anywhere below the current directory.
+clean-py:
+	find . -type f -name  '*.py[co]' -delete
+	find . -depth -type d -name "__pycache__" -exec rm -r "{}" +
+	find . -depth -type d -name ".ruff_cache" -exec rm -r "{}" +
+	find . -depth -type d -name ".mypy_cache" -exec rm -r "{}" +
+clean: clean-py

app.py ADDED Viewed

	@@ -0,0 +1,398 @@

+import gradio as gr
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM
+from janus.models import MultiModalityCausalLM, VLChatProcessor
+from janus.utils.io import load_pil_images
+from demo.cam import generate_gradcam, AttentionGuidedCAMJanus, AttentionGuidedCAMClip, AttentionGuidedCAMLLaVA
+from demo.model_utils import Clip_Utils, Janus_Utils, LLaVA_Utils, add_title_to_image
+import numpy as np
+import matplotlib.pyplot as plt
+import gc
+import spaces
+from PIL import Image
+def set_seed(model_seed = 42):
+    torch.manual_seed(model_seed)
+    np.random.seed(model_seed)
+    torch.cuda.manual_seed(model_seed) if torch.cuda.is_available() else None
+set_seed()
+clip_utils = Clip_Utils()
+clip_utils.init_Clip()
+model_utils, vl_gpt, tokenizer = None, None, None
+model_name = "Clip"
+def clean():
+    global model_utils, vl_gpt, tokenizer, clip_utils
+    # Move models to CPU first (prevents CUDA references)
+    if 'vl_gpt' in globals() and vl_gpt is not None:
+        vl_gpt.to("cpu")
+    if 'clip_utils' in globals() and clip_utils is not None:
+        del clip_utils
+    # Delete all references
+    del model_utils, vl_gpt, tokenizer
+    model_utils, vl_gpt, tokenizer, clip_utils = None, None, None, None
+    gc.collect()
+    # Empty CUDA cache
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()  # Frees inter-process CUDA memory
+    # Empty MacOS Metal backend (if using Apple Silicon)
+    if torch.backends.mps.is_available():
+        torch.mps.empty_cache()
+# Multimodal Understanding function
+@spaces.GPU(duration=120)
+def multimodal_understanding(model_type,
+                             saliency_map_method,
+                             visual_pooling_method,
+                             image, question, seed, top_p, temperature, target_token_idx,
+                             visualization_layer_min, visualization_layer_max, focus, response_type):
+    # Clear CUDA cache before generating
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
+    # set seed
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    torch.cuda.manual_seed(seed) if torch.cuda.is_available() else None
+    input_text_decoded = ""
+    answer = ""
+    if model_name == "Clip":
+        inputs = clip_utils.prepare_inputs([question], image)
+        if saliency_map_method == "GradCAM":
+            # Generate Grad-CAM
+            all_layers = [layer.layer_norm1 for layer in clip_utils.model.vision_model.encoder.layers]
+            if visualization_layers_min.value != visualization_layers_max.value:
+                target_layers = all_layers[visualization_layer_min-1 : visualization_layer_max-1]
+            else:
+                target_layers = [all_layers[visualization_layer_min-1]]
+            grad_cam = AttentionGuidedCAMClip(clip_utils.model, target_layers)
+            cam, outputs, grid_size = grad_cam.generate_cam(inputs, class_idx=0, visual_pooling_method=visual_pooling_method)
+            cam = cam.to("cpu")
+            cam = [generate_gradcam(cam, image, size=(224, 224))]
+            grad_cam.remove_hooks()
+            target_token_decoded = ""
+    else:
+        for param in vl_gpt.parameters():
+            param.requires_grad = True
+        prepare_inputs = model_utils.prepare_inputs(question, image)
+        if response_type == "answer + visualization":
+            if model_name.split('-')[0] == "Janus":
+                inputs_embeds = model_utils.generate_inputs_embeddings(prepare_inputs)
+                outputs = model_utils.generate_outputs(inputs_embeds, prepare_inputs, temperature, top_p)
+            else:
+                outputs = model_utils.generate_outputs(prepare_inputs, temperature, top_p)
+            sequences = outputs.sequences.cpu().tolist()
+            answer = tokenizer.decode(sequences[0], skip_special_tokens=True)
+            attention_raw = outputs.attentions
+            print("answer generated")
+        input_ids = prepare_inputs.input_ids[0].cpu().tolist()
+        input_ids_decoded = [tokenizer.decode([input_ids[i]]) for i in range(len(input_ids))]
+        start=620 if model_name.split('-')[0] == "Janus" else 512
+        if saliency_map_method == "GradCAM":
+            # target_layers = vl_gpt.vision_model.vision_tower.blocks
+            if focus == "Visual Encoder":
+                all_layers = [block.norm1 for block in vl_gpt.vision_model.vision_tower.blocks]
+            else:
+                all_layers = [layer.self_attn for layer in vl_gpt.language_model.model.layers]
+            if visualization_layers_min.value != visualization_layers_max.value:
+                target_layers = all_layers[visualization_layer_min-1 : visualization_layer_max-1]
+            else:
+                target_layers = [all_layers[visualization_layer_min-1]]
+            if model_name.split('-')[0] == "Janus":
+                gradcam = AttentionGuidedCAMJanus(vl_gpt, target_layers)
+            elif model_name.split('-')[0] == "LLaVA":
+                gradcam = AttentionGuidedCAMLLaVA(vl_gpt, target_layers)
+            cam_tensors, grid_size = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_pooling_method, focus)
+            gradcam.remove_hooks()
+            if focus == "Visual Encoder":
+                cam_grid = cam_tensors.reshape(grid_size, grid_size)
+                cam = [generate_gradcam(cam_grid, image)]
+            else:
+                if target_token_idx != -1:
+                    input_text_decoded = input_ids_decoded[start + target_token_idx]
+                    for i, cam_tensor in enumerate(cam_tensors):
+                        if i == target_token_idx:
+                            cam_grid = cam_tensor.reshape(grid_size, grid_size)
+                            cam_i = generate_gradcam(cam_grid, image)
+                            cam = [add_title_to_image(cam_i, input_text_decoded)]
+                            break
+                else:
+                    cam = []
+                    for i, cam_tensor in enumerate(cam_tensors):
+                        cam_grid = cam_tensor.reshape(24, 24)
+                        cam_i = generate_gradcam(cam_grid, image)
+                        cam_i = add_title_to_image(cam_i, input_ids_decoded[start + i])
+                        cam.append(cam_i)
+    return answer, cam, input_text_decoded
+# Gradio interface
+def model_slider_change(model_type):
+    global model_utils, vl_gpt, tokenizer, clip_utils, model_name
+    model_name = model_type
+    if model_type == "Clip":
+        clean()
+        set_seed()
+        clip_utils = Clip_Utils()
+        clip_utils.init_Clip()
+        res = (
+            gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type"),
+            gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers max"),
+            gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus"),
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
+        )
+        return res
+    elif model_type.split('-')[0] == "Janus":
+        clean()
+        set_seed()
+        model_utils = Janus_Utils()
+        vl_gpt, tokenizer = model_utils.init_Janus(model_type.split('-')[-1])
+        res = (
+            gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="Visualization only", label="response_type"),
+            gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers max"),
+            gr.Dropdown(choices=["Visual Encoder", "Language Model"], value="Visual Encoder", label="focus"),
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
+        )
+        return res
+    elif model_type.split('-')[0] == "LLaVA":
+        clean()
+        set_seed()
+        model_utils = LLaVA_Utils()
+        vl_gpt, tokenizer = model_utils.init_LLaVA()
+        res = (
+            gr.Dropdown(choices=["Visualization only", "answer + visualization"], value="Visualization only", label="response_type"),
+            gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers max"),
+            gr.Dropdown(choices=["Language Model"], value="Language Model", label="focus"),
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
+        )
+        return res
+def focus_change(focus):
+    global model_name
+    if model_name == "Clip":
+        res = (
+                gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
+                gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers min"),
+                gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers max")
+            )
+        return res
+    if focus == "Language Model":
+        if response_type.value == "answer + visualization":
+            res = (
+                gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
+                gr.Slider(minimum=1, maximum=24, value=8, step=1, label="visualization layers min"),
+                gr.Slider(minimum=1, maximum=24, value=8, step=1, label="visualization layers max")
+            )
+            return res
+        else:
+            res = (
+                gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
+                gr.Slider(minimum=1, maximum=24, value=8, step=1, label="visualization layers min"),
+                gr.Slider(minimum=1, maximum=24, value=8, step=1, label="visualization layers max")
+            )
+            return res
+    else:
+        res = (
+            gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type"),
+            gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers min"),
+            gr.Slider(minimum=1, maximum=24, value=24, step=1, label="visualization layers max")
+        )
+        return res
+# Build the Gradio UI: inputs and saliency gallery on top, answer widgets and examples below.
+with gr.Blocks() as demo:
+    gr.Markdown(value="# Multimodal Understanding")
+    with gr.Row():
+        # Left column: the input image and the gallery that shows the saliency maps.
+        with gr.Column():
+            image_input = gr.Image()
+            saliency_map_output = gr.Gallery(label="Saliency Map", height=300, columns=1)
+        # Right column: model selection and every generation / visualization setting.
+        # The initial choices match the default "Clip" model.
+        with gr.Column():
+            model_selector = gr.Dropdown(choices=["Clip", "Janus-1B", "Janus-7B", "LLaVA-1.5-7B"], value="Clip", label="model")
+            response_type = gr.Dropdown(choices=["Visualization only"], value="Visualization only", label="response_type")
+            focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
+            saliency_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
+            visual_pooling_method = gr.Dropdown(choices=["CLS", "max", "avg"], value="CLS", label="visual pooling method")
+            visualization_layers_min = gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers min")
+            visualization_layers_max = gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers max")
+            question_input = gr.Textbox(label="Question")
+            und_seed_input = gr.Number(label="Seed", precision=0, value=42)
+            top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
+            temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")
+            target_token_idx = gr.Number(label="target_token_idx (-1 means all)", precision=0, value=-1)
+        # Selecting a model reloads it and refreshes the dependent controls.
+        # The order of `outputs` must match the tuple returned by model_slider_change.
+        model_selector.change(
+            fn=model_slider_change,
+            inputs=model_selector,
+            outputs=[
+                response_type,
+                visualization_layers_min,
+                visualization_layers_max,
+                focus,
+                saliency_map_method
+            ]
+        )
+        # Changing the focus resets the saliency method and the layer sliders.
+        # The order of `outputs` must match the tuple returned by focus_change.
+        focus.change(
+            fn = focus_change,
+            inputs = focus,
+            outputs=[
+                saliency_map_method,
+                visualization_layers_min,
+                visualization_layers_max,
+            ]
+        )
+        # response_type.change(
+        #     fn = response_type_change,
+        #     inputs = response_type,
+        #     outputs = [saliency_map_method]
+        # )
+    understanding_button = gr.Button("Chat")
+    understanding_output = gr.Textbox(label="Answer")
+    understanding_target_token_decoded_output = gr.Textbox(label="Target Token Decoded")
+    # Clickable examples; each entry is [question, image path], matching `inputs` below.
+    examples_inpainting = gr.Examples(
+        label="Multimodal Understanding examples",
+        examples=[
+            [
+                "What is the approximate global smartphone market share of Samsung?",
+                "images/PieChart.png"
+            ],
+            [
+                "What is the average internet speed in Japan?",
+                "images/BarChart.png"
+            ],
+            [
+                "What was the average price of coffee beans in October 2019?",
+                "images/AreaChart.png"
+            ],
+            [
+                "Which city's metro system has the largest number of stations?",
+                "images/BubbleChart.png"
+            ],
+            [
+                "True/False: In 2020, the unemployment rate for Washington (WA) was higher than that of Wisconsin (WI).",
+                "images/Choropleth_New.png"
+            ],
+            [
+                "What distance have customers traveled in the taxi the most?",
+                "images/Histogram.png"
+            ],
+            [
+                "What was the price of a barrel of oil in February 2020?",
+                "images/LineChart.png"
+            ],
+            [
+                "True/False: eBay is nested in the Software category.",
+                "images/TreeMap.png"
+            ],
+            [
+                "True/False: There is a negative linear relationship between the height and the weight of the 85 males.",
+                "images/Scatterplot.png"
+            ],
+            [
+                "Which country has the lowest proportion of Gold medals?",
+                "images/Stacked100.png"
+            ],
+            [
+                "What was the ratio of girls named 'Isla' to girls named 'Amelia' in 2012 in the UK?",
+                "images/StackedArea.png"
+            ],
+            [
+                "What is the cost of peanuts in Seoul?",
+                "images/StackedBar.png"
+            ],
+            [
+                "Where is the dog? Left or Right?",
+                "images/cat_dog.png"
+            ]
+            # [
+            #     "explain this meme",
+            #     "images/doge.png",
+            # ],
+            # [
+            #     "Convert the formula into latex code.",
+            #     "images/equation.png",
+            # ],
+        ],
+        inputs=[question_input, image_input],
+    )
+    # The order of `inputs` must match the parameter order of multimodal_understanding,
+    # and `outputs` the order of its returned tuple (answer, saliency maps, decoded token).
+    understanding_button.click(
+        multimodal_understanding,
+        inputs=[model_selector, saliency_map_method, visual_pooling_method, image_input, question_input, und_seed_input, top_p, temperature, target_token_idx,
+                visualization_layers_min, visualization_layers_max, focus, response_type],
+        outputs=[understanding_output, saliency_map_output, understanding_target_token_decoded_output]
+    )
+# Start the app at import time and request a public share link.
+demo.launch(share=True)
+# demo.queue(concurrency_count=1, max_size=10).launch(server_name="0.0.0.0", server_port=37906, root_path="/path")

demo/Janus_colab_demo.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

demo/app.py ADDED Viewed

	@@ -0,0 +1,224 @@

+import gradio as gr
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM
+from janus.models import MultiModalityCausalLM, VLChatProcessor
+from PIL import Image
+import numpy as np
+# Load model and processor
+model_path = "deepseek-ai/Janus-1.3B"
+config = AutoConfig.from_pretrained(model_path)
+language_config = config.language_config
+language_config._attn_implementation = 'eager'
+vl_gpt = AutoModelForCausalLM.from_pretrained(model_path,
+                                             language_config=language_config,
+                                             trust_remote_code=True)
+vl_gpt = vl_gpt.to(torch.bfloat16).cuda()
+vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
+tokenizer = vl_chat_processor.tokenizer
+cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+# Multimodal Understanding function
+@torch.inference_mode()
+# Multimodal Understanding function
+def multimodal_understanding(image, question, seed, top_p, temperature):
+    # Clear CUDA cache before generating
+    torch.cuda.empty_cache()
+    # set seed
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    torch.cuda.manual_seed(seed)
+    conversation = [
+        {
+            "role": "User",
+            "content": f"<image_placeholder>\n{question}",
+            "images": [image],
+        },
+        {"role": "Assistant", "content": ""},
+    ]
+    pil_images = [Image.fromarray(image)]
+    prepare_inputs = vl_chat_processor(
+        conversations=conversation, images=pil_images, force_batchify=True
+    ).to(cuda_device, dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16)
+    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+    outputs = vl_gpt.language_model.generate(
+        inputs_embeds=inputs_embeds,
+        attention_mask=prepare_inputs.attention_mask,
+        pad_token_id=tokenizer.eos_token_id,
+        bos_token_id=tokenizer.bos_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        max_new_tokens=512,
+        do_sample=False if temperature == 0 else True,
+        use_cache=True,
+        temperature=temperature,
+        top_p=top_p,
+    )
+    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
+    return answer
+def generate(input_ids,
+             width,
+             height,
+             temperature: float = 1,
+             parallel_size: int = 5,
+             cfg_weight: float = 5,
+             image_token_num_per_image: int = 576,
+             patch_size: int = 16):
+    """Sample image tokens autoregressively with classifier-free guidance and decode them.
+
+    Args:
+        input_ids: 1-D tensor of prompt token ids.
+        width: Image width; only used (divided by ``patch_size``) for the decode shape.
+        height: Image height; only used (divided by ``patch_size``) for the decode shape.
+        temperature: Divides the guided logits before the softmax.
+        parallel_size: Number of images sampled in parallel.
+        cfg_weight: Guidance weight applied to (conditional - unconditional) logits.
+        image_token_num_per_image: Number of image tokens sampled per image.
+        patch_size: Divisor applied to width/height for the decode shape.
+
+    Returns:
+        Tuple ``(generated_tokens, patches)``: the sampled token ids as an int
+        tensor of shape (parallel_size, image_token_num_per_image) and the
+        output of ``gen_vision_model.decode_code``.
+    """
+    # Clear CUDA cache before generating
+    torch.cuda.empty_cache()
+    # Two rows per image: even rows keep the full prompt (conditional), odd rows
+    # have every token except the first and last replaced by the pad id (unconditional).
+    tokens = torch.zeros((parallel_size * 2, len(input_ids)), dtype=torch.int).to(cuda_device)
+    for i in range(parallel_size * 2):
+        tokens[i, :] = input_ids
+        if i % 2 != 0:
+            tokens[i, 1:-1] = vl_chat_processor.pad_id
+    inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
+    generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).to(cuda_device)
+    pkv = None
+    # One image token per iteration; the key/value cache returned by each call is
+    # passed back in, so after the first step only the newest embedding is fed.
+    for i in range(image_token_num_per_image):
+        outputs = vl_gpt.language_model.model(inputs_embeds=inputs_embeds,
+                                             use_cache=True,
+                                             past_key_values=pkv)
+        pkv = outputs.past_key_values
+        hidden_states = outputs.last_hidden_state
+        logits = vl_gpt.gen_head(hidden_states[:, -1, :])
+        # De-interleave conditional (even) and unconditional (odd) rows, then apply guidance.
+        logit_cond = logits[0::2, :]
+        logit_uncond = logits[1::2, :]
+        logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
+        probs = torch.softmax(logits / temperature, dim=-1)
+        next_token = torch.multinomial(probs, num_samples=1)
+        generated_tokens[:, i] = next_token.squeeze(dim=-1)
+        # Duplicate each sampled token so both rows of a pair receive the same next input.
+        next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
+        img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
+        inputs_embeds = img_embeds.unsqueeze(dim=1)
+    # NOTE(review): the literal 8 is presumably the codebook embedding channel count - confirm.
+    patches = vl_gpt.gen_vision_model.decode_code(generated_tokens.to(dtype=torch.int),
+                                                 shape=[parallel_size, 8, width // patch_size, height // patch_size])
+    return generated_tokens.to(dtype=torch.int), patches
+def unpack(dec, width, height, parallel_size=5):
+    dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
+    dec = np.clip((dec + 1) / 2 * 255, 0, 255)
+    visual_img = np.zeros((parallel_size, width, height, 3), dtype=np.uint8)
+    visual_img[:, :, :] = dec
+    return visual_img
+@torch.inference_mode()
+def generate_image(prompt,
+                   seed=None,
+                   guidance=5):
+    # Clear CUDA cache and avoid tracking gradients
+    torch.cuda.empty_cache()
+    # Set the seed for reproducible results
+    if seed is not None:
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        np.random.seed(seed)
+    width = 384
+    height = 384
+    parallel_size = 5
+    with torch.no_grad():
+        messages = [{'role': 'User', 'content': prompt},
+                    {'role': 'Assistant', 'content': ''}]
+        text = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(conversations=messages,
+                                                                   sft_format=vl_chat_processor.sft_format,
+                                                                   system_prompt='')
+        text = text + vl_chat_processor.image_start_tag
+        input_ids = torch.LongTensor(tokenizer.encode(text))
+        output, patches = generate(input_ids,
+                                   width // 16 * 16,
+                                   height // 16 * 16,
+                                   cfg_weight=guidance,
+                                   parallel_size=parallel_size)
+        images = unpack(patches,
+                        width // 16 * 16,
+                        height // 16 * 16)
+        return [Image.fromarray(images[i]).resize((1024, 1024), Image.LANCZOS) for i in range(parallel_size)]
+# Gradio interface
+# Two sections in one page: image question answering on top, text-to-image generation below.
+with gr.Blocks() as demo:
+    gr.Markdown(value="# Multimodal Understanding")
+    # with gr.Row():
+    with gr.Row():
+        image_input = gr.Image()
+        with gr.Column():
+            question_input = gr.Textbox(label="Question")
+            und_seed_input = gr.Number(label="Seed", precision=0, value=42)
+            top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
+            temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")
+    understanding_button = gr.Button("Chat")
+    understanding_output = gr.Textbox(label="Response")
+    # Each example is [question, image path], matching `inputs` below.
+    examples_inpainting = gr.Examples(
+        label="Multimodal Understanding examples",
+        examples=[
+            [
+                "explain this meme",
+                "images/doge.png",
+            ],
+            [
+                "Convert the formula into latex code.",
+                "images/equation.png",
+            ],
+        ],
+        inputs=[question_input, image_input],
+    )
+    gr.Markdown(value="# Text-to-Image Generation")
+    with gr.Row():
+        cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=5, step=0.5, label="CFG Weight")
+    prompt_input = gr.Textbox(label="Prompt")
+    seed_input = gr.Number(label="Seed (Optional)", precision=0, value=12345)
+    generation_button = gr.Button("Generate Images")
+    image_output = gr.Gallery(label="Generated Images", columns=2, rows=2, height=300)
+    # Prompt-only examples that fill the prompt textbox.
+    examples_t2i = gr.Examples(
+        label="Text to image generation examples. (Tips for designing prompts: Adding description like 'digital art' at the end of the prompt or writing the prompt in more detail can help produce better images!)",
+        examples=[
+            "Master shifu racoon wearing drip attire as a street gangster.",
+            "A cute and adorable baby fox with big brown eyes, autumn leaves in the background enchanting,immortal,fluffy, shiny mane,Petals,fairyism,unreal engine 5 and Octane Render,highly detailed, photorealistic, cinematic, natural colors.",
+            "The image features an intricately designed eye set against a circular backdrop adorned with ornate swirl patterns that evoke both realism and surrealism. At the center of attention is a strikingly vivid blue iris surrounded by delicate veins radiating outward from the pupil to create depth and intensity. The eyelashes are long and dark, casting subtle shadows on the skin around them which appears smooth yet slightly textured as if aged or weathered over time.\n\nAbove the eye, there's a stone-like structure resembling part of classical architecture, adding layers of mystery and timeless elegance to the composition. This architectural element contrasts sharply but harmoniously with the organic curves surrounding it. Below the eye lies another decorative motif reminiscent of baroque artistry, further enhancing the overall sense of eternity encapsulated within each meticulously crafted detail. \n\nOverall, the atmosphere exudes a mysterious aura intertwined seamlessly with elements suggesting timelessness, achieved through the juxtaposition of realistic textures and surreal artistic flourishes. Each component\u2014from the intricate designs framing the eye to the ancient-looking stone piece above\u2014contributes uniquely towards creating a visually captivating tableau imbued with enigmatic allure.",
+        ],
+        inputs=prompt_input,
+    )
+    # The order of `inputs` must match the parameter order of multimodal_understanding.
+    understanding_button.click(
+        multimodal_understanding,
+        inputs=[image_input, question_input, und_seed_input, top_p, temperature],
+        outputs=understanding_output
+    )
+    # The order of `inputs` must match generate_image(prompt, seed, guidance).
+    generation_button.click(
+        fn=generate_image,
+        inputs=[prompt_input, seed_input, cfg_weight_input],
+        outputs=image_output
+    )
+# Start the app at import time and request a public share link.
+demo.launch(share=True)

demo/app_janusflow.py ADDED Viewed

	@@ -0,0 +1,247 @@

+import gradio as gr
+import torch
+from janus.janusflow.models import MultiModalityCausalLM, VLChatProcessor
+from PIL import Image
+from diffusers.models import AutoencoderKL
+import numpy as np
+# Device used for the models and their inputs; falls back to CPU without CUDA.
+cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+# Load model and processor
+model_path = "deepseek-ai/JanusFlow-1.3B"
+vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
+tokenizer = vl_chat_processor.tokenizer
+vl_gpt = MultiModalityCausalLM.from_pretrained(model_path)
+# Cast to bfloat16, move to the selected device and switch to eval mode.
+vl_gpt = vl_gpt.to(torch.bfloat16).to(cuda_device).eval()
+# remember to use bfloat16 dtype, this vae doesn't work with fp16
+vae = AutoencoderKL.from_pretrained("stabilityai/sdxl-vae")
+vae = vae.to(torch.bfloat16).to(cuda_device).eval()
+# Multimodal Understanding function
+@torch.inference_mode()
+# Multimodal Understanding function
+def multimodal_understanding(image, question, seed, top_p, temperature):
+    # Clear CUDA cache before generating
+    torch.cuda.empty_cache()
+    # set seed
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    torch.cuda.manual_seed(seed)
+    conversation = [
+        {
+            "role": "User",
+            "content": f"<image_placeholder>\n{question}",
+            "images": [image],
+        },
+        {"role": "Assistant", "content": ""},
+    ]
+    pil_images = [Image.fromarray(image)]
+    prepare_inputs = vl_chat_processor(
+        conversations=conversation, images=pil_images, force_batchify=True
+    ).to(cuda_device, dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16)
+    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+    outputs = vl_gpt.language_model.generate(
+        inputs_embeds=inputs_embeds,
+        attention_mask=prepare_inputs.attention_mask,
+        pad_token_id=tokenizer.eos_token_id,
+        bos_token_id=tokenizer.bos_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        max_new_tokens=512,
+        do_sample=False if temperature == 0 else True,
+        use_cache=True,
+        temperature=temperature,
+        top_p=top_p,
+    )
+    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
+    return answer
+@torch.inference_mode()
+def generate(
+    input_ids,
+    cfg_weight: float = 2.0,
+    num_inference_steps: int = 30
+):
+    """Generate five images from prompt tokens by integrating a rectified-flow ODE.
+
+    Args:
+        input_ids: 1-D tensor of prompt token ids.
+        cfg_weight: Classifier-free guidance weight.
+        num_inference_steps: Number of Euler steps taken from t=0 towards t=1.
+
+    Returns:
+        ``np.uint8`` array of decoded images, channel-last.
+
+    NOTE(review): tensors are moved with ``.cuda()`` here, so unlike the model
+    loading code this function does not honour the ``cuda_device`` CPU fallback.
+    """
+    # we generate 5 images at a time, *2 for CFG
+    # Rows 0-4 keep the prompt (conditional); rows 5-9 have every token after the
+    # first replaced by the pad id (unconditional).
+    tokens = torch.stack([input_ids] * 10).cuda()
+    tokens[5:, 1:] = vl_chat_processor.pad_id
+    inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
+    print(inputs_embeds.shape)
+    # we remove the last <bog> token and replace it with t_emb later
+    inputs_embeds = inputs_embeds[:, :-1, :]
+    # generate with rectified flow ode
+    # step 1: encode with vision_gen_enc
+    # Start from Gaussian noise latents, one per image.
+    z = torch.randn((5, 4, 48, 48), dtype=torch.bfloat16).cuda()
+    # Step size, broadcast to a tensor shaped like z.
+    dt = 1.0 / num_inference_steps
+    dt = torch.zeros_like(z).cuda().to(torch.bfloat16) + dt
+    # step 2: run ode
+    # The mask spans the prompt plus 577 extra positions (1 time embedding + 576
+    # latent tokens); the unconditional rows hide every prompt position but the first.
+    attention_mask = torch.ones((10, inputs_embeds.shape[1]+577)).to(vl_gpt.device)
+    attention_mask[5:, 1:inputs_embeds.shape[1]] = 0
+    attention_mask = attention_mask.int()
+    for step in range(num_inference_steps):
+        # prepare inputs for the llm
+        z_input = torch.cat([z, z], dim=0) # for cfg
+        t = step / num_inference_steps * 1000.
+        t = torch.tensor([t] * z_input.shape[0]).to(dt)
+        z_enc = vl_gpt.vision_gen_enc_model(z_input, t)
+        z_emb, t_emb, hs = z_enc[0], z_enc[1], z_enc[2]
+        # Flatten the spatial dimensions into a token sequence.
+        z_emb = z_emb.view(z_emb.shape[0], z_emb.shape[1], -1).permute(0, 2, 1)
+        z_emb = vl_gpt.vision_gen_enc_aligner(z_emb)
+        llm_emb = torch.cat([inputs_embeds, t_emb.unsqueeze(1), z_emb], dim=1)
+        # input to the llm
+        # we apply attention mask for CFG: 1 for tokens that are not masked, 0 for tokens that are masked.
+        if step == 0:
+            outputs = vl_gpt.language_model.model(inputs_embeds=llm_emb,
+                                             use_cache=True,
+                                             attention_mask=attention_mask,
+                                             past_key_values=None)
+            # NOTE(review): this loop iterates over the list that was just emptied,
+            # not over outputs.past_key_values, so it never runs and past_key_values
+            # ends up as an empty tuple. Later steps therefore receive no cached
+            # keys/values and re-encode the full prompt each time. Do not simply
+            # switch the loop to outputs.past_key_values: llm_emb and attention_mask
+            # still contain the prompt on every step, so they would have to change too.
+            past_key_values = []
+            for kv_cache in past_key_values:
+                k, v = kv_cache[0], kv_cache[1]
+                past_key_values.append((k[:, :, :inputs_embeds.shape[1], :], v[:, :, :inputs_embeds.shape[1], :]))
+            past_key_values = tuple(past_key_values)
+        else:
+            outputs = vl_gpt.language_model.model(inputs_embeds=llm_emb,
+                                             use_cache=True,
+                                             attention_mask=attention_mask,
+                                             past_key_values=past_key_values)
+        hidden_states = outputs.last_hidden_state
+        # transform hidden_states back to v
+        # Only the last 576 positions (the latent tokens) are decoded.
+        hidden_states = vl_gpt.vision_gen_dec_aligner(vl_gpt.vision_gen_dec_aligner_norm(hidden_states[:, -576:, :]))
+        hidden_states = hidden_states.reshape(z_emb.shape[0], 24, 24, 768).permute(0, 3, 1, 2)
+        v = vl_gpt.vision_gen_dec_model(hidden_states, hs, t_emb)
+        # Split into conditional / unconditional halves and apply guidance.
+        v_cond, v_uncond = torch.chunk(v, 2)
+        v = cfg_weight * v_cond - (cfg_weight-1.) * v_uncond
+        # Euler update of the latents.
+        z = z + dt * v
+    # step 3: decode with vision_gen_dec and sdxl vae
+    decoded_image = vae.decode(z / vae.config.scaling_factor).sample
+    # Clamp to [-1, 1], move channels last, then rescale to 0-255 uint8.
+    images = decoded_image.float().clip_(-1., 1.).permute(0,2,3,1).cpu().numpy()
+    images = ((images+1) / 2. * 255).astype(np.uint8)
+    return images
+def unpack(dec, width, height, parallel_size=5):
+    dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
+    dec = np.clip((dec + 1) / 2 * 255, 0, 255)
+    visual_img = np.zeros((parallel_size, width, height, 3), dtype=np.uint8)
+    visual_img[:, :, :] = dec
+    return visual_img
+@torch.inference_mode()
+def generate_image(prompt,
+                   seed=None,
+                   guidance=5,
+                   num_inference_steps=30):
+    # Clear CUDA cache and avoid tracking gradients
+    torch.cuda.empty_cache()
+    # Set the seed for reproducible results
+    if seed is not None:
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        np.random.seed(seed)
+    with torch.no_grad():
+        messages = [{'role': 'User', 'content': prompt},
+                    {'role': 'Assistant', 'content': ''}]
+        text = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(conversations=messages,
+                                                                   sft_format=vl_chat_processor.sft_format,
+                                                                   system_prompt='')
+        text = text + vl_chat_processor.image_start_tag
+        input_ids = torch.LongTensor(tokenizer.encode(text))
+        images = generate(input_ids,
+                                   cfg_weight=guidance,
+                                   num_inference_steps=num_inference_steps)
+        return [Image.fromarray(images[i]).resize((1024, 1024), Image.LANCZOS) for i in range(images.shape[0])]
+# Gradio interface: a multimodal-understanding section followed by a
+# text-to-image section. Components appear in the page in creation order.
+with gr.Blocks() as demo:
+    gr.Markdown(value="# Multimodal Understanding")
+    # Image on the left, question and sampling controls on the right.
+    with gr.Row():
+        image_input = gr.Image()
+        with gr.Column():
+            question_input = gr.Textbox(label="Question")
+            und_seed_input = gr.Number(label="Seed", precision=0, value=42)
+            top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
+            temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")
+    understanding_button = gr.Button("Chat")
+    understanding_output = gr.Textbox(label="Response")
+    # Each example row is [question, image path], matching `inputs` below.
+    examples_inpainting = gr.Examples(
+        label="Multimodal Understanding examples",
+        examples=[
+            [
+                "explain this meme",
+                "./images/doge.png",
+            ],
+            [
+                "Convert the formula into latex code.",
+                "./images/equation.png",
+            ],
+        ],
+        inputs=[question_input, image_input],
+    )
+    gr.Markdown(value="# Text-to-Image Generation")
+    with gr.Row():
+        cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=2, step=0.5, label="CFG Weight")
+        step_input = gr.Slider(minimum=1, maximum=50, value=30, step=1, label="Number of Inference Steps")
+    prompt_input = gr.Textbox(label="Prompt")
+    seed_input = gr.Number(label="Seed (Optional)", precision=0, value=12345)
+    generation_button = gr.Button("Generate Images")
+    image_output = gr.Gallery(label="Generated Images", columns=2, rows=2, height=300)
+    # Prompt-only examples for the text-to-image section.
+    examples_t2i = gr.Examples(
+        label="Text to image generation examples.",
+        examples=[
+            "Master shifu racoon wearing drip attire as a street gangster.",
+            "A cute and adorable baby fox with big brown eyes, autumn leaves in the background enchanting,immortal,fluffy, shiny mane,Petals,fairyism,unreal engine 5 and Octane Render,highly detailed, photorealistic, cinematic, natural colors.",
+            "The image features an intricately designed eye set against a circular backdrop adorned with ornate swirl patterns that evoke both realism and surrealism. At the center of attention is a strikingly vivid blue iris surrounded by delicate veins radiating outward from the pupil to create depth and intensity. The eyelashes are long and dark, casting subtle shadows on the skin around them which appears smooth yet slightly textured as if aged or weathered over time.\n\nAbove the eye, there's a stone-like structure resembling part of classical architecture, adding layers of mystery and timeless elegance to the composition. This architectural element contrasts sharply but harmoniously with the organic curves surrounding it. Below the eye lies another decorative motif reminiscent of baroque artistry, further enhancing the overall sense of eternity encapsulated within each meticulously crafted detail. \n\nOverall, the atmosphere exudes a mysterious aura intertwined seamlessly with elements suggesting timelessness, achieved through the juxtaposition of realistic textures and surreal artistic flourishes. Each component\u2014from the intricate designs framing the eye to the ancient-looking stone piece above\u2014contributes uniquely towards creating a visually captivating tableau imbued with enigmatic allure.",
+        ],
+        inputs=prompt_input,
+    )
+    # The order of `inputs` must match the positional parameters of the callbacks.
+    understanding_button.click(
+        multimodal_understanding,
+        inputs=[image_input, question_input, und_seed_input, top_p, temperature],
+        outputs=understanding_output
+    )
+    generation_button.click(
+        fn=generate_image,
+        inputs=[prompt_input, seed_input, cfg_weight_input, step_input],
+        outputs=image_output
+    )
+# NOTE(review): the app is launched at import time with a public share link.
+demo.launch(share=True)

demo/app_januspro.py ADDED Viewed

	@@ -0,0 +1,294 @@

+import gradio as gr
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM
+from janus.models import MultiModalityCausalLM, VLChatProcessor
+from janus.utils.io import load_pil_images
+from demo.cam import generate_gradcam, GradCAM, AttentionGuidedCAM
+from PIL import Image
+from einops import rearrange
+import numpy as np
+import os
+import time
+# import spaces  # Import spaces for ZeroGPU compatibility
+# Module-level setup: load the Janus-Pro checkpoint and its chat processor once,
+# at import time, and pick the dtype/device used by the functions below.
+# model_path = "deepseek-ai/Janus-Pro-7B"
+model_path = "deepseek-ai/Janus-Pro-1B"
+config = AutoConfig.from_pretrained(model_path)
+language_config = config.language_config
+# Force the eager attention implementation for the language model
+# (presumably so attention tensors are available to the CAM code — TODO confirm).
+language_config._attn_implementation = 'eager'
+vl_gpt = AutoModelForCausalLM.from_pretrained(model_path,
+                                             language_config=language_config,
+                                             trust_remote_code=True)
+# bfloat16 when CUDA is available, float16 otherwise.
+dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float16
+# dtype = torch.bfloat32 if torch.cuda.is_available() else torch.float32
+if torch.cuda.is_available():
+    vl_gpt = vl_gpt.to(dtype).cuda()
+else:
+    # vl_gpt = vl_gpt.to(torch.float16)
+    # NOTE(review): the non-CUDA path assumes an Apple "mps" device exists;
+    # there is no plain-CPU fallback here.
+    torch.set_default_device("mps")
+    vl_gpt = vl_gpt.to(dtype)
+vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
+tokenizer = vl_chat_processor.tokenizer
+# Despite the name, this holds 'mps' when CUDA is unavailable.
+cuda_device = 'cuda' if torch.cuda.is_available() else 'mps'
+# Not wrapped in torch.inference_mode(): the CAM step below needs gradients.
+# @spaces.GPU(duration=120)
+def multimodal_understanding(image, question, seed, top_p, temperature, target_token_idx):
+    """Answer ``question`` about ``image`` and build a saliency map.
+
+    Returns ``(answer, [cam], target_token_decoded)``: the decoded generation,
+    a one-element list holding the overlay produced by ``generate_gradcam``,
+    and the decoded arg-max token at position ``target_token_idx`` of the
+    logits returned by the CAM forward pass.
+    """
+    torch.cuda.empty_cache()
+    # Enable gradients on every model parameter before the CAM pass.
+    for weight in vl_gpt.parameters():
+        weight.requires_grad = True
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    torch.cuda.manual_seed(seed)
+    user_turn = {
+        "role": "<|User|>",
+        "content": f"<image_placeholder>\n{question}",
+        "images": [image],
+    }
+    assistant_turn = {"role": "<|Assistant|>", "content": ""}
+    prepare_inputs = vl_chat_processor(
+        conversations=[user_turn, assistant_turn],
+        images=[Image.fromarray(image)],
+        force_batchify=True,
+    ).to(cuda_device, dtype=dtype)
+    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+    eos_id = tokenizer.eos_token_id
+    outputs = vl_gpt.language_model.generate(
+        inputs_embeds=inputs_embeds,
+        attention_mask=prepare_inputs.attention_mask,
+        pad_token_id=eos_id,
+        bos_token_id=tokenizer.bos_token_id,
+        eos_token_id=eos_id,
+        max_new_tokens=512,
+        # Sampling is switched off only for a temperature of exactly zero.
+        do_sample=(temperature != 0),
+        use_cache=True,
+        temperature=temperature,
+        top_p=top_p,
+    )
+    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
+    print("answer generated")
+    # Hooks are placed on the blocks of the vision tower.
+    gradcam = AttentionGuidedCAM(vl_gpt, vl_gpt.vision_model.vision_tower.blocks)
+    cam_tensor, output, grid_size = gradcam.generate_cam(
+        prepare_inputs, tokenizer, temperature, top_p, target_token_idx
+    )
+    cam = generate_gradcam(cam_tensor.reshape(grid_size, grid_size), image)
+    logits_np = output.logits.detach().to(float).cpu().numpy()
+    # Arg-max over the last axis, then drop the leading axis of size one.
+    predicted_ids = logits_np.argmax(axis=-1).squeeze(0)
+    target_token_decoded = tokenizer.decode(predicted_ids[target_token_idx].tolist())
+    return answer, [cam], target_token_decoded
+def generate(input_ids,
+             width,
+             height,
+             temperature: float = 1,
+             parallel_size: int = 5,
+             cfg_weight: float = 5,
+             image_token_num_per_image: int = 576,
+             patch_size: int = 16):
+    """Generate image tokens autoregressively with classifier-free guidance.
+
+    Args:
+        input_ids: 1-D tensor of prompt token ids.
+        width, height: target size, used only to derive the decode shape.
+        temperature: softmax temperature; values <= 0 select the arg-max
+            token instead of sampling.
+        parallel_size: number of images generated in one batch.
+        cfg_weight: guidance weight applied to (cond - uncond) logits.
+        image_token_num_per_image: number of tokens generated per image.
+        patch_size: divisor applied to ``width``/``height`` for decoding.
+
+    Returns:
+        ``(generated_tokens, patches)`` where ``generated_tokens`` is an int
+        tensor of shape ``(parallel_size, image_token_num_per_image)`` and
+        ``patches`` is the output of ``gen_vision_model.decode_code``.
+    """
+    # Clear CUDA cache before generating
+    torch.cuda.empty_cache()
+    # Rows alternate conditional / unconditional: odd rows keep only the
+    # first and last prompt token, everything in between becomes padding.
+    tokens = torch.zeros((parallel_size * 2, len(input_ids)), dtype=torch.int).to(cuda_device)
+    for i in range(parallel_size * 2):
+        tokens[i, :] = input_ids
+        if i % 2 != 0:
+            tokens[i, 1:-1] = vl_chat_processor.pad_id
+    inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
+    generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).to(cuda_device)
+    pkv = None
+    with torch.no_grad():
+        for i in range(image_token_num_per_image):
+            outputs = vl_gpt.language_model.model(inputs_embeds=inputs_embeds,
+                                                use_cache=True,
+                                                past_key_values=pkv)
+            pkv = outputs.past_key_values
+            hidden_states = outputs.last_hidden_state
+            logits = vl_gpt.gen_head(hidden_states[:, -1, :])
+            logit_cond = logits[0::2, :]
+            logit_uncond = logits[1::2, :]
+            logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
+            if temperature > 0:
+                probs = torch.softmax(logits / temperature, dim=-1)
+                next_token = torch.multinomial(probs, num_samples=1)
+            else:
+                # Dividing by a zero temperature yields inf/NaN probabilities,
+                # which torch.multinomial rejects; decode greedily instead.
+                next_token = torch.argmax(logits, dim=-1, keepdim=True)
+            generated_tokens[:, i] = next_token.squeeze(dim=-1)
+            # Duplicate each sampled token so the conditional and the
+            # unconditional row of a pair are fed the same token next step.
+            next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
+            img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
+            inputs_embeds = img_embeds.unsqueeze(dim=1)
+    patches = vl_gpt.gen_vision_model.decode_code(generated_tokens.to(dtype=torch.int),
+                                                 shape=[parallel_size, 8, width // patch_size, height // patch_size])
+    return generated_tokens.to(dtype=torch.int), patches
+def unpack(dec, width, height, parallel_size=5):
+    """Map decoded patches from [-1, 1] to a uint8 array of images.
+
+    The result has shape ``(parallel_size, width, height, 3)``; axis 1 of
+    ``dec`` becomes the trailing axis.
+    """
+    as_numpy = dec.to(torch.float32).cpu().numpy()
+    pixels = np.clip((as_numpy.transpose(0, 2, 3, 1) + 1) / 2 * 255, 0, 255)
+    visual_img = np.zeros((parallel_size, width, height, 3), dtype=np.uint8)
+    # Writing into the preallocated buffer casts the floats to uint8.
+    visual_img[...] = pixels
+    return visual_img
+@torch.inference_mode()
+# @spaces.GPU(duration=120)  # Specify a duration to avoid timeout
+def generate_image(prompt,
+                   seed=None,
+                   guidance=5,
+                   t2i_temperature=1.0):
+    """Generate five 384x384 images for ``prompt``, returned upscaled to 768x768.
+
+    ``seed`` (when not None) seeds torch, CUDA and NumPy; ``guidance`` and
+    ``t2i_temperature`` are forwarded to ``generate`` as ``cfg_weight`` and
+    ``temperature``.
+    """
+    torch.cuda.empty_cache()
+    if seed is not None:
+        torch.manual_seed(seed)
+        torch.cuda.manual_seed(seed)
+        np.random.seed(seed)
+    width = height = 384
+    parallel_size = 5
+    # Round both sides down to a multiple of 16 before generating.
+    snapped_width = width // 16 * 16
+    snapped_height = height // 16 * 16
+    with torch.no_grad():
+        conversation = [
+            {'role': '<|User|>', 'content': prompt},
+            {'role': '<|Assistant|>', 'content': ''},
+        ]
+        sft_prompt = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(
+            conversations=conversation,
+            sft_format=vl_chat_processor.sft_format,
+            system_prompt='',
+        )
+        token_ids = tokenizer.encode(sft_prompt + vl_chat_processor.image_start_tag)
+        _, patches = generate(torch.LongTensor(token_ids),
+                              snapped_width,
+                              snapped_height,
+                              cfg_weight=guidance,
+                              parallel_size=parallel_size,
+                              temperature=t2i_temperature)
+        images = unpack(patches,
+                        snapped_width,
+                        snapped_height,
+                        parallel_size=parallel_size)
+        gallery = []
+        for index in range(parallel_size):
+            gallery.append(Image.fromarray(images[index]).resize((768, 768), Image.LANCZOS))
+        return gallery
+# Gradio interface: a multimodal-understanding section with a saliency map
+# gallery, followed by a text-to-image section. Layout follows creation order.
+with gr.Blocks() as demo:
+    gr.Markdown(value="# Multimodal Understanding")
+    with gr.Row():
+        # Left column: input image and the saliency overlay produced for it.
+        with gr.Column():
+            image_input = gr.Image()
+            saliency_map_output = gr.Gallery(label="Saliency Map", columns=1, rows=1, height=300)
+        # Right column: question, sampling controls and the token index to explain.
+        with gr.Column():
+            question_input = gr.Textbox(label="Question")
+            und_seed_input = gr.Number(label="Seed", precision=0, value=42)
+            top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
+            temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")
+            target_token_idx = gr.Number(label="target_token_idx", precision=0, value=300)
+    understanding_button = gr.Button("Chat")
+    understanding_output = gr.Textbox(label="Response")
+    understanding_target_token_decoded_output = gr.Textbox(label="Target Token Decoded")
+    # Each example row is [question, image path], matching `inputs` below.
+    examples_inpainting = gr.Examples(
+        label="Multimodal Understanding examples",
+        examples=[
+            [
+                "explain this meme",
+                "images/doge.png",
+            ],
+            [
+                "Convert the formula into latex code.",
+                "images/equation.png",
+            ],
+        ],
+        inputs=[question_input, image_input],
+    )
+    gr.Markdown(value="# Text-to-Image Generation")
+    with gr.Row():
+        cfg_weight_input = gr.Slider(minimum=1, maximum=10, value=5, step=0.5, label="CFG Weight")
+        # NOTE(review): this slider allows 0, which `generate` uses as a divisor.
+        t2i_temperature = gr.Slider(minimum=0, maximum=1, value=1.0, step=0.05, label="temperature")
+    prompt_input = gr.Textbox(label="Prompt. (Prompt in more detail can help produce better images!)")
+    seed_input = gr.Number(label="Seed (Optional)", precision=0, value=12345)
+    generation_button = gr.Button("Generate Images")
+    image_output = gr.Gallery(label="Generated Images", columns=2, rows=2, height=300)
+    # Prompt-only examples for the text-to-image section.
+    examples_t2i = gr.Examples(
+        label="Text to image generation examples.",
+        examples=[
+            "Master shifu racoon wearing drip attire as a street gangster.",
+            "The face of a beautiful girl",
+            "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k",
+            "A glass of red wine on a reflective surface.",
+            "A cute and adorable baby fox with big brown eyes, autumn leaves in the background enchanting,immortal,fluffy, shiny mane,Petals,fairyism,unreal engine 5 and Octane Render,highly detailed, photorealistic, cinematic, natural colors.",
+            "The image features an intricately designed eye set against a circular backdrop adorned with ornate swirl patterns that evoke both realism and surrealism. At the center of attention is a strikingly vivid blue iris surrounded by delicate veins radiating outward from the pupil to create depth and intensity. The eyelashes are long and dark, casting subtle shadows on the skin around them which appears smooth yet slightly textured as if aged or weathered over time.\n\nAbove the eye, there's a stone-like structure resembling part of classical architecture, adding layers of mystery and timeless elegance to the composition. This architectural element contrasts sharply but harmoniously with the organic curves surrounding it. Below the eye lies another decorative motif reminiscent of baroque artistry, further enhancing the overall sense of eternity encapsulated within each meticulously crafted detail. \n\nOverall, the atmosphere exudes a mysterious aura intertwined seamlessly with elements suggesting timelessness, achieved through the juxtaposition of realistic textures and surreal artistic flourishes. Each component\u2014from the intricate designs framing the eye to the ancient-looking stone piece above\u2014contributes uniquely towards creating a visually captivating tableau imbued with enigmatic allure.",
+        ],
+        inputs=prompt_input,
+    )
+    # The order of `inputs`/`outputs` must match the callbacks' parameters and return values.
+    understanding_button.click(
+        multimodal_understanding,
+        inputs=[image_input, question_input, und_seed_input, top_p, temperature, target_token_idx],
+        outputs=[understanding_output, saliency_map_output, understanding_target_token_decoded_output]
+    )
+    generation_button.click(
+        fn=generate_image,
+        inputs=[prompt_input, seed_input, cfg_weight_input, t2i_temperature],
+        outputs=image_output
+    )
+# NOTE(review): the app is launched at import time with a public share link.
+demo.launch(share=True)
+# demo.queue(concurrency_count=1, max_size=10).launch(server_name="0.0.0.0", server_port=37906, root_path="/path")

demo/app_vqa.py ADDED Viewed

	@@ -0,0 +1,333 @@

+import gradio as gr
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM
+from janus.models import MultiModalityCausalLM, VLChatProcessor
+from janus.utils.io import load_pil_images
+from demo.cam import generate_gradcam, AttentionGuidedCAMJanus, AttentionGuidedCAMClip
+from demo.model_utils import Clip_Utils, Janus_Utils, add_title_to_image
+import numpy as np
+import matplotlib.pyplot as plt
+import gc
+from PIL import Image
+# Module-level setup, executed once at import time.
+# Seed torch, NumPy and CUDA before the models are initialised.
+model_seed = 42
+torch.manual_seed(model_seed)
+np.random.seed(model_seed)
+torch.cuda.manual_seed(model_seed)
+model_type = "Janus-1B"
+janus_utils = Janus_Utils()
+# Only the part after the last '-' ("1B") is handed to init_Janus.
+vl_gpt, tokenizer = janus_utils.init_Janus(model_type.split('-')[-1])
+# The CLIP model is initialised as well; it is reached through `clip_utils`.
+clip_utils = Clip_Utils()
+clip_utils.init_Clip()
+# Not wrapped in torch.inference_mode(): Grad-CAM needs gradients.
+# @spaces.GPU(duration=120)
+def _select_target_layers(all_layers, layer_min, layer_max):
+    """Return the layers numbered ``layer_min``..``layer_max`` (1-based, inclusive)."""
+    low, high = sorted((int(layer_min), int(layer_max)))
+    selected = all_layers[max(low, 1) - 1:high]
+    if not selected:
+        raise IndexError(
+            f"visualization layers {low}..{high} are outside 1..{len(all_layers)}"
+        )
+    return selected
+def multimodal_understanding(model_type,
+                             saliency_map_method,
+                             visual_pooling_method,
+                             image, question, seed, top_p, temperature, target_token_idx,
+                             visualization_layer_min, visualization_layer_max, focus):
+    """Run the selected model on ``image``/``question`` and build saliency maps.
+
+    Args:
+        model_type: "Clip" or "Janus-1B".
+        saliency_map_method: "GradCAM" or "Attention_Map".
+        visual_pooling_method: pooling choice forwarded to the CAM classes.
+        image, question: the inputs to explain.
+        seed: seeds torch, NumPy and CUDA.
+        top_p, temperature: sampling parameters for Janus generation.
+        target_token_idx: token index to explain; -1 means every token when
+            the focus is the language model.
+        visualization_layer_min, visualization_layer_max: 1-based inclusive
+            range of layers to hook.
+        focus: "Visual Encoder" or "Language Model".
+
+    Returns:
+        ``(answer, cam, input_text_decoded)``: the generated answer (empty for
+        Clip), a list of saliency images (empty when nothing was produced)
+        and the decoded target input token (empty when not applicable).
+    """
+    # Clear CUDA cache before generating
+    torch.cuda.empty_cache()
+    # set seed
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    torch.cuda.manual_seed(seed)
+    # Defaults so every combination of options returns well-defined values.
+    answer = ""
+    cam = []
+    input_text_decoded = ""
+    if model_type == "Clip":
+        inputs = clip_utils.prepare_inputs([question], image)
+        if saliency_map_method == "GradCAM":
+            all_layers = [layer.layer_norm1 for layer in clip_utils.model.vision_model.encoder.layers]
+            # Use the values passed in by Gradio; the component objects'
+            # `.value` only holds the initial slider position.
+            target_layers = _select_target_layers(all_layers, visualization_layer_min, visualization_layer_max)
+            grad_cam = AttentionGuidedCAMClip(clip_utils.model, target_layers)
+            try:
+                cam_map, _, _ = grad_cam.generate_cam(inputs, class_idx=0, visual_pooling_method=visual_pooling_method)
+            finally:
+                # Always detach the hooks so they do not pile up on the shared model.
+                grad_cam.remove_hooks()
+            cam = [generate_gradcam(cam_map, image, size=(224, 224))]
+    elif model_type == "Janus-1B":
+        for param in vl_gpt.parameters():
+            param.requires_grad = True
+        prepare_inputs = janus_utils.prepare_inputs(question, image)
+        inputs_embeds = janus_utils.generate_inputs_embeddings(prepare_inputs)
+        outputs = janus_utils.generate_outputs(inputs_embeds, prepare_inputs, temperature, top_p)
+        sequences = outputs.sequences.cpu().tolist()
+        answer = tokenizer.decode(sequences[0], skip_special_tokens=True)
+        attention_raw = outputs.attentions
+        print("answer generated")
+        input_ids = prepare_inputs.input_ids[0].cpu().tolist()
+        input_ids_decoded = [tokenizer.decode([token_id]) for token_id in input_ids]
+        # Offset added to a CAM index to find its token in the decoded input.
+        # NOTE(review): presumably the position of the first question token
+        # after the image tokens; the value is hard-coded — TODO confirm.
+        start = 620
+        if saliency_map_method == "GradCAM":
+            if focus == "Visual Encoder":
+                all_layers = [block.norm1 for block in vl_gpt.vision_model.vision_tower.blocks]
+            else:
+                all_layers = [layer.self_attn for layer in vl_gpt.language_model.model.layers]
+            target_layers = _select_target_layers(all_layers, visualization_layer_min, visualization_layer_max)
+            gradcam = AttentionGuidedCAMJanus(vl_gpt, target_layers)
+            try:
+                cam_tensors, grid_size = gradcam.generate_cam(prepare_inputs, tokenizer, temperature, top_p, target_token_idx, visual_pooling_method, focus)
+            finally:
+                # The Clip path already did this; without it every request
+                # leaves another set of hooks on the shared model.
+                gradcam.remove_hooks()
+            if focus == "Visual Encoder":
+                cam_grid = cam_tensors.reshape(grid_size, grid_size)
+                cam = [generate_gradcam(cam_grid, image)]
+            elif target_token_idx != -1:
+                # One map for the requested token; out-of-range indices
+                # yield an empty gallery instead of an unbound variable.
+                if 0 <= target_token_idx < len(cam_tensors):
+                    input_text_decoded = input_ids_decoded[start + target_token_idx]
+                    cam_grid = cam_tensors[target_token_idx].reshape(grid_size, grid_size)
+                    cam_i = generate_gradcam(cam_grid, image)
+                    cam = [add_title_to_image(cam_i, input_text_decoded)]
+            else:
+                # One titled map per token.
+                for i, cam_tensor in enumerate(cam_tensors):
+                    cam_grid = cam_tensor.reshape(grid_size, grid_size)
+                    cam_i = generate_gradcam(cam_grid, image)
+                    cam.append(add_title_to_image(cam_i, input_ids_decoded[start + i]))
+        elif saliency_map_method == "Attention_Map":
+            attn_m_token = attention_raw[target_token_idx]
+            img_token_positions = prepare_inputs.images_seq_mask
+            mask = img_token_positions[0]
+            # Entry 1 of the per-token attentions, restricted to the key
+            # positions flagged as image tokens.
+            tg = attn_m_token[1][:, :, :, :len(mask)]
+            tg = tg[:, :, :, mask]
+            # Maximum over dim 1 (assumed to be the head axis — TODO confirm).
+            res, _ = tg.max(dim=1)
+            res = res.to(torch.float32)
+            grid_size = int(res.shape[-1] ** 0.5)
+            # NOTE(review): this view only succeeds when the leading
+            # dimensions of `res` multiply to one.
+            res = res.view(grid_size, grid_size)
+            cam = [generate_gradcam(res, image)]
+    return answer, cam, input_text_decoded
+# Gradio interface
+def update_sliders(model):
+    """Build the layer sliders and focus dropdown that match ``model``.
+
+    "Clip" gets sliders up to 12 and only the "Visual Encoder" focus; any
+    other value gets sliders up to 24 and an additional "Language Model"
+    focus. Both sliders start at their maximum.
+    """
+    if model == "Clip":
+        layer_count = 12
+        focus_choices = ["Visual Encoder"]
+    else:
+        layer_count = 24
+        focus_choices = ["Visual Encoder", "Language Model"]
+    min_slider = gr.Slider(minimum=1, maximum=layer_count, value=layer_count, step=1, label="visualization layers min")
+    max_slider = gr.Slider(minimum=1, maximum=layer_count, value=layer_count, step=1, label="visualization layers max")
+    focus_dropdown = gr.Dropdown(choices=focus_choices, value="Visual Encoder", label="focus")
+    return (min_slider, max_slider, focus_dropdown)
+def update_visualization_layers_sliders(focus):
+    """Build the saliency-method dropdown and layer sliders for ``focus``.
+
+    "Visual Encoder" offers only GradCAM with both sliders at 24; any other
+    focus also offers Attention_Map and puts both sliders at 9. The sliders
+    always range from 1 to 24.
+    """
+    if focus == "Visual Encoder":
+        methods = ["GradCAM"]
+        default_layer = 24
+    else:
+        methods = ["GradCAM", "Attention_Map"]
+        default_layer = 9
+    method_dropdown = gr.Dropdown(choices=methods, value="GradCAM", label="saliency map type")
+    min_slider = gr.Slider(minimum=1, maximum=24, value=default_layer, step=1, label="visualization layers min")
+    max_slider = gr.Slider(minimum=1, maximum=24, value=default_layer, step=1, label="visualization layers max")
+    return (method_dropdown, min_slider, max_slider)
+# Gradio interface for the VQA saliency demo. Layout follows creation order.
+with gr.Blocks() as demo:
+    gr.Markdown(value="# Multimodal Understanding")
+    with gr.Row():
+        # Left column: input image and resulting saliency maps.
+        with gr.Column():
+            image_input = gr.Image()
+            saliency_map_output = gr.Gallery(label="Saliency Map", columns=1)
+        # Right column: model / method selection and generation controls.
+        with gr.Column():
+            model_selector = gr.Dropdown(choices=["Clip", "Janus-1B"], value="Clip", label="model")
+            focus = gr.Dropdown(choices=["Visual Encoder"], value="Visual Encoder", label="focus")
+            saliency_map_method = gr.Dropdown(choices=["GradCAM"], value="GradCAM", label="saliency map type")
+            visual_pooling_method = gr.Dropdown(choices=["CLS", "max", "avg"], value="CLS", label="visual pooling method")
+            visualization_layers_min = gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers min")
+            visualization_layers_max = gr.Slider(minimum=1, maximum=12, value=12, step=1, label="visualization layers max")
+            question_input = gr.Textbox(label="Question")
+            und_seed_input = gr.Number(label="Seed", precision=0, value=42)
+            top_p = gr.Slider(minimum=0, maximum=1, value=0.95, step=0.05, label="top_p")
+            temperature = gr.Slider(minimum=0, maximum=1, value=0.1, step=0.05, label="temperature")
+            target_token_idx = gr.Number(label="target_token_idx (-1 means all)", precision=0, value=-1)
+        # Changing the model rebuilds the layer sliders and the focus choices.
+        model_selector.change(
+            fn=update_sliders,
+            inputs=model_selector,
+            outputs=[
+                visualization_layers_min,
+                visualization_layers_max,
+                focus
+            ]
+        )
+        # Changing the focus rebuilds the saliency-method choices and sliders.
+        focus.change(
+            fn = update_visualization_layers_sliders,
+            inputs = focus,
+            outputs=[
+                saliency_map_method,
+                visualization_layers_min,
+                visualization_layers_max,
+            ]
+        )
+    understanding_button = gr.Button("Chat")
+    understanding_output = gr.Textbox(label="Response")
+    understanding_target_token_decoded_output = gr.Textbox(label="Target Token Decoded")
+    # Each example row is [question, image path], matching `inputs` below.
+    examples_inpainting = gr.Examples(
+        label="Multimodal Understanding examples",
+        examples=[
+            [
+                "What is the approximate global smartphone market share of Samsung?",
+                "images/PieChart.png"
+            ],
+            [
+                "What is the average internet speed in Japan?",
+                "images/BarChart.png"
+            ],
+            [
+                "What was the average price of coffee beans in October 2019?",
+                "images/AreaChart.png"
+            ],
+            [
+                "Which city's metro system has the largest number of stations?",
+                "images/BubbleChart.png"
+            ],
+            [
+                "True/False: In 2020, the unemployment rate for Washington (WA) was higher than that of Wisconsin (WI).",
+                "images/Choropleth_New.png"
+            ],
+            [
+                "What distance have customers traveled in the taxi the most?",
+                "images/Histogram.png"
+            ],
+            [
+                "What was the price of a barrel of oil in February 2020?",
+                "images/LineChart.png"
+            ],
+            [
+                "True/False: eBay is nested in the Software category.",
+                # The committed file is images/TreeMap.png; the path is
+                # case-sensitive on Linux hosts.
+                "images/TreeMap.png"
+            ],
+            [
+                "True/False: There is a negative linear relationship between the height and the weight of the 85 males.",
+                "images/Scatterplot.png"
+            ],
+            [
+                "Which country has the lowest proportion of Gold medals?",
+                "images/Stacked100.png"
+            ],
+            [
+                "What was the ratio of girls named 'Isla' to girls named 'Amelia' in 2012 in the UK?",
+                "images/StackedArea.png"
+            ],
+            [
+                "What is the cost of peanuts in Seoul?",
+                "images/StackedBar.png"
+            ],
+            # [
+            #     "explain this meme",
+            #     "images/doge.png",
+            # ],
+            # [
+            #     "Convert the formula into latex code.",
+            #     "images/equation.png",
+            # ],
+        ],
+        inputs=[question_input, image_input],
+    )
+    # The order of `inputs` must match multimodal_understanding's parameters.
+    understanding_button.click(
+        multimodal_understanding,
+        inputs=[model_selector, saliency_map_method, visual_pooling_method, image_input, question_input, und_seed_input, top_p, temperature, target_token_idx,
+                visualization_layers_min, visualization_layers_max, focus],
+        outputs=[understanding_output, saliency_map_output, understanding_target_token_decoded_output]
+    )
+demo.launch(share=True)
+# demo.queue(concurrency_count=1, max_size=10).launch(server_name="0.0.0.0", server_port=37906, root_path="/path")

demo/cam.py ADDED Viewed

	@@ -0,0 +1,486 @@

+import cv2
+import numpy as np
+import types
+import torch
+import torch.nn.functional as F
+import matplotlib.pyplot as plt
+from PIL import Image
+from torch import nn
+import spaces
+from demo.modify_llama import *
+class AttentionGuidedCAM:
+    """Base class that records layer outputs and their gradients via hooks.
+
+    Forward hooks append each target layer's output to ``self.activations``;
+    backward hooks append the gradient of that output to ``self.gradients``.
+    Subclasses implement ``generate_cam``.
+    """
+    def __init__(self, model, target_layers=None):
+        """Store ``model`` and register hooks on the target layers.
+
+        ``target_layers`` may be passed here or assigned to
+        ``self.target_layers`` by a subclass before calling this constructor;
+        when neither happens no hooks are registered.
+        """
+        self.model = model
+        if target_layers is not None:
+            self.target_layers = target_layers
+        elif not hasattr(self, "target_layers"):
+            # Keeps _register_hooks from failing on a missing attribute.
+            self.target_layers = []
+        self.gradients = []
+        self.activations = []
+        self.hooks = []
+        self._register_hooks()
+    def _register_hooks(self):
+        """ Registers forward and backward hooks on every target layer. """
+        for layer in self.target_layers:
+            self.hooks.append(layer.register_forward_hook(self._forward_hook))
+            # NOTE(review): PyTorch documents register_backward_hook as
+            # deprecated in favour of register_full_backward_hook.
+            self.hooks.append(layer.register_backward_hook(self._backward_hook))
+    def _forward_hook(self, module, input, output):
+        """ Stores the hooked layer's forward output. """
+        self.activations.append(output)
+    def _backward_hook(self, module, grad_in, grad_out):
+        """ Stores the gradient with respect to the layer's first output. """
+        self.gradients.append(grad_out[0])
+    def remove_hooks(self):
+        """ Remove all registered hooks; safe to call more than once. """
+        for hook in self.hooks:
+            hook.remove()
+        # Forget the handles so they are not kept alive or removed again.
+        self.hooks.clear()
+    @spaces.GPU(duration=120)
+    def generate_cam(self, input_tensor, class_idx=None):
+        """ Compute the saliency map; must be implemented by subclasses. """
+        raise NotImplementedError
+class AttentionGuidedCAMClip(AttentionGuidedCAM):
+    """Gradient-weighted saliency over the hooked layers of an image/text model."""
+    def __init__(self, model, target_layers):
+        # Assigned first: the base constructor registers hooks on self.target_layers.
+        self.target_layers = target_layers
+        super().__init__(model)
+    @spaces.GPU(duration=120)
+    def generate_cam(self, input_tensor, class_idx=None, visual_pooling_method="CLS"):
+        """ Generates a saliency heatmap from the hooked layers.
+
+        Args:
+            input_tensor: mapping of keyword arguments for ``self.model``.
+            class_idx: index of the text embedding to explain; when None the
+                arg-max of ``output_full.logits`` is used.
+            visual_pooling_method: "CLS" uses ``image_embeds``, "avg" the mean
+                and anything else the max of the projected hidden states.
+
+        Returns:
+            ``(cam, output_full, grid_size)`` where ``cam`` is a detached
+            ``grid_size x grid_size`` map scaled to [0, 1] (all zeros when
+            the map is constant).
+
+        Raises:
+            RuntimeError: if no activation/gradient pair was recorded.
+        """
+        # Start from a clean slate so repeated calls do not mix records.
+        self.activations.clear()
+        self.gradients.clear()
+        # Forward pass
+        output_full = self.model(**input_tensor)
+        if class_idx is None:
+            class_idx = torch.argmax(output_full.logits, dim=1).item()
+        if visual_pooling_method == "CLS":
+            output = output_full.image_embeds
+        elif visual_pooling_method == "avg":
+            output = self.model.visual_projection(output_full.vision_model_output.last_hidden_state).mean(dim=1)
+        else:
+            # project -> pooling
+            output, _ = self.model.visual_projection(output_full.vision_model_output.last_hidden_state).max(dim=1)
+        # Back-propagate using the selected text embedding as the upstream gradient.
+        output.backward(output_full.text_embeds[class_idx:class_idx+1], retain_graph=True)
+        self.model.zero_grad()
+        cam_sum = None
+        # Forward hooks fire first-to-last layer, backward hooks last-to-first,
+        # so the gradient list is reversed to pair each layer with itself.
+        for act, grad in zip(self.activations, reversed(self.gradients)):
+            act = F.relu(act[0])
+            grad_weights = grad.mean(dim=-1, keepdim=True)
+            # Strongest gradient-weighted feature per token.
+            cam, _ = (act * grad_weights).max(dim=-1)
+            # Sum across all layers
+            cam_sum = cam if cam_sum is None else cam_sum + cam
+        if cam_sum is None:
+            raise RuntimeError("no activations/gradients were recorded; check target_layers")
+        cam_sum = F.relu(cam_sum)
+        # Zero out everything below the 20th percentile.
+        cam_sum = cam_sum.to(torch.float32)
+        percentile = torch.quantile(cam_sum, 0.2)
+        cam_sum[cam_sum < percentile] = 0
+        # First batch entry, skipping token 0 (presumably the CLS token — TODO confirm).
+        cam_sum = cam_sum[0, 1:]
+        num_patches = cam_sum.shape[-1]
+        grid_size = int(num_patches ** 0.5)
+        print(f"Detected grid size: {grid_size}x{grid_size}")
+        cam_sum = cam_sum.view(grid_size, grid_size).detach()
+        # Min-max normalise; a constant map would otherwise divide by zero.
+        value_range = cam_sum.max() - cam_sum.min()
+        if value_range > 0:
+            cam_sum = (cam_sum - cam_sum.min()) / value_range
+        else:
+            cam_sum = torch.zeros_like(cam_sum)
+        return cam_sum, output_full, grid_size
+class AttentionGuidedCAMJanus(AttentionGuidedCAM):
+    def __init__(self, model, target_layers):
+        self.target_layers = target_layers
+        super().__init__(model)
+        self._modify_layers()
+        self._register_hooks_activations()
+    def _modify_layers(self):
+        for layer in self.target_layers:
+            setattr(layer, "attn_gradients", None)
+            setattr(layer, "attention_map", None)
+            layer.save_attn_gradients = types.MethodType(save_attn_gradients, layer)
+            layer.get_attn_gradients = types.MethodType(get_attn_gradients, layer)
+            layer.save_attn_map = types.MethodType(save_attn_map, layer)
+            layer.get_attn_map = types.MethodType(get_attn_map, layer)
+    def _forward_activate_hooks(self, module, input, output):
+        attn_output, attn_weights = output  # Unpack outputs
+        module.save_attn_map(attn_weights)
+        attn_weights.register_hook(module.save_attn_gradients)
+    def _register_hooks_activations(self):
+        for layer in self.target_layers:
+            if hasattr(layer, "q_proj"): # is an attention layer
+                self.hooks.append(layer.register_forward_hook(self._forward_activate_hooks))
+    @spaces.GPU(duration=120)
+    def generate_cam(self, input_tensor, tokenizer, temperature, top_p, class_idx=None, visual_pooling_method="CLS", focus="Visual Encoder"):
+        """ Generates Grad-CAM heatmap for ViT. """
+        # Forward pass
+        image_embeddings, inputs_embeddings, outputs = self.model(input_tensor, tokenizer, temperature, top_p)
+        input_ids = input_tensor.input_ids
+        if focus == "Visual Encoder":
+            # Pooling
+            if visual_pooling_method == "CLS":
+                image_embeddings_pooled = image_embeddings[:, 0, :]
+            elif visual_pooling_method == "avg":
+                image_embeddings_pooled = image_embeddings[:, 1:, :].mean(dim=1) # end of image: 618
+            elif visual_pooling_method == "max":
+                image_embeddings_pooled, _ = image_embeddings[:, 1:, :].max(dim=1)
+            print("image_embeddings_shape: ", image_embeddings_pooled.shape)
+            inputs_embeddings_pooled = inputs_embeddings[:, 620: -4].mean(dim=1)
+            self.model.zero_grad()
+            image_embeddings_pooled.backward(inputs_embeddings_pooled, retain_graph=True)
+            cam_sum = None
+            for act, grad in zip(self.activations, self.gradients):
+                # act = torch.sigmoid(act)
+                act = F.relu(act[0])
+                # Compute mean of gradients
+                grad_weights = grad.mean(dim=-1, keepdim=True)
+                print("act shape", act.shape)
+                print("grad_weights shape", grad_weights.shape)
+                cam, _ = (act * grad_weights).max(dim=-1)
+                print(cam.shape)
+                # Sum across all layers
+                if cam_sum is None:
+                    cam_sum = cam
+                else:
+                    cam_sum += cam
+            # Normalize
+            cam_sum = F.relu(cam_sum)
+            # thresholding
+            cam_sum = cam_sum.to(torch.float32)
+            percentile = torch.quantile(cam_sum, 0.2)  # Adjust threshold dynamically
+            cam_sum[cam_sum < percentile] = 0
+            # Reshape
+            # if visual_pooling_method == "CLS":
+            cam_sum = cam_sum[0, 1:]
+            print("cam_sum shape: ", cam_sum.shape)
+            num_patches = cam_sum.shape[-1]  # Last dimension of CAM output
+            grid_size = int(num_patches ** 0.5)
+            print(f"Detected grid size: {grid_size}x{grid_size}")
+            cam_sum = cam_sum.view(grid_size, grid_size)
+            cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
+            cam_sum = cam_sum.detach().to("cpu")
+            return cam_sum, grid_size
+        elif focus == "Language Model":
+            loss = self.target_layers[-1].attention_map.sum()
+            self.model.zero_grad()
+            loss.backward()
+            self.activations = [layer.get_attn_map() for layer in self.target_layers]
+            self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
+            cam_sum = None
+            for act, grad in zip(self.activations, self.gradients):
+                # act = torch.sigmoid(act)
+                print("act:", act)
+                print(len(act))
+                print("act_shape:", act.shape)
+                # print("act1_shape:", act[1].shape)
+                act = F.relu(act.mean(dim=1))
+                # Compute mean of gradients
+                print("grad:", grad)
+                print(len(grad))
+                print("grad_shape:", grad.shape)
+                grad_weights = grad.mean(dim=1)
+                print("act:", act)
+                print("act shape", act.shape)
+                print("grad_weights shape", grad_weights.shape)
+                # cam, _ = (act * grad_weights).max(dim=-1)
+                # cam = act * grad_weights
+                cam = act * grad_weights
+                print(cam.shape)
+                # Sum across all layers
+                if cam_sum is None:
+                    cam_sum = cam
+                else:
+                    cam_sum += cam
+            # Normalize
+            cam_sum = F.relu(cam_sum)
+            # cam_sum = cam_sum - cam_sum.min()
+            # cam_sum = cam_sum / cam_sum.max()
+            # thresholding
+            cam_sum = cam_sum.to(torch.float32)
+            percentile = torch.quantile(cam_sum, 0.2)  # Adjust threshold dynamically
+            cam_sum[cam_sum < percentile] = 0
+            # Reshape
+            # if visual_pooling_method == "CLS":
+            # cam_sum = cam_sum[0, 1:]
+            # cam_sum shape: [1, seq_len, seq_len]
+            cam_sum_lst = []
+            cam_sum_raw = cam_sum
+            for i in range(620, cam_sum_raw.shape[1]):
+                cam_sum = cam_sum_raw[:, i, :] # shape: [1: seq_len]
+                cam_sum = cam_sum[input_tensor.images_seq_mask].unsqueeze(0) # shape: [1, 576]
+                print("cam_sum shape: ", cam_sum.shape)
+                num_patches = cam_sum.shape[-1]  # Last dimension of CAM output
+                grid_size = int(num_patches ** 0.5)
+                print(f"Detected grid size: {grid_size}x{grid_size}")
+                # Fix the reshaping step dynamically
+                cam_sum = cam_sum.view(grid_size, grid_size)
+                cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
+                cam_sum = cam_sum.detach().to("cpu")
+                cam_sum_lst.append(cam_sum)
+            return cam_sum_lst, grid_size
+        # Aggregate activations and gradients from ALL layers
+class AttentionGuidedCAMLLaVA(AttentionGuidedCAM):
+    def __init__(self, model, target_layers):
+        self.target_layers = target_layers
+        super().__init__(model)
+        self._modify_layers()
+        self._register_hooks_activations()
+    def _modify_layers(self):
+        for layer in self.target_layers:
+            setattr(layer, "attn_gradients", None)
+            setattr(layer, "attention_map", None)
+            layer.save_attn_gradients = types.MethodType(save_attn_gradients, layer)
+            layer.get_attn_gradients = types.MethodType(get_attn_gradients, layer)
+            layer.save_attn_map = types.MethodType(save_attn_map, layer)
+            layer.get_attn_map = types.MethodType(get_attn_map, layer)
+    def _forward_activate_hooks(self, module, input, output):
+        attn_output, attn_weights = output  # Unpack outputs
+        attn_weights.requires_grad_()
+        module.save_attn_map(attn_weights)
+        attn_weights.register_hook(module.save_attn_gradients)
+    def _register_hooks_activations(self):
+        for layer in self.target_layers:
+            if hasattr(layer, "q_proj"): # is an attention layer
+                self.hooks.append(layer.register_forward_hook(self._forward_activate_hooks))
+    @spaces.GPU(duration=120)
+    def generate_cam(self, input_tensor, tokenizer, temperature, top_p, class_idx=None, visual_pooling_method="CLS", focus="Visual Encoder"):
+        """ Generates Grad-CAM heatmap for ViT. """
+        # Forward pass
+        outputs_raw = self.model(**input_tensor)
+        if focus == "Language Model":
+            loss = self.target_layers[-1].attention_map.sum()
+            self.model.zero_grad()
+            loss.backward()
+            self.activations = [layer.get_attn_map() for layer in self.target_layers]
+            self.gradients = [layer.get_attn_gradients() for layer in self.target_layers]
+            cam_sum = None
+            for act, grad in zip(self.activations, self.gradients):
+                # act = torch.sigmoid(act)
+                print("act:", act)
+                print(len(act))
+                print("act_shape:", act.shape)
+                # print("act1_shape:", act[1].shape)
+                act = F.relu(act.mean(dim=1))
+                # Compute mean of gradients
+                print("grad:", grad)
+                print(len(grad))
+                print("grad_shape:", grad.shape)
+                grad_weights = grad.mean(dim=1)
+                print("act:", act)
+                print("act shape", act.shape)
+                print("grad_weights shape", grad_weights.shape)
+                # cam, _ = (act * grad_weights).max(dim=-1)
+                # cam = act * grad_weights
+                cam = act * grad_weights
+                print(cam.shape)
+                # Sum across all layers
+                if cam_sum is None:
+                    cam_sum = cam
+                else:
+                    cam_sum += cam
+            # Normalize
+            cam_sum = F.relu(cam_sum)
+            # cam_sum = cam_sum - cam_sum.min()
+            # cam_sum = cam_sum / cam_sum.max()
+            # thresholding
+            cam_sum = cam_sum.to(torch.float32)
+            percentile = torch.quantile(cam_sum, 0.2)  # Adjust threshold dynamically
+            cam_sum[cam_sum < percentile] = 0
+            # Reshape
+            # if visual_pooling_method == "CLS":
+            # cam_sum = cam_sum[0, 1:]
+            # cam_sum shape: [1, seq_len, seq_len]
+            cam_sum_lst = []
+            cam_sum_raw = cam_sum
+            grid_size = 32
+            for i in range(512, cam_sum_raw.shape[1]):
+                cam_sum = cam_sum_raw[:, i, :] # shape: [1: seq_len]
+                cam_sum = cam_sum[input_tensor.images_seq_mask].unsqueeze(0) # shape: [1, 576]
+                print("cam_sum shape: ", cam_sum.shape)
+                num_patches = cam_sum.shape[-1]  # Last dimension of CAM output
+                grid_size = int(num_patches ** 0.5)
+                print(f"Detected grid size: {grid_size}x{grid_size}")
+                # Fix the reshaping step dynamically
+                cam_sum = cam_sum.view(grid_size, grid_size)
+                cam_sum = (cam_sum - cam_sum.min()) / (cam_sum.max() - cam_sum.min())
+                cam_sum = cam_sum.detach().to("cpu")
+                cam_sum_lst.append(cam_sum)
+            return cam_sum_lst, grid_size
+def generate_gradcam(
+    cam,
+    image,
+    size = (384, 384),
+    alpha=0.5,
+    colormap=cv2.COLORMAP_JET,
+    aggregation='mean',
+    normalize=True
+):
+    """
+    Generates a Grad-CAM heatmap overlay on top of the input image.
+    Parameters:
+      attributions (torch.Tensor): A tensor of shape (C, H, W) representing the
+        intermediate activations or gradients at the target layer.
+      image (PIL.Image): The original image.
+      alpha (float): The blending factor for the heatmap overlay (default 0.5).
+      colormap (int): OpenCV colormap to apply (default cv2.COLORMAP_JET).
+      aggregation (str): How to aggregate across channels; either 'mean' or 'sum'.
+    Returns:
+      PIL.Image: The image overlaid with the Grad-CAM heatmap.
+    """
+    print("Generating Grad-CAM with shape:", cam.shape)
+    if normalize:
+        cam_min, cam_max = cam.min(), cam.max()
+        cam = cam - cam_min
+        cam = cam / (cam_max - cam_min)
+    # Convert tensor to numpy array
+    cam = torch.nn.functional.interpolate(cam.unsqueeze(0).unsqueeze(0), size=size, mode='bilinear').squeeze()
+    cam_np = cam.squeeze().detach().cpu().numpy()
+    # Apply Gaussian blur for smoother heatmaps
+    cam_np = cv2.GaussianBlur(cam_np, (5,5), sigmaX=0.8)
+    # Resize the cam to match the image size
+    width, height = size
+    cam_resized = cv2.resize(cam_np, (width, height))
+    # Convert the normalized map to a heatmap (0-255 uint8)
+    heatmap = np.uint8(255 * cam_resized)
+    heatmap = cv2.applyColorMap(heatmap, colormap)
+    # OpenCV produces heatmaps in BGR, so convert to RGB for consistency
+    heatmap = cv2.cvtColor(heatmap, cv2.COLOR_BGR2RGB)
+    # Convert original image to a numpy array
+    image_np = np.array(image)
+    image_np = cv2.resize(image_np, (width, height))
+    # Blend the heatmap with the original image
+    overlay = cv2.addWeighted(image_np, 1 - alpha, heatmap, alpha, 0)
+    return Image.fromarray(overlay)

demo/demo.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

demo/demo_attn.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

demo/fastapi_app.py ADDED Viewed

	@@ -0,0 +1,178 @@

+from fastapi import FastAPI, File, Form, UploadFile, HTTPException
+from fastapi.responses import JSONResponse, StreamingResponse
+import torch
+from transformers import AutoConfig, AutoModelForCausalLM
+from janus.models import MultiModalityCausalLM, VLChatProcessor
+from PIL import Image
+import numpy as np
+import io
+app = FastAPI()
+# Load model and processor
+model_path = "deepseek-ai/Janus-1.3B"
+config = AutoConfig.from_pretrained(model_path)
+language_config = config.language_config
+language_config._attn_implementation = 'eager'
+vl_gpt = AutoModelForCausalLM.from_pretrained(model_path,
+                                              language_config=language_config,
+                                              trust_remote_code=True)
+vl_gpt = vl_gpt.to(torch.bfloat16).cuda()
+vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
+tokenizer = vl_chat_processor.tokenizer
+cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+@torch.inference_mode()
+def multimodal_understanding(image_data, question, seed, top_p, temperature):
+    torch.cuda.empty_cache()
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+    torch.cuda.manual_seed(seed)
+    conversation = [
+        {
+            "role": "User",
+            "content": f"<image_placeholder>\n{question}",
+            "images": [image_data],
+        },
+        {"role": "Assistant", "content": ""},
+    ]
+    pil_images = [Image.open(io.BytesIO(image_data))]
+    prepare_inputs = vl_chat_processor(
+        conversations=conversation, images=pil_images, force_batchify=True
+    ).to(cuda_device, dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float16)
+    inputs_embeds = vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+    outputs = vl_gpt.language_model.generate(
+        inputs_embeds=inputs_embeds,
+        attention_mask=prepare_inputs.attention_mask,
+        pad_token_id=tokenizer.eos_token_id,
+        bos_token_id=tokenizer.bos_token_id,
+        eos_token_id=tokenizer.eos_token_id,
+        max_new_tokens=512,
+        do_sample=False if temperature == 0 else True,
+        use_cache=True,
+        temperature=temperature,
+        top_p=top_p,
+    )
+    answer = tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True)
+    return answer
+@app.post("/understand_image_and_question/")
+async def understand_image_and_question(
+    file: UploadFile = File(...),
+    question: str = Form(...),
+    seed: int = Form(42),
+    top_p: float = Form(0.95),
+    temperature: float = Form(0.1)
+):
+    image_data = await file.read()
+    response = multimodal_understanding(image_data, question, seed, top_p, temperature)
+    return JSONResponse({"response": response})
+# Sample ``image_token_num_per_image`` tokens for ``parallel_size`` images, one
+# token per step, then decode them with ``vl_gpt.gen_vision_model``.
+# Returns (generated_tokens, patches).
+# NOTE(review): this function has no no_grad/inference_mode guard of its own;
+# generate_image() calls it from inside one.
+def generate(input_ids,
+             width,
+             height,
+             temperature: float = 1,
+             parallel_size: int = 5,
+             cfg_weight: float = 5,
+             image_token_num_per_image: int = 576,
+             patch_size: int = 16):
+    torch.cuda.empty_cache()
+    # Two rows per image: even rows keep the prompt, odd rows have everything
+    # except the first and last token replaced by the pad id.
+    tokens = torch.zeros((parallel_size * 2, len(input_ids)), dtype=torch.int).to(cuda_device)
+    for i in range(parallel_size * 2):
+        tokens[i, :] = input_ids
+        if i % 2 != 0:
+            tokens[i, 1:-1] = vl_chat_processor.pad_id
+    inputs_embeds = vl_gpt.language_model.get_input_embeddings()(tokens)
+    generated_tokens = torch.zeros((parallel_size, image_token_num_per_image), dtype=torch.int).to(cuda_device)
+    # Key/value cache carried from one step to the next.
+    pkv = None
+    for i in range(image_token_num_per_image):
+        outputs = vl_gpt.language_model.model(inputs_embeds=inputs_embeds, use_cache=True, past_key_values=pkv)
+        pkv = outputs.past_key_values
+        hidden_states = outputs.last_hidden_state
+        logits = vl_gpt.gen_head(hidden_states[:, -1, :])
+        # Even rows come from the prompt, odd rows from the padded copy; blend
+        # them with weight ``cfg_weight``.
+        logit_cond = logits[0::2, :]
+        logit_uncond = logits[1::2, :]
+        logits = logit_uncond + cfg_weight * (logit_cond - logit_uncond)
+        # NOTE(review): a temperature of 0 divides by zero here.
+        probs = torch.softmax(logits / temperature, dim=-1)
+        next_token = torch.multinomial(probs, num_samples=1)
+        generated_tokens[:, i] = next_token.squeeze(dim=-1)
+        # Feed the same sampled token to both rows of each image on the next step.
+        next_token = torch.cat([next_token.unsqueeze(dim=1), next_token.unsqueeze(dim=1)], dim=1).view(-1)
+        img_embeds = vl_gpt.prepare_gen_img_embeds(next_token)
+        inputs_embeds = img_embeds.unsqueeze(dim=1)
+    # NOTE(review): the token count is a separate argument and is not derived
+    # from width/height/patch_size; the 576 default equals (384 // 16) ** 2.
+    patches = vl_gpt.gen_vision_model.decode_code(
+        generated_tokens.to(dtype=torch.int),
+        shape=[parallel_size, 8, width // patch_size, height // patch_size]
+    )
+    return generated_tokens.to(dtype=torch.int), patches
+def unpack(dec, width, height, parallel_size=5):
+    dec = dec.to(torch.float32).cpu().numpy().transpose(0, 2, 3, 1)
+    dec = np.clip((dec + 1) / 2 * 255, 0, 255)
+    visual_img = np.zeros((parallel_size, width, height, 3), dtype=np.uint8)
+    visual_img[:, :, :] = dec
+    return visual_img
+@torch.inference_mode()
+def generate_image(prompt, seed, guidance):
+    torch.cuda.empty_cache()
+    seed = seed if seed is not None else 12345
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    np.random.seed(seed)
+    width = 384
+    height = 384
+    parallel_size = 5
+    with torch.no_grad():
+        messages = [{'role': 'User', 'content': prompt}, {'role': 'Assistant', 'content': ''}]
+        text = vl_chat_processor.apply_sft_template_for_multi_turn_prompts(
+            conversations=messages,
+            sft_format=vl_chat_processor.sft_format,
+            system_prompt=''
+        )
+        text = text + vl_chat_processor.image_start_tag
+        input_ids = torch.LongTensor(tokenizer.encode(text))
+        _, patches = generate(input_ids, width // 16 * 16, height // 16 * 16, cfg_weight=guidance, parallel_size=parallel_size)
+        images = unpack(patches, width // 16 * 16, height // 16 * 16)
+        return [Image.fromarray(images[i]).resize((1024, 1024), Image.LANCZOS) for i in range(parallel_size)]
+@app.post("/generate_images/")
+async def generate_images(
+    prompt: str = Form(...),
+    seed: int = Form(None),
+    guidance: float = Form(5.0),
+):
+    try:
+        images = generate_image(prompt, seed, guidance)
+        def image_stream():
+            for img in images:
+                buf = io.BytesIO()
+                img.save(buf, format='PNG')
+                buf.seek(0)
+                yield buf.read()
+        return StreamingResponse(image_stream(), media_type="multipart/related")
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=f"Image generation failed: {str(e)}")
+# Script entry point: serve the app with uvicorn on port 8000, bound to all
+# network interfaces (0.0.0.0).
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8000)

demo/fastapi_client.py ADDED Viewed

	@@ -0,0 +1,78 @@

+import requests
+from PIL import Image
+import io
+# Endpoint URLs
+# Both point at a server on localhost, port 8000.
+understand_image_url = "http://localhost:8000/understand_image_and_question/"
+generate_images_url = "http://localhost:8000/generate_images/"
+# Use your image file path here
+# (a relative path, resolved against the current working directory)
+image_path = "images/equation.png"
+# Function to call the image understanding endpoint
+def understand_image_and_question(image_path, question, seed=42, top_p=0.95, temperature=0.1):
+    files = {'file': open(image_path, 'rb')}
+    data = {
+        'question': question,
+        'seed': seed,
+        'top_p': top_p,
+        'temperature': temperature
+    }
+    response = requests.post(understand_image_url, files=files, data=data)
+    response_data = response.json()
+    print("Image Understanding Response:", response_data['response'])
+# Function to call the text-to-image generation endpoint
+def generate_images(prompt, seed=None, guidance=5.0):
+    data = {
+        'prompt': prompt,
+        'seed': seed,
+        'guidance': guidance
+    }
+    response = requests.post(generate_images_url, data=data, stream=True)
+    if response.ok:
+        img_idx = 1
+        # We will create a new BytesIO for each image
+        buffers = {}
+        try:
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:
+                    # Use a boundary detection to determine new image start
+                    if img_idx not in buffers:
+                        buffers[img_idx] = io.BytesIO()
+                    buffers[img_idx].write(chunk)
+                    # Attempt to open the image
+                    try:
+                        buffer = buffers[img_idx]
+                        buffer.seek(0)
+                        image = Image.open(buffer)
+                        img_path = f"generated_image_{img_idx}.png"
+                        image.save(img_path)
+                        print(f"Saved: {img_path}")
+                        # Prepare the next image buffer
+                        buffer.close()
+                        img_idx += 1
+                    except Exception as e:
+                        # Continue loading data into the current buffer
+                        continue
+        except Exception as e:
+            print("Error processing image:", e)
+    else:
+        print("Failed to generate images.")
+# Example usage
+# Both calls need the server to be reachable at the URLs defined above.
+if __name__ == "__main__":
+    # Call the image understanding API
+    understand_image_and_question(image_path, "What is this image about?")
+    # Call the image generation API
+    generate_images("A beautiful sunset over a mountain range, digital art.")

demo/model_utils.py ADDED Viewed

	@@ -0,0 +1,208 @@

+import torch
+import numpy as np
+import spaces
+from PIL import Image, ImageDraw, ImageFont
+from transformers import AutoConfig, AutoModelForCausalLM, LlavaForConditionalGeneration, AutoProcessor
+from transformers import CLIPProcessor, CLIPModel
+from janus.models import MultiModalityCausalLM, VLChatProcessor
+@spaces.GPU(duration=120)
+def set_dtype_device(model, precision=16):
+    dtype = (torch.bfloat16 if torch.cuda.is_available() else torch.float16) if precision==16 else (torch.bfloat32 if torch.cuda.is_available() else torch.float32)
+    cuda_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    if torch.cuda.is_available():
+        model = model.to(dtype).cuda()
+    else:
+        torch.set_default_device("cpu")
+        model = model.to(dtype)
+    return model, dtype, cuda_device
+class Model_Utils:
+    def __init__(self):
+        pass
+    @spaces.GPU(duration=120)
+    def prepare_inputs(self):
+        raise NotImplementedError
+    @spaces.GPU(duration=120)
+    def generate_outputs(self):
+        raise NotImplementedError
+class Clip_Utils(Model_Utils):
+    def __init__(self):
+        self.edge = 224
+        super().__init__()
+    def init_Clip(self):
+        self.model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+        self.processor.feature_extractor.size = {"height": self.edge, "width": self.edge}
+    @spaces.GPU(duration=120)
+    def prepare_inputs(self, question_lst, image):
+        image = Image.fromarray(image)
+        print("image_size: ", image.size)
+        inputs = self.processor(text=question_lst, images=image, return_tensors="pt", padding=True)
+        return inputs
+class Janus_Utils(Model_Utils):
+    def __init__(self):
+        super().__init__()
+    def init_Janus(self, num_params="1B"):
+        model_path = f"deepseek-ai/Janus-Pro-{num_params}"
+        config = AutoConfig.from_pretrained(model_path)
+        language_config = config.language_config
+        language_config._attn_implementation = 'eager'
+        self.vl_gpt = AutoModelForCausalLM.from_pretrained(model_path,
+                                                    language_config=language_config,
+                                                    trust_remote_code=True,
+                                                    ignore_mismatched_sizes=True,
+                                                    )
+        self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
+        self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path)
+        self.tokenizer = self.vl_chat_processor.tokenizer
+        return self.vl_gpt, self.tokenizer
+    @spaces.GPU(duration=120)
+    def prepare_inputs(self, question, image):
+        conversation = [
+            {
+                "role": "<|User|>",
+                "content": f"<image_placeholder>\n{question}",
+                "images": [image],
+            },
+            {"role": "<|Assistant|>", "content": ""},
+        ]
+        pil_images = [Image.fromarray(image)]
+        prepare_inputs = self.vl_chat_processor(
+            conversations=conversation, images=pil_images, force_batchify=True
+        ).to(self.cuda_device, dtype=self.dtype)
+        return prepare_inputs
+    @spaces.GPU(duration=120)
+    def generate_inputs_embeddings(self, prepare_inputs):
+        return self.vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+    @spaces.GPU(duration=120)
+    def generate_outputs(self, inputs_embeds, prepare_inputs, temperature, top_p, with_attn=False):
+        outputs = self.vl_gpt.language_model.generate(
+            inputs_embeds=inputs_embeds,
+            attention_mask=prepare_inputs.attention_mask,
+            pad_token_id=self.tokenizer.eos_token_id,
+            bos_token_id=self.tokenizer.bos_token_id,
+            eos_token_id=self.tokenizer.eos_token_id,
+            max_new_tokens=512,
+            do_sample=False if temperature == 0 else True,
+            use_cache=True,
+            temperature=temperature,
+            top_p=top_p,
+            return_dict_in_generate=True,
+            output_attentions=True
+        )
+        return outputs
+class LLaVA_Utils(Model_Utils):
+    def __init__(self):
+        super().__init__()
+    def init_LLaVA(self):
+        model_path = f"llava-hf/llava-1.5-7b-hf"
+        config = AutoConfig.from_pretrained(model_path)
+        self.vl_gpt = LlavaForConditionalGeneration.from_pretrained(model_path,
+                                                    low_cpu_mem_usage=True,
+                                                    attn_implementation = 'eager',
+                                                    output_attentions=True
+                                                    )
+        self.vl_gpt, self.dtype, self.cuda_device = set_dtype_device(self.vl_gpt)
+        self.processor = AutoProcessor.from_pretrained(model_path)
+        self.tokenizer = self.processor.tokenizer
+        return self.vl_gpt, self.tokenizer
+    @spaces.GPU(duration=120)
+    def prepare_inputs(self, question, image):
+        conversation = [
+            {
+            "role": "user",
+            "content": [
+                {"type": "text", "text": question},
+                {"type": "image"},
+                ],
+            },
+        ]
+        prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
+        pil_images = [Image.fromarray(image)]
+        prepare_inputs = self.processor(
+            images=pil_images, text=prompt, return_tensors="pt"
+        ).to(self.cuda_device, dtype=self.dtype)
+        return prepare_inputs
+    @spaces.GPU(duration=120)
+    def generate_inputs_embeddings(self, prepare_inputs):
+        return self.vl_gpt.prepare_inputs_embeds(**prepare_inputs)
+    @spaces.GPU(duration=120)
+    def generate_outputs(self, prepare_inputs, temperature, top_p):
+        outputs = self.vl_gpt.generate(
+            **prepare_inputs,
+            max_new_tokens=512,
+            do_sample=False if temperature == 0 else True,
+            use_cache=True,
+            return_dict_in_generate=True,
+            output_attentions=True
+        )
+        return outputs
+def add_title_to_image(image, title, font_size=20):
+    """Adds a title above an image using PIL and textbbox()."""
+    img_width, img_height = image.size
+    # Create a blank image for title
+    title_height = font_size + 10  # Some padding
+    title_image = Image.new("RGB", (img_width, title_height), color=(255, 255, 255))  # White background
+    draw = ImageDraw.Draw(title_image)
+    # Load font
+    try:
+        font = ImageFont.truetype("arial.ttf", font_size)  # Use Arial if available
+    except:
+        font = ImageFont.load_default()  # Use default if Arial not found
+    # Get text size (updated for PIL >= 10)
+    text_bbox = draw.textbbox((0, 0), title, font=font)
+    text_width = text_bbox[2] - text_bbox[0]
+    text_height = text_bbox[3] - text_bbox[1]
+    # Center the title
+    text_position = ((img_width - text_width) // 2, (title_height - text_height) // 2)
+    draw.text(text_position, title, fill="black", font=font)
+    # Concatenate title with image
+    combined = Image.new("RGB", (img_width, img_height + title_height))
+    combined.paste(title_image, (0, 0))  # Place title at the top
+    combined.paste(image, (0, title_height))  # Place original image below
+    return combined

demo/modify_llama.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Accessors written as plain functions that take ``self``, so they can be
+# bound to an existing object as methods.
+# Store ``attn_gradients`` on the object.
+def save_attn_gradients(self, attn_gradients):
+    self.attn_gradients = attn_gradients
+# Return the value last stored by save_attn_gradients.
+def get_attn_gradients(self):
+    return self.attn_gradients
+# Store ``attention_map`` on the object.
+def save_attn_map(self, attention_map):
+    self.attention_map = attention_map
+# Return the value last stored by save_attn_map.
+def get_attn_map(self):
+    return self.attention_map

demo/visualize_architecture.ipynb ADDED Viewed

	@@ -0,0 +1,1715 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\Austi\\anaconda3\\envs\\janus_env\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+      "  from .autonotebook import tqdm as notebook_tqdm\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Python version is above 3.10, patching the collections module.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "c:\\Users\\Austi\\anaconda3\\envs\\janus_env\\lib\\site-packages\\transformers\\models\\auto\\image_processing_auto.py:590: FutureWarning: The image_processor_class argument is deprecated and will be removed in v4.42. Please use `slow_image_processor_class`, or `fast_image_processor_class` instead\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "import gradio as gr\n",
+    "import torch\n",
+    "from transformers import AutoConfig, AutoModelForCausalLM\n",
+    "from janus.models import MultiModalityCausalLM, VLChatProcessor\n",
+    "from janus.utils.io import load_pil_images\n",
+    "from demo.cam import generate_gradcam, AttentionGuidedCAM\n",
+    "from captum.attr import LayerGradCam\n",
+    "from PIL import Image\n",
+    "from einops import rearrange\n",
+    "\n",
+    "import numpy as np\n",
+    "import matplotlib.pyplot as plt\n",
+    "import os\n",
+    "import time\n",
+    "\n",
+    "import torch.nn.functional as F\n",
+    "from scipy.ndimage import filters\n",
+    "from torch import nn\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Usage Class Token:  True\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of MultiModalityCausalLM were not initialized from the model checkpoint at deepseek-ai/Janus-Pro-1B and are newly initialized: ['vision_model.vision_tower.cls_token']\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "Some weights of MultiModalityCausalLM were not initialized from the model checkpoint at deepseek-ai/Janus-Pro-1B and are newly initialized because the shapes did not match:\n",
+      "- vision_model.vision_tower.pos_embed: found shape torch.Size([1, 576, 1024]) in the checkpoint and torch.Size([1, 577, 1024]) in the model instantiated\n",
+      "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
+      "Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.\n",
+      "You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message.\n",
+      "Some kwargs in processor config are unused and will not have any effect: num_image_tokens, sft_format, image_tag, ignore_id, add_special_token, mask_prompt. \n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "model_path = \"deepseek-ai/Janus-Pro-1B\"\n",
+    "config = AutoConfig.from_pretrained(model_path)\n",
+    "language_config = config.language_config\n",
+    "language_config._attn_implementation = 'eager'\n",
+    "vl_gpt = AutoModelForCausalLM.from_pretrained(model_path,\n",
+    "                                             language_config=language_config,\n",
+    "                                             trust_remote_code=True,\n",
+    "                                             ignore_mismatched_sizes=True # Adding CLS token, will be handled manually\n",
+    "                                             )\n",
+    "\n",
+    "dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float16\n",
+    "# dtype = torch.bfloat32 if torch.cuda.is_available() else torch.float32\n",
+    "\n",
+    "if torch.cuda.is_available():\n",
+    "    vl_gpt = vl_gpt.to(dtype).cuda()\n",
+    "else:\n",
+    "    # vl_gpt = vl_gpt.to(torch.float16)\n",
+    "    torch.set_default_device(\"mps\")\n",
+    "    vl_gpt = vl_gpt.to(dtype)\n",
+    "\n",
+    "vl_chat_processor = VLChatProcessor.from_pretrained(model_path)\n",
+    "tokenizer = vl_chat_processor.tokenizer\n",
+    "cuda_device = 'cuda' if torch.cuda.is_available() else 'mps'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CLIPVisionTower(\n",
+      "  (vision_tower): VisionTransformer(\n",
+      "    (patch_embed): PatchEmbed(\n",
+      "      (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))\n",
+      "      (norm): Identity()\n",
+      "    )\n",
+      "    (pos_drop): Dropout(p=0.0, inplace=False)\n",
+      "    (patch_drop): Identity()\n",
+      "    (norm_pre): Identity()\n",
+      "    (blocks): Sequential(\n",
+      "      (0): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (1): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (2): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (3): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (4): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (5): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (6): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (7): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (8): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (9): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (10): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (11): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (12): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (13): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (14): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (15): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (16): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (17): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (18): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (19): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (20): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (21): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (22): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "      (23): Block(\n",
+      "        (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (attn): Attention(\n",
+      "          (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "          (q_norm): Identity()\n",
+      "          (k_norm): Identity()\n",
+      "          (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "          (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "          (proj_drop): Identity()\n",
+      "        )\n",
+      "        (ls1): Identity()\n",
+      "        (drop_path1): Identity()\n",
+      "        (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "        (ls2): Identity()\n",
+      "        (drop_path2): Identity()\n",
+      "      )\n",
+      "    )\n",
+      "    (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "    (attn_pool): AttentionPoolLatent(\n",
+      "      (q): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "      (kv): Linear(in_features=1024, out_features=2048, bias=True)\n",
+      "      (q_norm): Identity()\n",
+      "      (k_norm): Identity()\n",
+      "      (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "      (proj_drop): Dropout(p=0.0, inplace=False)\n",
+      "      (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "      (mlp): Mlp(\n",
+      "        (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "        (act): GELU(approximate='none')\n",
+      "        (drop1): Dropout(p=0.0, inplace=False)\n",
+      "        (norm): Identity()\n",
+      "        (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "        (drop2): Dropout(p=0.0, inplace=False)\n",
+      "      )\n",
+      "    )\n",
+      "    (fc_norm): Identity()\n",
+      "    (head_drop): Dropout(p=0.0, inplace=False)\n",
+      "    (head): Identity()\n",
+      "  )\n",
+      ")\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(vl_gpt.vision_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "LlamaForCausalLM(\n",
+      "  (model): LlamaModel(\n",
+      "    (embed_tokens): Embedding(102400, 2048)\n",
+      "    (layers): ModuleList(\n",
+      "      (0-23): 24 x LlamaDecoderLayer(\n",
+      "        (self_attn): LlamaAttention(\n",
+      "          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
+      "          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
+      "          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
+      "          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
+      "        )\n",
+      "        (mlp): LlamaMLP(\n",
+      "          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)\n",
+      "          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)\n",
+      "          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)\n",
+      "          (act_fn): SiLU()\n",
+      "        )\n",
+      "        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-06)\n",
+      "        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-06)\n",
+      "      )\n",
+      "    )\n",
+      "    (norm): LlamaRMSNorm((2048,), eps=1e-06)\n",
+      "    (rotary_emb): LlamaRotaryEmbedding()\n",
+      "  )\n",
+      "  (lm_head): Linear(in_features=2048, out_features=102400, bias=False)\n",
+      ")\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(vl_gpt.language_model)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "MultiModalityCausalLM(\n",
+      "  (vision_model): CLIPVisionTower(\n",
+      "    (vision_tower): VisionTransformer(\n",
+      "      (patch_embed): PatchEmbed(\n",
+      "        (proj): Conv2d(3, 1024, kernel_size=(16, 16), stride=(16, 16))\n",
+      "        (norm): Identity()\n",
+      "      )\n",
+      "      (pos_drop): Dropout(p=0.0, inplace=False)\n",
+      "      (patch_drop): Identity()\n",
+      "      (norm_pre): Identity()\n",
+      "      (blocks): Sequential(\n",
+      "        (0): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (1): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (2): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (3): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (4): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (5): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (6): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (7): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (8): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (9): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (10): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (11): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (12): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (13): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (14): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (15): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (16): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (17): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (18): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (19): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (20): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (21): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (22): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "        (23): Block(\n",
+      "          (norm1): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (attn): Attention(\n",
+      "            (qkv): Linear(in_features=1024, out_features=3072, bias=True)\n",
+      "            (q_norm): Identity()\n",
+      "            (k_norm): Identity()\n",
+      "            (attn_drop): Dropout(p=0.0, inplace=False)\n",
+      "            (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "            (proj_drop): Identity()\n",
+      "          )\n",
+      "          (ls1): Identity()\n",
+      "          (drop_path1): Identity()\n",
+      "          (norm2): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "          (mlp): Mlp(\n",
+      "            (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "            (act): GELU(approximate='none')\n",
+      "            (drop1): Dropout(p=0.0, inplace=False)\n",
+      "            (norm): Identity()\n",
+      "            (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "            (drop2): Dropout(p=0.0, inplace=False)\n",
+      "          )\n",
+      "          (ls2): Identity()\n",
+      "          (drop_path2): Identity()\n",
+      "        )\n",
+      "      )\n",
+      "      (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "      (attn_pool): AttentionPoolLatent(\n",
+      "        (q): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "        (kv): Linear(in_features=1024, out_features=2048, bias=True)\n",
+      "        (q_norm): Identity()\n",
+      "        (k_norm): Identity()\n",
+      "        (proj): Linear(in_features=1024, out_features=1024, bias=True)\n",
+      "        (proj_drop): Dropout(p=0.0, inplace=False)\n",
+      "        (norm): LayerNorm((1024,), eps=1e-06, elementwise_affine=True)\n",
+      "        (mlp): Mlp(\n",
+      "          (fc1): Linear(in_features=1024, out_features=4096, bias=True)\n",
+      "          (act): GELU(approximate='none')\n",
+      "          (drop1): Dropout(p=0.0, inplace=False)\n",
+      "          (norm): Identity()\n",
+      "          (fc2): Linear(in_features=4096, out_features=1024, bias=True)\n",
+      "          (drop2): Dropout(p=0.0, inplace=False)\n",
+      "        )\n",
+      "      )\n",
+      "      (fc_norm): Identity()\n",
+      "      (head_drop): Dropout(p=0.0, inplace=False)\n",
+      "      (head): Identity()\n",
+      "    )\n",
+      "  )\n",
+      "  (aligner): MlpProjector(\n",
+      "    (layers): Sequential(\n",
+      "      (0): Linear(in_features=1024, out_features=2048, bias=True)\n",
+      "      (1): GELU(approximate='none')\n",
+      "      (2): Linear(in_features=2048, out_features=2048, bias=True)\n",
+      "    )\n",
+      "  )\n",
+      "  (gen_vision_model): VQModel(\n",
+      "    (encoder): Encoder(\n",
+      "      (conv_in): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "      (conv_blocks): ModuleList(\n",
+      "        (0-1): 2 x Module(\n",
+      "          (res): ModuleList(\n",
+      "            (0-1): 2 x ResnetBlock(\n",
+      "              (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)\n",
+      "              (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)\n",
+      "              (dropout): Dropout(p=0.0, inplace=False)\n",
+      "              (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "            )\n",
+      "          )\n",
+      "          (attn): ModuleList()\n",
+      "          (downsample): Downsample(\n",
+      "            (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(2, 2))\n",
+      "          )\n",
+      "        )\n",
+      "        (2): Module(\n",
+      "          (res): ModuleList(\n",
+      "            (0): ResnetBlock(\n",
+      "              (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)\n",
+      "              (conv1): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)\n",
+      "              (dropout): Dropout(p=0.0, inplace=False)\n",
+      "              (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (nin_shortcut): Conv2d(128, 256, kernel_size=(1, 1), stride=(1, 1))\n",
+      "            )\n",
+      "            (1): ResnetBlock(\n",
+      "              (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)\n",
+      "              (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)\n",
+      "              (dropout): Dropout(p=0.0, inplace=False)\n",
+      "              (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "            )\n",
+      "          )\n",
+      "          (attn): ModuleList()\n",
+      "          (downsample): Downsample(\n",
+      "            (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2))\n",
+      "          )\n",
+      "        )\n",
+      "        (3): Module(\n",
+      "          (res): ModuleList(\n",
+      "            (0-1): 2 x ResnetBlock(\n",
+      "              (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)\n",
+      "              (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)\n",
+      "              (dropout): Dropout(p=0.0, inplace=False)\n",
+      "              (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "            )\n",
+      "          )\n",
+      "          (attn): ModuleList()\n",
+      "          (downsample): Downsample(\n",
+      "            (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(2, 2))\n",
+      "          )\n",
+      "        )\n",
+      "        (4): Module(\n",
+      "          (res): ModuleList(\n",
+      "            (0): ResnetBlock(\n",
+      "              (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)\n",
+      "              (conv1): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "              (dropout): Dropout(p=0.0, inplace=False)\n",
+      "              (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (nin_shortcut): Conv2d(256, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "            )\n",
+      "            (1): ResnetBlock(\n",
+      "              (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "              (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "              (dropout): Dropout(p=0.0, inplace=False)\n",
+      "              (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "            )\n",
+      "          )\n",
+      "          (attn): ModuleList(\n",
+      "            (0-1): 2 x AttnBlock(\n",
+      "              (norm): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "              (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "              (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "              (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "              (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "            )\n",
+      "          )\n",
+      "        )\n",
+      "      )\n",
+      "      (mid): ModuleList(\n",
+      "        (0): ResnetBlock(\n",
+      "          (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "          (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "        )\n",
+      "        (1): AttnBlock(\n",
+      "          (norm): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "          (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "          (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "          (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "          (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "        )\n",
+      "        (2): ResnetBlock(\n",
+      "          (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "          (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "        )\n",
+      "      )\n",
+      "      (norm_out): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "      (conv_out): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "    )\n",
+      "    (decoder): Decoder(\n",
+      "      (conv_in): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "      (mid): ModuleList(\n",
+      "        (0): ResnetBlock(\n",
+      "          (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "          (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "        )\n",
+      "        (1): AttnBlock(\n",
+      "          (norm): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "          (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "          (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "          (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "          (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "        )\n",
+      "        (2): ResnetBlock(\n",
+      "          (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "          (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "          (dropout): Dropout(p=0.0, inplace=False)\n",
+      "          (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "        )\n",
+      "      )\n",
+      "      (conv_blocks): ModuleList(\n",
+      "        (0): Module(\n",
+      "          (res): ModuleList(\n",
+      "            (0-2): 3 x ResnetBlock(\n",
+      "              (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "              (conv1): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "              (dropout): Dropout(p=0.0, inplace=False)\n",
+      "              (conv2): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "            )\n",
+      "          )\n",
+      "          (attn): ModuleList(\n",
+      "            (0-2): 3 x AttnBlock(\n",
+      "              (norm): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "              (q): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "              (k): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "              (v): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "              (proj_out): Conv2d(512, 512, kernel_size=(1, 1), stride=(1, 1))\n",
+      "            )\n",
+      "          )\n",
+      "          (upsample): Upsample(\n",
+      "            (conv): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          )\n",
+      "        )\n",
+      "        (1): Module(\n",
+      "          (res): ModuleList(\n",
+      "            (0): ResnetBlock(\n",
+      "              (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)\n",
+      "              (conv1): Conv2d(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)\n",
+      "              (dropout): Dropout(p=0.0, inplace=False)\n",
+      "              (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (nin_shortcut): Conv2d(512, 256, kernel_size=(1, 1), stride=(1, 1))\n",
+      "            )\n",
+      "            (1-2): 2 x ResnetBlock(\n",
+      "              (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)\n",
+      "              (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)\n",
+      "              (dropout): Dropout(p=0.0, inplace=False)\n",
+      "              (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "            )\n",
+      "          )\n",
+      "          (attn): ModuleList()\n",
+      "          (upsample): Upsample(\n",
+      "            (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          )\n",
+      "        )\n",
+      "        (2): Module(\n",
+      "          (res): ModuleList(\n",
+      "            (0-2): 3 x ResnetBlock(\n",
+      "              (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)\n",
+      "              (conv1): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)\n",
+      "              (dropout): Dropout(p=0.0, inplace=False)\n",
+      "              (conv2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "            )\n",
+      "          )\n",
+      "          (attn): ModuleList()\n",
+      "          (upsample): Upsample(\n",
+      "            (conv): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          )\n",
+      "        )\n",
+      "        (3): Module(\n",
+      "          (res): ModuleList(\n",
+      "            (0): ResnetBlock(\n",
+      "              (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)\n",
+      "              (conv1): Conv2d(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)\n",
+      "              (dropout): Dropout(p=0.0, inplace=False)\n",
+      "              (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (nin_shortcut): Conv2d(256, 128, kernel_size=(1, 1), stride=(1, 1))\n",
+      "            )\n",
+      "            (1-2): 2 x ResnetBlock(\n",
+      "              (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)\n",
+      "              (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)\n",
+      "              (dropout): Dropout(p=0.0, inplace=False)\n",
+      "              (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "            )\n",
+      "          )\n",
+      "          (attn): ModuleList()\n",
+      "          (upsample): Upsample(\n",
+      "            (conv): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "          )\n",
+      "        )\n",
+      "        (4): Module(\n",
+      "          (res): ModuleList(\n",
+      "            (0-2): 3 x ResnetBlock(\n",
+      "              (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)\n",
+      "              (conv1): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "              (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)\n",
+      "              (dropout): Dropout(p=0.0, inplace=False)\n",
+      "              (conv2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "            )\n",
+      "          )\n",
+      "          (attn): ModuleList()\n",
+      "        )\n",
+      "      )\n",
+      "      (norm_out): GroupNorm(32, 128, eps=1e-06, affine=True)\n",
+      "      (conv_out): Conv2d(128, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n",
+      "    )\n",
+      "    (quantize): VectorQuantizer(\n",
+      "      (embedding): Embedding(16384, 8)\n",
+      "    )\n",
+      "    (quant_conv): Conv2d(256, 8, kernel_size=(1, 1), stride=(1, 1))\n",
+      "    (post_quant_conv): Conv2d(8, 256, kernel_size=(1, 1), stride=(1, 1))\n",
+      "  )\n",
+      "  (gen_aligner): MlpProjector(\n",
+      "    (layers): Sequential(\n",
+      "      (0): Linear(in_features=8, out_features=2048, bias=True)\n",
+      "      (1): GELU(approximate='none')\n",
+      "      (2): Linear(in_features=2048, out_features=2048, bias=True)\n",
+      "    )\n",
+      "  )\n",
+      "  (gen_head): vision_head(\n",
+      "    (output_mlp_projector): Linear(in_features=2048, out_features=2048, bias=True)\n",
+      "    (vision_activation): GELU(approximate='none')\n",
+      "    (vision_head): Linear(in_features=2048, out_features=16384, bias=True)\n",
+      "  )\n",
+      "  (gen_embed): Embedding(16384, 8)\n",
+      "  (language_model): LlamaForCausalLM(\n",
+      "    (model): LlamaModel(\n",
+      "      (embed_tokens): Embedding(102400, 2048)\n",
+      "      (layers): ModuleList(\n",
+      "        (0-23): 24 x LlamaDecoderLayer(\n",
+      "          (self_attn): LlamaAttention(\n",
+      "            (q_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
+      "            (k_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
+      "            (v_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
+      "            (o_proj): Linear(in_features=2048, out_features=2048, bias=False)\n",
+      "          )\n",
+      "          (mlp): LlamaMLP(\n",
+      "            (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)\n",
+      "            (up_proj): Linear(in_features=2048, out_features=5632, bias=False)\n",
+      "            (down_proj): Linear(in_features=5632, out_features=2048, bias=False)\n",
+      "            (act_fn): SiLU()\n",
+      "          )\n",
+      "          (input_layernorm): LlamaRMSNorm((2048,), eps=1e-06)\n",
+      "          (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-06)\n",
+      "        )\n",
+      "      )\n",
+      "      (norm): LlamaRMSNorm((2048,), eps=1e-06)\n",
+      "      (rotary_emb): LlamaRotaryEmbedding()\n",
+      "    )\n",
+      "    (lm_head): Linear(in_features=2048, out_features=102400, bias=False)\n",
+      "  )\n",
+      ")\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(vl_gpt)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "janus_env",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

images/AreaChart.png ADDED Viewed

images/BarChart.png ADDED Viewed

images/BubbleChart.png ADDED Viewed

images/Choropleth_New.png ADDED Viewed

images/Histogram.png ADDED Viewed

images/LineChart.png ADDED Viewed

images/PieChart.png ADDED Viewed

images/Scatterplot.png ADDED Viewed

images/Stacked100.png ADDED Viewed

images/StackedArea.png ADDED Viewed

images/StackedBar.png ADDED Viewed

images/TreeMap.png ADDED Viewed

images/badge.svg ADDED Viewed

images/cat_dog.png ADDED Viewed

images/doge.png ADDED Viewed

images/equation.png ADDED Viewed

images/logo.png ADDED Viewed

images/logo.svg ADDED Viewed

images/pie_chart.png ADDED Viewed

images/ve.png ADDED Viewed

janus/__init__.py ADDED Viewed

	@@ -0,0 +1,31 @@

+# Copyright (c) 2023-2024 DeepSeek.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+# Compatibility shim, executed when this package is imported: on Python 3.10
+# or newer, copy every public name of `collections.abc` onto `collections`.
+# NOTE(review): presumably for a dependency that still looks up ABCs such as
+# `collections.Mapping` on the top-level module -- confirm which one needs it.
+import sys
+if sys.version_info >= (3, 10):
+    print("Python version is above 3.10, patching the collections module.")
+    # Monkey patch collections: re-expose each name listed in
+    # `collections.abc.__all__` as an attribute of `collections`.
+    import collections
+    import collections.abc
+    for type_name in collections.abc.__all__:
+        setattr(collections, type_name, getattr(collections.abc, type_name))

janus/janusflow/__init__.py ADDED Viewed

	@@ -0,0 +1,31 @@

+# Copyright (c) 2023-2024 DeepSeek.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+# Compatibility shim, executed when this package is imported: on Python 3.10
+# or newer, copy every public name of `collections.abc` onto `collections`.
+# NOTE(review): janus/__init__.py contains the same shim and runs first when
+# this subpackage is imported, so the message below is printed twice and the
+# second pass re-assigns identical attributes -- confirm both copies are needed.
+import sys
+if sys.version_info >= (3, 10):
+    print("Python version is above 3.10, patching the collections module.")
+    # Monkey patch collections: re-expose each name listed in
+    # `collections.abc.__all__` as an attribute of `collections`.
+    import collections
+    import collections.abc
+    for type_name in collections.abc.__all__:
+        setattr(collections, type_name, getattr(collections.abc, type_name))

janus/janusflow/models/__init__.py ADDED Viewed

	@@ -0,0 +1,28 @@

+# Copyright (c) 2023-2024 DeepSeek.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+from .image_processing_vlm import VLMImageProcessor
+from .modeling_vlm import MultiModalityCausalLM
+from .processing_vlm import VLChatProcessor
+# Public names of this package (what `import *` exposes); each entry is one of
+# the classes imported above.
+__all__ = [
+    "VLMImageProcessor",
+    "VLChatProcessor",
+    "MultiModalityCausalLM",
+]

janus/janusflow/models/clip_encoder.py ADDED Viewed

	@@ -0,0 +1,122 @@

+# Copyright (c) 2023-2024 DeepSeek.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+from typing import Dict, List, Literal, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torchvision.transforms
+from einops import rearrange
+from janus.janusflow.models.siglip_vit import create_siglip_vit
+class CLIPVisionTower(nn.Module):
+    def __init__(
+        self,
+        model_name: str = "siglip_large_patch16_384",
+        image_size: Union[Tuple[int, int], int] = 336,
+        select_feature: str = "patch",
+        select_layer: int = -2,
+        select_layers: list = None,
+        ckpt_path: str = "",
+        pixel_mean: Optional[List[float]] = None,
+        pixel_std: Optional[List[float]] = None,
+        **kwargs,
+    ):
+        super().__init__()
+        self.model_name = model_name
+        self.select_feature = select_feature
+        self.select_layer = select_layer
+        self.select_layers = select_layers
+        vision_tower_params = {
+            "model_name": model_name,
+            "image_size": image_size,
+            "ckpt_path": ckpt_path,
+            "select_layer": select_layer,
+        }
+        vision_tower_params.update(kwargs)
+        self.vision_tower, self.forward_kwargs = self.build_vision_tower(
+            vision_tower_params
+        )
+        if pixel_mean is not None and pixel_std is not None:
+            image_norm = torchvision.transforms.Normalize(
+                mean=pixel_mean, std=pixel_std
+            )
+        else:
+            image_norm = None
+        self.image_norm = image_norm
+    def build_vision_tower(self, vision_tower_params):
+        if self.model_name.startswith("siglip"):
+            self.select_feature = "same"
+            vision_tower = create_siglip_vit(**vision_tower_params)
+            forward_kwargs = dict()
+        elif self.model_name.startswith("sam"):
+            vision_tower = create_sam_vit(**vision_tower_params)
+            forward_kwargs = dict()
+        else:  # huggingface
+            from transformers import CLIPVisionModel
+            vision_tower = CLIPVisionModel.from_pretrained(**vision_tower_params)
+            forward_kwargs = dict(output_hidden_states=True)
+        return vision_tower, forward_kwargs
+    def feature_select(self, image_forward_outs):
+        if isinstance(image_forward_outs, torch.Tensor):
+            # the output has been the self.select_layer"s features
+            image_features = image_forward_outs
+        else:
+            image_features = image_forward_outs.hidden_states[self.select_layer]
+        if self.select_feature == "patch":
+            # if the output has cls_token
+            image_features = image_features[:, 1:]
+        elif self.select_feature == "cls_patch":
+            image_features = image_features
+        elif self.select_feature == "same":
+            image_features = image_features
+        else:
+            raise ValueError(f"Unexpected select feature: {self.select_feature}")
+        return image_features
+    def forward(self, images):
+        """
+        Args:
+            images (torch.Tensor): [b, 3, H, W]
+        Returns:
+            image_features (torch.Tensor): [b, n_patch, d]
+        """
+        if self.image_norm is not None:
+            images = self.image_norm(images)
+        image_forward_outs = self.vision_tower(images, **self.forward_kwargs)
+        image_features = self.feature_select(image_forward_outs)
+        return image_features

janus/janusflow/models/image_processing_vlm.py ADDED Viewed

	@@ -0,0 +1,208 @@

+# Copyright (c) 2023-2024 DeepSeek.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+from typing import List, Tuple, Union
+import numpy as np
+import torch
+import torchvision
+import torchvision.transforms.functional
+from PIL import Image
+from transformers import AutoImageProcessor, PretrainedConfig
+from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
+from transformers.image_utils import to_numpy_array
+from transformers.utils import logging
+logger = logging.get_logger(__name__)
+ImageType = Union[np.ndarray, torch.Tensor, Image.Image]
+IMAGENET_MEAN = (0.48145466, 0.4578275, 0.40821073)
+IMAGENET_STD = (0.26862954, 0.26130258, 0.27577711)
+IMAGENET_INCEPTION_MEAN = (0.5, 0.5, 0.5)
+IMAGENET_INCEPTION_STD = (0.5, 0.5, 0.5)
+def expand2square(pil_img, background_color):
+    width, height = pil_img.size
+    if width == height:
+        return pil_img
+    elif width > height:
+        result = Image.new(pil_img.mode, (width, width), background_color)
+        result.paste(pil_img, (0, (width - height) // 2))
+        return result
+    else:
+        result = Image.new(pil_img.mode, (height, height), background_color)
+        result.paste(pil_img, ((height - width) // 2, 0))
+        return result
+class VLMImageProcessorConfig(PretrainedConfig):
+    model_type = "deepseek_vlm"
+    image_size: int
+    min_size: int
+    image_mean: Union[Tuple[float, float, float], List[float]]
+    image_std: Union[Tuple[float, float, float], List[float]]
+    rescale_factor: float
+    do_normalize: bool
+    def __init__(
+        self,
+        image_size: int,
+        min_size: int = 14,
+        image_mean: Union[Tuple[float, float, float], List[float]] = (
+            0.48145466,
+            0.4578275,
+            0.40821073,
+        ),
+        image_std: Union[Tuple[float, float, float], List[float]] = (
+            0.26862954,
+            0.26130258,
+            0.27577711,
+        ),
+        rescale_factor: float = 1.0 / 255.0,
+        do_normalize: bool = True,
+        **kwargs,
+    ):
+        self.image_size = image_size
+        self.min_size = min_size
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        super().__init__(**kwargs)
+class VLMImageProcessor(BaseImageProcessor):
+    model_input_names = ["pixel_values"]
+    def __init__(
+        self,
+        image_size: int,
+        min_size: int = 14,
+        image_mean: Union[Tuple[float, float, float], List[float]] = (
+            0.48145466,
+            0.4578275,
+            0.40821073,
+        ),
+        image_std: Union[Tuple[float, float, float], List[float]] = (
+            0.26862954,
+            0.26130258,
+            0.27577711,
+        ),
+        rescale_factor: float = 1.0 / 255.0,
+        do_normalize: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.image_size = image_size
+        self.rescale_factor = rescale_factor
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self.min_size = min_size
+        self.do_normalize = do_normalize
+        if image_mean is None:
+            self.background_color = (127, 127, 127)
+        else:
+            self.background_color = tuple([int(x * 255) for x in image_mean])
+    def resize(self, pil_img: Image) -> np.ndarray:
+        """
+        Args:
+            pil_img (PIL.Image): [H, W, 3] in PIL.Image in RGB
+        Returns:
+            x (np.ndarray): [3, self.image_size, self.image_size]
+        """
+        width, height = pil_img.size
+        max_size = max(width, height)
+        size = [
+            max(int(height / max_size * self.image_size), self.min_size),
+            max(int(width / max_size * self.image_size), self.min_size),
+        ]
+        if width <= 0 or height <= 0 or size[0] <= 0 or size[1] <= 0:
+            print(f"orig size = {pil_img.size}, new size = {size}")
+            raise ValueError("Invalid size!")
+        pil_img = torchvision.transforms.functional.resize(
+            pil_img,
+            size,
+            interpolation=torchvision.transforms.functional.InterpolationMode.BICUBIC,
+            antialias=True,
+        )
+        pil_img = expand2square(pil_img, self.background_color)
+        x = to_numpy_array(pil_img)
+        # [H, W, 3] -> [3, H, W]
+        x = np.transpose(x, (2, 0, 1))
+        return x
+    def preprocess(self, images, return_tensors: str = "pt", **kwargs) -> BatchFeature:
+        # resize and pad to [self.image_size, self.image_size]
+        # then convert from [H, W, 3] to [3, H, W]
+        images: List[np.ndarray] = [self.resize(image) for image in images]
+        # resacle from [0, 255] -> [0, 1]
+        images = [
+            self.rescale(
+                image=image,
+                scale=self.rescale_factor,
+                input_data_format="channels_first",
+            )
+            for image in images
+        ]
+        # normalize
+        if self.do_normalize:
+            images = [
+                self.normalize(
+                    image=image,
+                    mean=self.image_mean,
+                    std=self.image_std,
+                    input_data_format="channels_first",
+                )
+                for image in images
+            ]
+        data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+    @property
+    def default_shape(self):
+        return [3, self.image_size, self.image_size]
+# Register the config/processor pair with AutoImageProcessor at import time.
+AutoImageProcessor.register(VLMImageProcessorConfig, VLMImageProcessor)
+# Script entry: only constructs a processor with the inception statistics;
+# the instance is not used further here.
+if __name__ == "__main__":
+    image_processor = VLMImageProcessor(
+        image_size=1024,
+        image_mean=IMAGENET_INCEPTION_MEAN,
+        image_std=IMAGENET_INCEPTION_STD,
+        do_normalize=True,
+    )

janus/janusflow/models/modeling_vlm.py ADDED Viewed

	@@ -0,0 +1,226 @@

+# Copyright (c) 2023-2024 DeepSeek.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+from attrdict import AttrDict
+from einops import rearrange
+import torch
+from transformers.configuration_utils import PretrainedConfig
+from transformers import (
+    AutoConfig,
+    AutoModelForCausalLM,
+    PreTrainedModel,
+    LlamaConfig,
+    LlamaForCausalLM,
+)
+from transformers.models.llama.modeling_llama import LlamaRMSNorm
+from janus.janusflow.models.clip_encoder import CLIPVisionTower
+from janus.janusflow.models.uvit import ShallowUViTEncoder, ShallowUViTDecoder
+import torch.nn as nn
+def model_name_to_cls(cls_name):
+    if "CLIPVisionTower" in cls_name:
+        cls = CLIPVisionTower
+    elif "ShallowUViTEncoder" in cls_name:
+        cls = ShallowUViTEncoder
+    elif "ShallowUViTDecoder" in cls_name:
+        cls = ShallowUViTDecoder
+    else:
+        raise ValueError(f"class_name {cls_name} is invalid.")
+    return cls
+class VisionUnderstandEncoderConfig(PretrainedConfig):
+    model_type = "vision_und_enc"
+    cls: str = ""
+    params: AttrDict = {}
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.cls = kwargs.get("cls", "")
+        if not isinstance(self.cls, str):
+            self.cls = self.cls.__name__
+        self.params = AttrDict(kwargs.get("params", {}))
+class VisionGenerationEncoderConfig(PretrainedConfig):
+    model_type = "vision_gen_enc"
+    cls: str = ""
+    params: AttrDict = {}
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.cls = kwargs.get("cls", "")
+        if not isinstance(self.cls, str):
+            self.cls = self.cls.__name__
+        self.params = AttrDict(kwargs.get("params", {}))
+class VisionGenerationDecoderConfig(PretrainedConfig):
+    model_type = "vision_gen_dec"
+    cls: str = ""
+    params: AttrDict = {}
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.cls = kwargs.get("cls", "")
+        if not isinstance(self.cls, str):
+            self.cls = self.cls.__name__
+        self.params = AttrDict(kwargs.get("params", {}))
+class MultiModalityConfig(PretrainedConfig):
+    model_type = "multi_modality"
+    vision_und_enc_config: VisionUnderstandEncoderConfig
+    language_config: LlamaConfig
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        vision_und_enc_config = kwargs.get("vision_und_enc_config", {})
+        self.vision_und_enc_config = VisionUnderstandEncoderConfig(
+            **vision_und_enc_config
+        )
+        vision_gen_enc_config = kwargs.get("vision_gen_enc_config", {})
+        self.vision_gen_enc_config = VisionGenerationEncoderConfig(
+            **vision_gen_enc_config
+        )
+        vision_gen_dec_config = kwargs.get("vision_gen_dec_config", {})
+        self.vision_gen_dec_config = VisionGenerationDecoderConfig(
+            **vision_gen_dec_config
+        )
+        language_config = kwargs.get("language_config", {})
+        if isinstance(language_config, LlamaConfig):
+            self.language_config = language_config
+        else:
+            self.language_config = LlamaConfig(**language_config)
+class MultiModalityPreTrainedModel(PreTrainedModel):
+    """Base class that ties ``MultiModalityConfig`` to the multimodal model classes."""
+    # Configuration class associated with this model family.
+    config_class = MultiModalityConfig
+    base_model_prefix = "multi_modality"
+    # NOTE(review): the two attributes below are presumably read by the transformers
+    # loading / device-placement machinery — confirm against PreTrainedModel.
+    _no_split_modules = []
+    _skip_keys_device_placement = "past_key_values"
+class MultiModalityCausalLM(MultiModalityPreTrainedModel):
+    def __init__(self, config: MultiModalityConfig):
+        super().__init__(config)
+        # vision understanding encoder
+        vision_und_enc_config = config.vision_und_enc_config
+        vision_und_enc_cls = model_name_to_cls(vision_und_enc_config.cls)
+        self.vision_und_enc_model = vision_und_enc_cls(**vision_und_enc_config.params)
+        # vision understanding aligner
+        self.vision_und_enc_aligner = nn.Linear(1024, 2048, bias=True)
+        # begin of understanding embedding
+        self.beg_of_und_embed = nn.Parameter(torch.zeros(1, 2048))
+        # vision generation encoder
+        vision_gen_enc_config = config.vision_gen_enc_config
+        vision_gen_enc_cls = model_name_to_cls(vision_gen_enc_config.cls)
+        self.vision_gen_enc_model = vision_gen_enc_cls(**vision_gen_enc_config.params)
+        # vision generation encoder aligner
+        self.vision_gen_enc_aligner = nn.Linear(768, 2048, bias=True)
+        # vision generation decoder
+        vision_gen_dec_config = config.vision_gen_dec_config
+        vision_gen_dec_cls = model_name_to_cls(vision_gen_dec_config.cls)
+        self.vision_gen_dec_model = vision_gen_dec_cls(**vision_gen_dec_config.params)
+        # language model
+        language_config = config.language_config
+        self.language_model = LlamaForCausalLM(language_config)
+        # vision generation decoder aligner
+        self.vision_gen_dec_aligner_norm = LlamaRMSNorm(
+            2048, eps=language_config.rms_norm_eps
+        )
+        self.vision_gen_dec_aligner = nn.Linear(2048, 768, bias=True)
+    def prepare_inputs_embeds(
+        self,
+        input_ids: torch.LongTensor,
+        pixel_values: torch.FloatTensor,
+        images_seq_mask: torch.LongTensor,
+        images_emb_mask: torch.LongTensor,
+        **kwargs,
+    ):
+        """
+        Args:
+            input_ids (torch.LongTensor): [b, T]
+            pixel_values (torch.FloatTensor):   [b, n_images, 3, h, w]
+            images_seq_mask (torch.BoolTensor): [b, T]
+            images_emb_mask (torch.BoolTensor): [b, n_images, n_image_tokens]
+            assert torch.sum(images_seq_mask) == torch.sum(images_emb_mask)
+        Returns:
+            input_embeds (torch.Tensor): [b, T, D]
+        """
+        bs, n = pixel_values.shape[0:2]
+        images = rearrange(pixel_values, "b n c h w -> (b n) c h w")
+        # [b x n, T2, D]
+        images_embeds = self.vision_und_enc_model(images)
+        images_embeds = self.vision_und_enc_aligner(images_embeds)
+        # print(images_embeds.shape, self.beg_of_und_embed.shape, images_seq_mask.shape, input_ids.shape)
+        beg_of_und_embed = self.beg_of_und_embed[0].detach().clone()
+        images_embeds = torch.cat(
+            [
+                beg_of_und_embed.view(1, 1, -1).repeat(images_embeds.shape[0], 1, 1),
+                images_embeds,
+            ],
+            dim=1,
+        )
+        # [b x n, T2, D] -> [b, n x T2, D]
+        images_embeds = rearrange(images_embeds, "(b n) t d -> b (n t) d", b=bs, n=n)
+        # [b, n, T2] -> [b, n x T2]
+        images_emb_mask = rearrange(images_emb_mask, "b n t -> b (n t)")
+        # [b, T, D]
+        input_ids[input_ids < 0] = 0  # ignore the image embeddings
+        inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
+        # replace with the image embeddings
+        inputs_embeds[images_seq_mask] = images_embeds[images_emb_mask]
+        return inputs_embeds
+AutoConfig.register("vision_und_enc", VisionUnderstandEncoderConfig)
+AutoConfig.register("vision_gen_enc", VisionGenerationEncoderConfig)
+AutoConfig.register("vision_gen_dec", VisionGenerationDecoderConfig)
+AutoConfig.register("multi_modality", MultiModalityConfig)
+AutoModelForCausalLM.register(MultiModalityConfig, MultiModalityCausalLM)

janus/janusflow/models/processing_vlm.py ADDED Viewed

	@@ -0,0 +1,455 @@

+# Copyright (c) 2023-2024 DeepSeek.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+from dataclasses import dataclass
+from typing import Dict, List
+import torch
+from PIL.Image import Image
+from transformers import LlamaTokenizerFast
+from transformers.processing_utils import ProcessorMixin
+from janus.janusflow.models.image_processing_vlm import VLMImageProcessor
+from janus.utils.conversation import get_conv_template
+class DictOutput(object):
+    def keys(self):
+        return self.__dict__.keys()
+    def __getitem__(self, item):
+        return self.__dict__[item]
+    def __setitem__(self, key, value):
+        self.__dict__[key] = value
+@dataclass
+class VLChatProcessorOutput(DictOutput):
+    """Processor output for a single sample, as built by ``VLChatProcessor.process_one``."""
+    # Prompt text that was tokenized.
+    sft_format: str
+    # Token ids after the image tokens were inserted.
+    input_ids: torch.Tensor
+    # ``pixel_values`` returned by the image processor.
+    pixel_values: torch.Tensor
+    # One entry per image placeholder found in the prompt.
+    num_und_image_tokens: torch.IntTensor
+    def __len__(self):
+        # The length of a sample is the length of its token sequence.
+        return len(self.input_ids)
+@dataclass
+class BatchedVLChatProcessorOutput(DictOutput):
+    sft_format: List[str]
+    input_ids: torch.Tensor
+    pixel_values: torch.Tensor
+    attention_mask: torch.Tensor
+    images_seq_mask: torch.BoolTensor
+    images_emb_mask: torch.BoolTensor
+    def to(self, device, dtype=torch.bfloat16):
+        self.input_ids = self.input_ids.to(device)
+        self.attention_mask = self.attention_mask.to(device)
+        self.images_seq_mask = self.images_seq_mask.to(device)
+        self.images_emb_mask = self.images_emb_mask.to(device)
+        self.pixel_values = self.pixel_values.to(device=device, dtype=dtype)
+        return self
+class VLChatProcessor(ProcessorMixin):
+    # NOTE(review): the three attributes below are presumably consumed by
+    # ProcessorMixin to resolve and load the sub-components — confirm.
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
+    attributes = ["image_processor", "tokenizer"]
+    # Default system message used by new_chat_template and process_one.
+    system_prompt = (
+        "You are a helpful language and vision assistant. "
+        "You are able to understand the visual content that the user provides, "
+        "and assist the user with a variety of tasks using natural language."
+    )
+    def __init__(
+        self,
+        image_processor: VLMImageProcessor,
+        tokenizer: LlamaTokenizerFast,
+        image_tag: str = "<image_placeholder>",
+        image_start_tag: str = "<begin_of_image>",
+        image_end_tag: str = "<end_of_image>",
+        image_gen_tag: str = "<｜begin▁of▁generation｜>",
+        num_image_tokens: int = 576,
+        add_special_token: bool = False,
+        sft_format: str = "deepseek",
+        mask_prompt: bool = True,
+        ignore_id: int = -100,
+        **kwargs,
+    ):
+        self.image_processor = image_processor
+        self.tokenizer = tokenizer
+        image_id = self.tokenizer.vocab.get(image_tag)
+        if image_id is None:
+            special_tokens = [image_tag]
+            special_tokens_dict = {"additional_special_tokens": special_tokens}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+            print(f"Add image tag = {image_tag} to the tokenizer")
+        image_gen_id = self.tokenizer.vocab.get(image_gen_tag)
+        if image_gen_id is None:
+            special_tokens = [image_gen_tag]
+            special_tokens_dict = {"additional_special_tokens": special_tokens}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+            print(f"Add generation tag = {image_gen_tag} to the tokenizer")
+        assert image_start_tag is not None and image_end_tag is not None
+        boi_id = self.tokenizer.vocab.get(image_start_tag)
+        eoi_id = self.tokenizer.vocab.get(image_end_tag)
+        if boi_id is None:
+            special_tokens = [image_start_tag]
+            special_tokens_dict = {"additional_special_tokens": special_tokens}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+            print(f"Add boi tag = {image_start_tag} to the tokenizer")
+        if eoi_id is None:
+            special_tokens = [image_end_tag]
+            special_tokens_dict = {"additional_special_tokens": special_tokens}
+            self.tokenizer.add_special_tokens(special_tokens_dict)
+            print(f"Add eoi tag = {image_end_tag} to the tokenizer")
+        self.image_tag = image_tag
+        self.image_gen_tag = image_gen_tag
+        self.image_start_tag = image_start_tag
+        self.image_end_tag = image_end_tag
+        self.num_image_tokens = num_image_tokens
+        self.add_special_token = add_special_token
+        self.sft_format = sft_format
+        self.mask_prompt = mask_prompt
+        self.ignore_id = ignore_id
+        self.tokenizer.pad_token_id = self.tokenizer.vocab.get("<｜▁pad▁｜>")
+        super().__init__(
+            image_processor,
+            tokenizer,
+            image_tag,
+            num_image_tokens,
+            add_special_token,
+            sft_format,
+            mask_prompt,
+            ignore_id,
+            **kwargs,
+        )
+    def new_chat_template(self):
+        conv = get_conv_template(self.sft_format)
+        conv.set_system_message(self.system_prompt)
+        return conv
+    def apply_sft_template_for_multi_turn_prompts(
+        self,
+        conversations: List[Dict[str, str]],
+        sft_format: str = "deepseek",
+        system_prompt: str = "",
+    ):
+        """
+        Applies the SFT template to conversation.
+        An example of conversation:
+        conversation = [
+            {
+                "role": "User",
+                "content": "<image_placeholder> is Figure 1.\n<image_placeholder> is Figure 2.\nWhich image is brighter?",
+                "images": [
+                    "./multi-images/attribute_comparison_1.png",
+                    "./multi-images/attribute_comparison_2.png"
+                ]
+            },
+            {
+                "role": "Assistant",
+                "content": ""
+            }
+        ]
+        Args:
+            conversations (List[Dict]): A conversation with a List of Dict[str, str] text.
+            sft_format (str, optional): The format of the SFT template to use. Defaults to "deepseek".
+            system_prompt (str, optional): The system prompt to use in the SFT template. Defaults to "".
+        Returns:
+            sft_prompt (str): The formatted text.
+        """
+        conv = get_conv_template(sft_format)
+        conv.set_system_message(system_prompt)
+        for message in conversations:
+            conv.append_message(message["role"], message["content"].strip())
+        sft_prompt = conv.get_prompt().strip()
+        return sft_prompt
+    @property
+    def image_token(self):
+        """The image placeholder tag string (``self.image_tag``)."""
+        return self.image_tag
+    @property
+    def image_id(self):
+        image_id = self.tokenizer.vocab.get(self.image_tag)
+        return image_id
+    @property
+    def image_start_id(self):
+        image_start_id = self.tokenizer.vocab.get(self.image_start_tag)
+        return image_start_id
+    @property
+    def image_end_id(self):
+        image_end_id = self.tokenizer.vocab.get(self.image_end_tag)
+        return image_end_id
+    @property
+    def image_start_token(self):
+        """The image start tag string (``self.image_start_tag``)."""
+        return self.image_start_tag
+    @property
+    def image_end_token(self):
+        """The image end tag string (``self.image_end_tag``)."""
+        return self.image_end_tag
+    @property
+    def pad_id(self):
+        pad_id = self.tokenizer.pad_token_id
+        if pad_id is None:
+            pad_id = self.tokenizer.eos_token_id
+        return pad_id
+    @property
+    def image_gen_id(self):
+        image_gen_id = self.tokenizer.vocab.get(self.image_gen_tag)
+        return image_gen_id
+    def add_image_token(
+        self,
+        image_indices: List[int],
+        input_ids: torch.LongTensor,
+    ):
+        """
+        Args:
+            image_indices (List[int]): [index_0, index_1, ..., index_j]
+            input_ids (torch.LongTensor): [N]
+        Returns:
+            input_ids (torch.LongTensor): [N + image tokens]
+            num_image_tokens (torch.IntTensor): [n_images]
+        """
+        input_slices = []
+        start = 0
+        for index in image_indices:
+            if self.add_special_token:
+                end = index + 1
+            else:
+                end = index
+            # original text tokens
+            input_slices.append(input_ids[start:end])
+            # add boi, image tokens, eoi and set the mask as False
+            input_slices.append(self.image_start_id * torch.ones((1), dtype=torch.long))
+            input_slices.append(
+                self.image_id * torch.ones((self.num_image_tokens,), dtype=torch.long)
+            )
+            input_slices.append(self.image_end_id * torch.ones((1), dtype=torch.long))
+            start = index + 1
+        # the left part
+        input_slices.append(input_ids[start:])
+        # concat all slices
+        input_ids = torch.cat(input_slices, dim=0)
+        num_image_tokens = torch.IntTensor(
+            [self.num_image_tokens + 1] * len(image_indices)
+        )
+        # we add 1 to fit generation
+        return input_ids, num_image_tokens
+    def process_one(
+        self,
+        prompt: str = None,
+        conversations: List[Dict[str, str]] = None,
+        images: List[Image] = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            prompt (str): the formatted prompt;
+            conversations (List[Dict]): conversations with a list of messages;
+            images (List[ImageType]): the list of images;
+            **kwargs:
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - target_ids (torch.LongTensor): [N + image tokens]
+                - images (torch.FloatTensor): [n_images, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+        assert (
+            prompt is None or conversations is None
+        ), "prompt and conversations cannot be used at the same time."
+        if prompt is None:
+            # apply sft format
+            sft_format = self.apply_sft_template_for_multi_turn_prompts(
+                conversations=conversations,
+                sft_format=self.sft_format,
+                system_prompt=self.system_prompt,
+            )
+        else:
+            sft_format = prompt
+        # tokenize
+        input_ids = self.tokenizer.encode(sft_format)
+        input_ids = torch.LongTensor(input_ids)
+        # add image tokens to the input_ids
+        image_token_mask: torch.BoolTensor = input_ids == self.image_id
+        image_indices = image_token_mask.nonzero()
+        input_ids, num_und_image_tokens = self.add_image_token(
+            image_indices=image_indices,
+            input_ids=input_ids,
+        )
+        # load images
+        images_outputs = self.image_processor(images, return_tensors="pt")
+        prepare = VLChatProcessorOutput(
+            sft_format=sft_format,
+            input_ids=input_ids,
+            pixel_values=images_outputs.pixel_values,
+            num_und_image_tokens=num_und_image_tokens,
+        )
+        return prepare
+    def __call__(
+        self,
+        *,
+        prompt: str = None,
+        conversations: List[Dict[str, str]] = None,
+        images: List[Image] = None,
+        force_batchify: bool = True,
+        **kwargs,
+    ):
+        """
+        Args:
+            prompt (str): the formatted prompt;
+            conversations (List[Dict]): conversations with a list of messages;
+            images (List[ImageType]): the list of images;
+            force_batchify (bool): force batchify the inputs;
+            **kwargs:
+        Returns:
+            outputs (BaseProcessorOutput): the output of the processor,
+                - input_ids (torch.LongTensor): [N + image tokens]
+                - images (torch.FloatTensor): [n_images, 3, H, W]
+                - image_id (int): the id of the image token
+                - num_image_tokens (List[int]): the number of image tokens
+        """
+        prepare = self.process_one(
+            prompt=prompt, conversations=conversations, images=images
+        )
+        if force_batchify:
+            prepare = self.batchify([prepare])
+        return prepare
+    def batchify(
+        self, prepare_list: List[VLChatProcessorOutput]
+    ) -> BatchedVLChatProcessorOutput:
+        """
+        Preprocesses the inputs for multimodal inference.
+        Args:
+            prepare_list (List[VLChatProcessorOutput]): A list of VLChatProcessorOutput.
+        Returns:
+            BatchedVLChatProcessorOutput: A dictionary of the inputs to use for multimodal inference.
+        """
+        batch_size = len(prepare_list)
+        sft_format = []
+        n_images = []
+        seq_lens = []
+        for prepare in prepare_list:
+            # we only fill the images for understanding tasks into the mask
+            n_images.append(len(prepare.num_und_image_tokens))
+            seq_lens.append(len(prepare))
+        input_token_max_len = max(seq_lens)
+        max_n_images = max(1, max(n_images))
+        batched_input_ids = torch.full(
+            (batch_size, input_token_max_len), self.pad_id
+        ).long()  # FIXME
+        batched_attention_mask = torch.zeros((batch_size, input_token_max_len)).long()
+        batched_pixel_values = torch.zeros(
+            (batch_size, max_n_images, *self.image_processor.default_shape)
+        ).float()
+        batched_images_seq_mask = torch.zeros((batch_size, input_token_max_len)).bool()
+        batched_images_emb_mask = torch.zeros(
+            (
+                batch_size,
+                max_n_images,
+                self.num_image_tokens + 1,
+            )  # add 1 to account for <image_beg>
+        ).bool()
+        for i, prepare in enumerate(prepare_list):
+            input_ids = prepare.input_ids
+            seq_len = len(prepare)
+            n_image = len(prepare.num_und_image_tokens)
+            # left-padding
+            batched_attention_mask[i, -seq_len:] = 1
+            batched_input_ids[i, -seq_len:] = torch.LongTensor(input_ids)
+            batched_images_seq_mask[i, -seq_len:] = (input_ids == self.image_id) | (
+                input_ids == self.image_start_id
+            )
+            if n_image > 0:
+                batched_pixel_values[i, :n_image] = prepare.pixel_values
+                for j, n_image_tokens in enumerate(prepare.num_und_image_tokens):
+                    batched_images_emb_mask[i, j, :n_image_tokens] = True
+            sft_format.append(prepare.sft_format)
+        batched_prepares = BatchedVLChatProcessorOutput(
+            input_ids=batched_input_ids,
+            attention_mask=batched_attention_mask,
+            pixel_values=batched_pixel_values,
+            images_seq_mask=batched_images_seq_mask,
+            images_emb_mask=batched_images_emb_mask,
+            sft_format=sft_format,
+        )
+        return batched_prepares

janus/janusflow/models/siglip_vit.py ADDED Viewed

	@@ -0,0 +1,691 @@

+# Copyright (c) 2023-2024 DeepSeek.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+# https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py
+import math
+import warnings
+from dataclasses import dataclass
+from functools import partial
+from typing import (
+    Callable,
+    Dict,
+    Final,
+    List,
+    Literal,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    Type,
+    Union,
+)
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from timm.layers import (
+    AttentionPoolLatent,
+    DropPath,
+    LayerType,
+    Mlp,
+    PatchDropout,
+    PatchEmbed,
+    resample_abs_pos_embed,
+)
+from timm.models._manipulate import checkpoint_seq, named_apply
+def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+    # Cut & paste from PyTorch official master until it's in a few official releases - RW
+    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+    def norm_cdf(x):
+        # Computes standard normal cumulative distribution function
+        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
+    if (mean < a - 2 * std) or (mean > b + 2 * std):
+        warnings.warn(
+            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+            "The distribution of values may be incorrect.",
+            stacklevel=2,
+        )
+    with torch.no_grad():
+        # Values are generated by using a truncated uniform distribution and
+        # then using the inverse CDF for the normal distribution.
+        # Get upper and lower cdf values
+        l = norm_cdf((a - mean) / std)  # noqa: E741
+        u = norm_cdf((b - mean) / std)
+        # Uniformly fill tensor with values from [l, u], then translate to
+        # [2l-1, 2u-1].
+        tensor.uniform_(2 * l - 1, 2 * u - 1)
+        # Use inverse cdf transform for normal distribution to get truncated
+        # standard normal
+        tensor.erfinv_()
+        # Transform to proper mean, std
+        tensor.mul_(std * math.sqrt(2.0))
+        tensor.add_(mean)
+        # Clamp to ensure it's in the proper range
+        tensor.clamp_(min=a, max=b)
+        return tensor
+def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0):
+    # type: (torch.Tensor, float, float, float, float) -> torch.Tensor
+    r"""The original timm.models.layers.weight_init.trunc_normal_ can not handle bfloat16 yet, here we first
+    convert the tensor to float32, apply the trunc_normal_() in float32, and then convert it back to its original dtype.
+    Fills the input Tensor with values drawn from a truncated normal distribution. The values are effectively drawn
+    from the normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
+    with values outside :math:`[a, b]` redrawn until they are within
+    the bounds. The method used for generating the random values works
+    best when :math:`a \leq \text{mean} \leq b`.
+    Args:
+        tensor: an n-dimensional `torch.Tensor`
+        mean: the mean of the normal distribution
+        std: the standard deviation of the normal distribution
+        a: the minimum cutoff value
+        b: the maximum cutoff value
+    Examples:
+        >>> w = torch.empty(3, 5)
+        >>> nn.init.trunc_normal_(w)
+    """
+    with torch.no_grad():
+        dtype = tensor.dtype
+        tensor_fp32 = tensor.float()
+        tensor_fp32 = _no_grad_trunc_normal_(tensor_fp32, mean, std, a, b)
+        tensor_dtype = tensor_fp32.to(dtype=dtype)
+        tensor.copy_(tensor_dtype)
+def init_weights(self):
+    if self.pos_embed is not None:
+        trunc_normal_(self.pos_embed, std=self.pos_embed.shape[1] ** -0.5)
+    trunc_normal_(self.latent, std=self.latent_dim**-0.5)
+def init_weights_vit_timm(module: nn.Module, name: str = "") -> None:
+    """ViT weight initialization, original timm impl (for reproducibility)"""
+    if isinstance(module, nn.Linear):
+        trunc_normal_(module.weight, std=0.02)
+        if module.bias is not None:
+            nn.init.zeros_(module.bias)
+    elif hasattr(module, "init_weights"):
+        module.init_weights()
+class Attention(nn.Module):
+    fused_attn: Final[bool]
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int = 8,
+        qkv_bias: bool = False,
+        qk_norm: bool = False,
+        attn_drop: float = 0.0,
+        proj_drop: float = 0.0,
+        norm_layer: nn.Module = nn.LayerNorm,
+    ) -> None:
+        super().__init__()
+        assert dim % num_heads == 0, "dim should be divisible by num_heads"
+        self.num_heads = num_heads
+        self.head_dim = dim // num_heads
+        self.scale = self.head_dim**-0.5
+        # self.fused_attn = use_fused_attn()
+        self.fused_attn = True
+        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
+        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop) if proj_drop > 0.0 else nn.Identity()
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        B, N, C = x.shape
+        qkv = (
+            self.qkv(x)
+            .reshape(B, N, 3, self.num_heads, self.head_dim)
+            .permute(2, 0, 3, 1, 4)
+        )
+        q, k, v = qkv.unbind(0)
+        q, k = self.q_norm(q), self.k_norm(k)
+        if self.fused_attn:
+            x = F.scaled_dot_product_attention(
+                q,
+                k,
+                v,
+                dropout_p=self.attn_drop.p if self.training else 0.0,
+            )
+        else:
+            q = q * self.scale
+            attn = q @ k.transpose(-2, -1)
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = attn @ v
+        x = x.transpose(1, 2).reshape(B, N, C)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class LayerScale(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        init_values: float = 1e-5,
+        inplace: bool = False,
+    ) -> None:
+        super().__init__()
+        self.inplace = inplace
+        self.gamma = nn.Parameter(init_values * torch.ones(dim))
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        return x.mul_(self.gamma) if self.inplace else x * self.gamma
+class Block(nn.Module):
+    def __init__(
+        self,
+        dim: int,
+        num_heads: int,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = False,
+        qk_norm: bool = False,
+        proj_drop: float = 0.0,
+        attn_drop: float = 0.0,
+        init_values: Optional[float] = None,
+        drop_path: float = 0.0,
+        act_layer: nn.Module = nn.GELU,
+        norm_layer: nn.Module = nn.LayerNorm,
+        mlp_layer: nn.Module = Mlp,
+    ) -> None:
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_norm=qk_norm,
+            attn_drop=attn_drop,
+            proj_drop=proj_drop,
+            norm_layer=norm_layer,
+        )
+        self.ls1 = (
+            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        )
+        self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.norm2 = norm_layer(dim)
+        self.mlp = mlp_layer(
+            in_features=dim,
+            hidden_features=int(dim * mlp_ratio),
+            act_layer=act_layer,
+            drop=proj_drop,
+        )
+        self.ls2 = (
+            LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
+        )
+        self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = x + self.drop_path1(self.ls1(self.attn(self.norm1(x))))
+        x = x + self.drop_path2(self.ls2(self.mlp(self.norm2(x))))
+        return x
+class VisionTransformer(nn.Module):
+    """Vision Transformer
+    A PyTorch impl of : `An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale`
+        - https://arxiv.org/abs/2010.11929
+    """
+    dynamic_img_size: Final[bool]
+    def __init__(
+        self,
+        img_size: Union[int, Tuple[int, int]] = 224,
+        patch_size: Union[int, Tuple[int, int]] = 16,
+        in_chans: int = 3,
+        num_classes: int = 1000,
+        global_pool: Literal["", "avg", "token", "map"] = "token",
+        embed_dim: int = 768,
+        depth: int = 12,
+        num_heads: int = 12,
+        mlp_ratio: float = 4.0,
+        qkv_bias: bool = True,
+        qk_norm: bool = False,
+        init_values: Optional[float] = None,
+        class_token: bool = True,
+        no_embed_class: bool = False,
+        reg_tokens: int = 0,
+        pre_norm: bool = False,
+        fc_norm: Optional[bool] = None,
+        dynamic_img_size: bool = False,
+        dynamic_img_pad: bool = False,
+        drop_rate: float = 0.0,
+        pos_drop_rate: float = 0.0,
+        patch_drop_rate: float = 0.0,
+        proj_drop_rate: float = 0.0,
+        attn_drop_rate: float = 0.0,
+        drop_path_rate: float = 0.0,
+        weight_init: Literal["skip", "jax", "jax_nlhb", "moco", ""] = "",
+        embed_layer: Callable = PatchEmbed,
+        norm_layer: Optional[LayerType] = None,
+        act_layer: Optional[LayerType] = None,
+        block_fn: Type[nn.Module] = Block,
+        mlp_layer: Type[nn.Module] = Mlp,
+        ignore_head: bool = False,
+    ) -> None:
+        """
+        Args:
+            img_size: Input image size.
+            patch_size: Patch size.
+            in_chans: Number of image input channels.
+            num_classes: Mumber of classes for classification head.
+            global_pool: Type of global pooling for final sequence (default: 'token').
+            embed_dim: Transformer embedding dimension.
+            depth: Depth of transformer.
+            num_heads: Number of attention heads.
+            mlp_ratio: Ratio of mlp hidden dim to embedding dim.
+            qkv_bias: Enable bias for qkv projections if True.
+            init_values: Layer-scale init values (layer-scale enabled if not None).
+            class_token: Use class token.
+            no_embed_class: Don't include position embeddings for class (or reg) tokens.
+            reg_tokens: Number of register tokens.
+            fc_norm: Pre head norm after pool (instead of before), if None, enabled when global_pool == 'avg'.
+            drop_rate: Head dropout rate.
+            pos_drop_rate: Position embedding dropout rate.
+            attn_drop_rate: Attention dropout rate.
+            drop_path_rate: Stochastic depth rate.
+            weight_init: Weight initialization scheme.
+            embed_layer: Patch embedding layer.
+            norm_layer: Normalization layer.
+            act_layer: MLP activation layer.
+            block_fn: Transformer block layer.
+        """
+        super().__init__()
+        assert global_pool in ("", "avg", "token", "map")
+        assert class_token or global_pool != "token"
+        use_fc_norm = global_pool == "avg" if fc_norm is None else fc_norm
+        # norm_layer = get_norm_layer(norm_layer) or partial(nn.LayerNorm, eps=1e-6)
+        # act_layer = get_act_layer(act_layer) or nn.GELU
+        norm_layer = partial(nn.LayerNorm, eps=1e-6)
+        act_layer = nn.GELU
+        self.num_classes = num_classes
+        self.global_pool = global_pool
+        self.num_features = self.embed_dim = (
+            embed_dim  # num_features for consistency with other models
+        )
+        self.num_prefix_tokens = 1 if class_token else 0
+        self.num_prefix_tokens += reg_tokens
+        self.num_reg_tokens = reg_tokens
+        self.has_class_token = class_token
+        self.no_embed_class = (
+            no_embed_class  # don't embed prefix positions (includes reg)
+        )
+        self.dynamic_img_size = dynamic_img_size
+        self.grad_checkpointing = False
+        self.ignore_head = ignore_head
+        embed_args = {}
+        if dynamic_img_size:
+            # flatten deferred until after pos embed
+            embed_args.update(dict(strict_img_size=False, output_fmt="NHWC"))
+        self.patch_embed = embed_layer(
+            img_size=img_size,
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            bias=not pre_norm,  # disable bias if pre-norm is used (e.g. CLIP)
+            dynamic_img_pad=dynamic_img_pad,
+            **embed_args,
+        )
+        num_patches = self.patch_embed.num_patches
+        self.cls_token = (
+            nn.Parameter(torch.zeros(1, 1, embed_dim)) if class_token else None
+        )
+        self.reg_token = (
+            nn.Parameter(torch.zeros(1, reg_tokens, embed_dim)) if reg_tokens else None
+        )
+        embed_len = (
+            num_patches if no_embed_class else num_patches + self.num_prefix_tokens
+        )
+        self.pos_embed = nn.Parameter(torch.randn(1, embed_len, embed_dim) * 0.02)
+        self.pos_drop = nn.Dropout(p=pos_drop_rate)
+        if patch_drop_rate > 0:
+            self.patch_drop = PatchDropout(
+                patch_drop_rate,
+                num_prefix_tokens=self.num_prefix_tokens,
+            )
+        else:
+            self.patch_drop = nn.Identity()
+        self.norm_pre = norm_layer(embed_dim) if pre_norm else nn.Identity()
+        dpr = [
+            x.item() for x in torch.linspace(0, drop_path_rate, depth)
+        ]  # stochastic depth decay rule
+        self.blocks = nn.Sequential(
+            *[
+                block_fn(
+                    dim=embed_dim,
+                    num_heads=num_heads,
+                    mlp_ratio=mlp_ratio,
+                    qkv_bias=qkv_bias,
+                    qk_norm=qk_norm,
+                    init_values=init_values,
+                    proj_drop=proj_drop_rate,
+                    attn_drop=attn_drop_rate,
+                    drop_path=dpr[i],
+                    norm_layer=norm_layer,
+                    act_layer=act_layer,
+                    mlp_layer=mlp_layer,
+                )
+                for i in range(depth)
+            ]
+        )
+        self.norm = norm_layer(embed_dim) if not use_fc_norm else nn.Identity()
+        # Classifier Head
+        if global_pool == "map":
+            AttentionPoolLatent.init_weights = init_weights
+            self.attn_pool = AttentionPoolLatent(
+                self.embed_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                norm_layer=norm_layer,
+            )
+        else:
+            self.attn_pool = None
+        self.fc_norm = norm_layer(embed_dim) if use_fc_norm else nn.Identity()
+        self.head_drop = nn.Dropout(drop_rate)
+        self.head = (
+            nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+        )
+        if weight_init != "skip":
+            self.init_weights(weight_init)
+    def init_weights(self, mode: Literal["jax", "jax_nlhb", "moco", ""] = "") -> None:
+        assert mode in ("jax", "jax_nlhb", "moco", "")
+        # head_bias = -math.log(self.num_classes) if "nlhb" in mode else 0.0
+        trunc_normal_(self.pos_embed, std=0.02)
+        if self.cls_token is not None:
+            nn.init.normal_(self.cls_token, std=1e-6)
+        named_apply(init_weights_vit_timm, self)
+    @torch.jit.ignore
+    def no_weight_decay(self) -> Set:
+        return {"pos_embed", "cls_token", "dist_token"}
+    @torch.jit.ignore
+    def group_matcher(self, coarse: bool = False) -> Dict:
+        return dict(
+            stem=r"^cls_token|pos_embed|patch_embed",  # stem and embed
+            blocks=[(r"^blocks\.(\d+)", None), (r"^norm", (99999,))],
+        )
+    @torch.jit.ignore
+    def set_grad_checkpointing(self, enable: bool = True) -> None:
+        self.grad_checkpointing = enable
+    @torch.jit.ignore
+    def get_classifier(self) -> nn.Module:
+        return self.head
+    def reset_classifier(self, num_classes: int, global_pool=None) -> None:
+        self.num_classes = num_classes
+        if global_pool is not None:
+            assert global_pool in ("", "avg", "token", "map")
+            if global_pool == "map" and self.attn_pool is None:
+                assert (
+                    False
+                ), "Cannot currently add attention pooling in reset_classifier()."
+            elif global_pool != "map " and self.attn_pool is not None:
+                self.attn_pool = None  # remove attention pooling
+            self.global_pool = global_pool
+        self.head = (
+            nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+        )
+    def _pos_embed(self, x: torch.Tensor) -> torch.Tensor:
+        if self.dynamic_img_size:
+            B, H, W, C = x.shape
+            pos_embed = resample_abs_pos_embed(
+                self.pos_embed,
+                (H, W),
+                num_prefix_tokens=0 if self.no_embed_class else self.num_prefix_tokens,
+            )
+            x = x.view(B, -1, C)
+        else:
+            pos_embed = self.pos_embed
+        to_cat = []
+        if self.cls_token is not None:
+            to_cat.append(self.cls_token.expand(x.shape[0], -1, -1))
+        if self.reg_token is not None:
+            to_cat.append(self.reg_token.expand(x.shape[0], -1, -1))
+        if self.no_embed_class:
+            # deit-3, updated JAX (big vision)
+            # position embedding does not overlap with class token, add then concat
+            x = x + pos_embed
+            if to_cat:
+                x = torch.cat(to_cat + [x], dim=1)
+        else:
+            # original timm, JAX, and deit vit impl
+            # pos_embed has entry for class token, concat then add
+            if to_cat:
+                x = torch.cat(to_cat + [x], dim=1)
+            x = x + pos_embed
+        return self.pos_drop(x)
+    def _intermediate_layers(
+        self,
+        x: torch.Tensor,
+        n: Union[int, Sequence] = 1,
+    ) -> List[torch.Tensor]:
+        outputs, num_blocks = [], len(self.blocks)
+        take_indices = set(
+            range(num_blocks - n, num_blocks) if isinstance(n, int) else n
+        )
+        # forward pass
+        x = self.patch_embed(x)
+        x = self._pos_embed(x)
+        x = self.patch_drop(x)
+        x = self.norm_pre(x)
+        for i, blk in enumerate(self.blocks):
+            x = blk(x)
+            if i in take_indices:
+                outputs.append(x)
+        return outputs
+    def get_intermediate_layers(
+        self,
+        x: torch.Tensor,
+        n: Union[int, Sequence] = 1,
+        reshape: bool = False,
+        return_prefix_tokens: bool = False,
+        norm: bool = False,
+    ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
+        """Intermediate layer accessor (NOTE: This is a WIP experiment).
+        Inspired by DINO / DINOv2 interface
+        """
+        # take last n blocks if n is an int, if in is a sequence, select by matching indices
+        outputs = self._intermediate_layers(x, n)
+        if norm:
+            outputs = [self.norm(out) for out in outputs]
+        prefix_tokens = [out[:, 0 : self.num_prefix_tokens] for out in outputs]
+        outputs = [out[:, self.num_prefix_tokens :] for out in outputs]
+        if reshape:
+            grid_size = self.patch_embed.grid_size
+            outputs = [
+                out.reshape(x.shape[0], grid_size[0], grid_size[1], -1)
+                .permute(0, 3, 1, 2)
+                .contiguous()
+                for out in outputs
+            ]
+        if return_prefix_tokens:
+            return tuple(zip(outputs, prefix_tokens))
+        return tuple(outputs)
+    def forward_features(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.patch_embed(x)
+        x = self._pos_embed(x)
+        x = self.patch_drop(x)
+        x = self.norm_pre(x)
+        if self.grad_checkpointing and not torch.jit.is_scripting():
+            x = checkpoint_seq(self.blocks, x)
+        else:
+            x = self.blocks(x)
+        x = self.norm(x)
+        return x
+    def forward_head(self, x: torch.Tensor, pre_logits: bool = False) -> torch.Tensor:
+        if self.attn_pool is not None:
+            x = self.attn_pool(x)
+        elif self.global_pool == "avg":
+            x = x[:, self.num_prefix_tokens :].mean(dim=1)
+        elif self.global_pool:
+            x = x[:, 0]  # class token
+        x = self.fc_norm(x)
+        x = self.head_drop(x)
+        return x if pre_logits else self.head(x)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        x = self.forward_features(x)
+        if not self.ignore_head:
+            x = self.forward_head(x)
+        return x
+@dataclass
+class SigLIPVisionCfg:
+    # Architecture settings for one SigLIP vision tower; create_siglip_vit builds an
+    # instance from a SigLIP_MODEL_CONFIG entry.
+    width: int = 1152  # passed to VisionTransformer as embed_dim
+    layers: Union[Tuple[int, int, int, int], int] = 27  # upper bound on the transformer depth
+    heads: int = 16  # passed as num_heads
+    patch_size: int = 14
+    # NOTE(review): create_siglip_vit passes its own image_size argument to the model
+    # and does not read this field - confirm whether other callers use it.
+    image_size: Union[Tuple[int, int], int] = 336
+    global_pool: str = "map"
+    mlp_ratio: float = 3.7362
+    class_token: bool = False
+    # NOTE(review): num_classes and use_checkpoint are not read by create_siglip_vit.
+    num_classes: int = 0
+    use_checkpoint: bool = False
+# Named SigLIP vision-tower presets; each entry is expanded into SigLIPVisionCfg(**entry)
+# by create_siglip_vit, so the keys must match that dataclass's field names.
+SigLIP_MODEL_CONFIG = {
+    # NOTE(review): the preset name ends in 384 but image_size is 336 - confirm which
+    # value is intended.
+    "siglip_so400m_patch14_384": {
+        "image_size": 336,
+        "patch_size": 14,
+        "width": 1152,
+        "layers": 27,
+        "heads": 16,
+        "mlp_ratio": 3.7362,
+        "global_pool": "map",
+        "use_checkpoint": False,
+    },
+    "siglip_so400m_patch14_224": {
+        "image_size": 224,
+        "patch_size": 14,
+        "width": 1152,
+        "layers": 27,
+        "heads": 16,
+        "mlp_ratio": 3.7362,
+        "global_pool": "map",
+        "use_checkpoint": False,
+    },
+    "siglip_large_patch16_384": {
+        "image_size": 384,
+        "patch_size": 16,
+        "width": 1024,
+        "layers": 24,
+        "heads": 16,
+        "mlp_ratio": 4,
+        "global_pool": "map",
+        "use_checkpoint": False,
+    },
+    "siglip_large_patch16_256": {
+        "image_size": 256,
+        "patch_size": 16,
+        "width": 1024,
+        "layers": 24,
+        "heads": 16,
+        "mlp_ratio": 4,
+        "global_pool": "map",
+        "use_checkpoint": False,
+    },
+}
+def create_siglip_vit(
+    model_name: str = "siglip_so400m_patch14_384",
+    image_size: int = 384,
+    select_layer: int = -1,
+    ckpt_path: str = "",
+    **kwargs,
+):
+    assert (
+        model_name in SigLIP_MODEL_CONFIG.keys()
+    ), f"model name should be in {SigLIP_MODEL_CONFIG.keys()}"
+    vision_cfg = SigLIPVisionCfg(**SigLIP_MODEL_CONFIG[model_name])
+    if select_layer <= 0:
+        layers = min(vision_cfg.layers, vision_cfg.layers + select_layer + 1)
+    else:
+        layers = min(vision_cfg.layers, select_layer)
+    model = VisionTransformer(
+        img_size=image_size,
+        patch_size=vision_cfg.patch_size,
+        embed_dim=vision_cfg.width,
+        depth=layers,
+        num_heads=vision_cfg.heads,
+        mlp_ratio=vision_cfg.mlp_ratio,
+        class_token=vision_cfg.class_token,
+        global_pool=vision_cfg.global_pool,
+        ignore_head=kwargs.get("ignore_head", True),
+        weight_init=kwargs.get("weight_init", "skip"),
+        num_classes=0,
+    )
+    if ckpt_path:
+        state_dict = torch.load(ckpt_path, map_location="cpu")
+        incompatible_keys = model.load_state_dict(state_dict, strict=False)
+        print(
+            f"SigLIP-ViT restores from {ckpt_path},\n"
+            f"\tincompatible_keys:', {incompatible_keys}."
+        )
+    return model

janus/janusflow/models/uvit.py ADDED Viewed

	@@ -0,0 +1,714 @@

+# Copyright (c) 2023-2024 DeepSeek.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+# modified from: https://github.com/lucidrains/denoising-diffusion-pytorch/blob/main/denoising_diffusion_pytorch/simple_diffusion.py
+import math
+import torch
+import torch.nn as nn
+import torch.distributed as dist
+import torch.nn.functional as F
+from typing import Optional, Tuple, Union
+import numpy as np
+import torchvision
+import torchvision.utils
+from diffusers.models.embeddings import Timesteps, TimestepEmbedding
+from transformers.models.llama.modeling_llama import LlamaRMSNorm as RMSNorm
+class ImageHead(nn.Module):
+    def __init__(self, decoder_cfg, gpt_cfg, layer_id=None):
+        super().__init__()
+        self.layer_id = layer_id
+        cfg = (
+            AttrDict(
+                norm_type="layernorm",
+                is_exp_norm=False,
+                sequence_parallel=False,
+                use_userbuffer=False,
+                norm_eps=1e-5,
+                norm_bias=True,
+                gradient_accumulation_fusion=True,
+                use_fp32_head_weight=False,
+            )
+            + gpt_cfg
+        )
+        group = PG.tensor_parallel_group()
+        assert cfg.norm_type in [
+            "layernorm",
+            "rmsnorm",
+        ], f"Norm type:{cfg.norm_type} not supported"
+        if cfg.norm_type == "rmsnorm":
+            self.norm = DropoutAddRMSNorm(
+                cfg.n_embed,
+                prenorm=False,
+                eps=cfg.norm_eps,
+                is_exp_norm=cfg.is_exp_norm,
+                sequence_parallel=cfg.sequence_parallel,
+            )
+        else:
+            self.norm = DropoutAddLayerNorm(
+                cfg.n_embed,
+                prenorm=False,
+                eps=cfg.norm_eps,
+                is_exp_norm=cfg.is_exp_norm,
+                sequence_parallel=cfg.sequence_parallel,
+                bias=cfg.norm_bias,
+            )
+        multiple_of = 256
+        if decoder_cfg.in_channels % multiple_of != 0:
+            warnings.warn(
+                f"建议把 vocab_size 设置为 {multiple_of} 的倍数, 否则会影响矩阵乘法的性能"
+            )
+        dtype = default_dtype = torch.get_default_dtype()
+        if cfg.use_fp32_head_weight:
+            dtype = torch.float32
+            print(
+                "使用 fp32 head weight!!!! 与原来的 bf16 head weight 不兼容\n",
+                end="",
+                flush=True,
+            )
+        torch.set_default_dtype(dtype)
+        self.head = ColumnParallelLinear(
+            cfg.n_embed,
+            decoder_cfg.in_channels,
+            bias=True,
+            group=group,
+            sequence_parallel=cfg.sequence_parallel,
+            use_userbuffer=cfg.use_userbuffer,
+            gradient_accumulation_fusion=cfg.gradient_accumulation_fusion,
+            use_fp32_output=False,
+        )
+        torch.set_default_dtype(default_dtype)
+        self.use_fp32_head_weight = cfg.use_fp32_head_weight
+    def forward(
+        self, input_args, images_split_mask: Optional[torch.BoolTensor] = None, **kwargs
+    ):
+        residual = None
+        if isinstance(input_args, tuple):
+            x, residual = input_args
+        else:
+            x = input_args
+        x = self.norm(x, residual)
+        if self.use_fp32_head_weight:
+            assert (
+                self.head.weight.dtype == torch.float32
+            ), f"head.weight is {self.head.weight.dtype}"
+            x = x.float()
+        if images_split_mask is None:
+            logits = self.head(x)
+        else:
+            bs, n_images = images_split_mask.shape[:2]
+            n_embed = x.shape[-1]
+            images_embed = torch.masked_select(
+                x.unsqueeze(1), images_split_mask.unsqueeze(-1)
+            )
+            images_embed = images_embed.view((bs * n_images, -1, n_embed))
+            logits = self.head(images_embed)
+        return logits
+class GlobalResponseNorm(nn.Module):
+    # Taken from https://github.com/facebookresearch/ConvNeXt-V2/blob/3608f67cc1dae164790c5d0aead7bf2d73d9719b/models/utils.py#L105
+    def __init__(self, dim):
+        super().__init__()
+        self.weight = nn.Parameter(torch.zeros(1, 1, 1, dim))
+        self.bias = nn.Parameter(torch.zeros(1, 1, 1, dim))
+    def forward(self, x):
+        gx = torch.norm(x, p=2, dim=(1, 2), keepdim=True)
+        nx = gx / (gx.mean(dim=-1, keepdim=True) + 1e-6)
+        return torch.addcmul(self.bias, (self.weight * nx + 1), x, value=1)
+class Downsample2D(nn.Module):
+    """A 2D downsampling layer with an optional convolution.
+    Parameters:
+        channels (`int`):
+            number of channels in the inputs and outputs.
+        use_conv (`bool`, default `False`):
+            option to use a convolution.
+        out_channels (`int`, optional):
+            number of output channels. Defaults to `channels`.
+        padding (`int`, default `1`):
+            padding for the convolution.
+        name (`str`, default `conv`):
+            name of the downsampling 2D layer.
+    """
+    def __init__(
+        self,
+        channels: int,
+        use_conv: bool = False,
+        out_channels: Optional[int] = None,
+        padding: int = 1,
+        name: str = "conv",
+        kernel_size=3,
+        stride=2,
+        norm_type=None,
+        eps=None,
+        elementwise_affine=None,
+        bias=True,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.padding = padding
+        self.name = name
+        if norm_type == "ln_norm":
+            self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(channels, eps)
+        elif norm_type is None:
+            self.norm = None
+        else:
+            raise ValueError(f"unknown norm_type: {norm_type}")
+        if use_conv:
+            conv = nn.Conv2d(
+                self.channels,
+                self.out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                bias=bias,
+            )
+        else:
+            assert self.channels == self.out_channels
+            conv = nn.AvgPool2d(kernel_size=stride, stride=stride)
+        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+        if name == "conv":
+            self.Conv2d_0 = conv
+            self.conv = conv
+        elif name == "Conv2d_0":
+            self.conv = conv
+        else:
+            self.conv = conv
+    def forward(self, hidden_states: torch.Tensor, *args, **kwargs) -> torch.Tensor:
+        assert hidden_states.shape[1] == self.channels
+        if self.norm is not None:
+            hidden_states = self.norm(hidden_states.permute(0, 2, 3, 1)).permute(
+                0, 3, 1, 2
+            )
+        if self.use_conv and self.padding == 0:
+            pad = (0, 1, 0, 1)
+            hidden_states = F.pad(hidden_states, pad, mode="constant", value=0)
+        assert hidden_states.shape[1] == self.channels
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
+class Upsample2D(nn.Module):
+    """A 2D upsampling layer with an optional convolution.
+    Parameters:
+        channels (`int`):
+            number of channels in the inputs and outputs.
+        use_conv (`bool`, default `False`):
+            option to use a convolution.
+        use_conv_transpose (`bool`, default `False`):
+            option to use a convolution transpose.
+        out_channels (`int`, optional):
+            number of output channels. Defaults to `channels`.
+        name (`str`, default `conv`):
+            name of the upsampling 2D layer.
+    """
+    def __init__(
+        self,
+        channels: int,
+        use_conv: bool = False,
+        use_conv_transpose: bool = False,
+        out_channels: Optional[int] = None,
+        name: str = "conv",
+        kernel_size: Optional[int] = None,
+        padding=1,
+        stride=2,
+        norm_type=None,
+        eps=None,
+        elementwise_affine=None,
+        bias=True,
+        interpolate=True,
+    ):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_conv_transpose = use_conv_transpose
+        self.name = name
+        self.interpolate = interpolate
+        self.stride = stride
+        if norm_type == "ln_norm":
+            self.norm = nn.LayerNorm(channels, eps, elementwise_affine)
+        elif norm_type == "rms_norm":
+            self.norm = RMSNorm(channels, eps)
+        elif norm_type is None:
+            self.norm = None
+        else:
+            raise ValueError(f"unknown norm_type: {norm_type}")
+        conv = None
+        if use_conv_transpose:
+            if kernel_size is None:
+                kernel_size = 4
+            conv = nn.ConvTranspose2d(
+                channels,
+                self.out_channels,
+                kernel_size=kernel_size,
+                stride=stride,
+                padding=padding,
+                bias=bias,
+            )
+        elif use_conv:
+            if kernel_size is None:
+                kernel_size = 3
+            conv = nn.Conv2d(
+                self.channels,
+                self.out_channels,
+                kernel_size=kernel_size,
+                padding=padding,
+                bias=bias,
+            )
+        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+        if name == "conv":
+            self.conv = conv
+        else:
+            self.Conv2d_0 = conv
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_size: Optional[int] = None,
+        *args,
+        **kwargs,
+    ) -> torch.Tensor:
+        assert hidden_states.shape[1] == self.channels
+        if self.norm is not None:
+            hidden_states = self.norm(hidden_states.permute(0, 2, 3, 1)).permute(
+                0, 3, 1, 2
+            )
+        if self.use_conv_transpose:
+            return self.conv(hidden_states)
+        # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16
+        # TODO(Suraj): Remove this cast once the issue is fixed in PyTorch
+        # https://github.com/pytorch/pytorch/issues/86679
+        dtype = hidden_states.dtype
+        if dtype == torch.bfloat16:
+            hidden_states = hidden_states.to(torch.float32)
+        # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+        if hidden_states.shape[0] >= 64:
+            hidden_states = hidden_states.contiguous()
+        # if `output_size` is passed we force the interpolation output
+        # size and do not make use of `scale_factor=2`
+        if self.interpolate:
+            if output_size is None:
+                hidden_states = F.interpolate(
+                    hidden_states, scale_factor=self.stride, mode="nearest"
+                )
+            else:
+                hidden_states = F.interpolate(
+                    hidden_states, size=output_size, mode="nearest"
+                )
+        # If the input is bfloat16, we cast back to bfloat16
+        if dtype == torch.bfloat16:
+            hidden_states = hidden_states.to(dtype)
+        # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
+        if self.use_conv:
+            if self.name == "conv":
+                hidden_states = self.conv(hidden_states)
+            else:
+                hidden_states = self.Conv2d_0(hidden_states)
+        return hidden_states
+class ConvNextBlock(nn.Module):
+    def __init__(
+        self,
+        channels,
+        norm_eps,
+        elementwise_affine,
+        use_bias,
+        hidden_dropout,
+        hidden_size,
+        res_ffn_factor: int = 4,
+    ):
+        super().__init__()
+        self.depthwise = nn.Conv2d(
+            channels,
+            channels,
+            kernel_size=7,
+            padding=3,
+            groups=channels,
+            bias=use_bias,
+        )
+        self.norm = RMSNorm(channels, norm_eps)
+        self.channelwise_linear_1 = nn.Linear(
+            channels, int(channels * res_ffn_factor), bias=use_bias
+        )
+        self.channelwise_act = nn.GELU()
+        self.channelwise_norm = GlobalResponseNorm(int(channels * res_ffn_factor))
+        self.channelwise_linear_2 = nn.Linear(
+            int(channels * res_ffn_factor), channels, bias=use_bias
+        )
+        self.channelwise_dropout = nn.Dropout(hidden_dropout)
+        self.cond_embeds_mapper = nn.Linear(hidden_size, channels * 2, use_bias)
+    def forward(self, x, cond_embeds):
+        x_res = x
+        x = self.depthwise(x)
+        x = x.permute(0, 2, 3, 1)
+        x = self.norm(x)
+        x = self.channelwise_linear_1(x)
+        x = self.channelwise_act(x)
+        x = self.channelwise_norm(x)
+        x = self.channelwise_linear_2(x)
+        x = self.channelwise_dropout(x)
+        x = x.permute(0, 3, 1, 2)
+        x = x + x_res
+        scale, shift = self.cond_embeds_mapper(F.silu(cond_embeds)).chunk(2, dim=1)
+        # x = x * (1 + scale[:, :, None, None]) + shift[:, :, None, None]
+        x = torch.addcmul(
+            shift[:, :, None, None], x, (1 + scale)[:, :, None, None], value=1
+        )
+        return x
+class Patchify(nn.Module):
+    def __init__(
+        self,
+        in_channels,
+        block_out_channels,
+        patch_size,
+        bias,
+        elementwise_affine,
+        eps,
+        kernel_size=None,
+    ):
+        super().__init__()
+        if kernel_size is None:
+            kernel_size = patch_size
+        self.patch_conv = nn.Conv2d(
+            in_channels,
+            block_out_channels,
+            kernel_size=kernel_size,
+            stride=patch_size,
+            bias=bias,
+        )
+        self.norm = RMSNorm(block_out_channels, eps)
+    def forward(self, x):
+        embeddings = self.patch_conv(x)
+        embeddings = embeddings.permute(0, 2, 3, 1)
+        embeddings = self.norm(embeddings)
+        embeddings = embeddings.permute(0, 3, 1, 2)
+        return embeddings
+class Unpatchify(nn.Module):
+    def __init__(
+        self, in_channels, out_channels, patch_size, bias, elementwise_affine, eps
+    ):
+        super().__init__()
+        self.norm = RMSNorm(in_channels, eps)
+        self.unpatch_conv = nn.Conv2d(
+            in_channels,
+            out_channels * patch_size * patch_size,
+            kernel_size=1,
+            bias=bias,
+        )
+        self.pixel_shuffle = nn.PixelShuffle(patch_size)
+        self.patch_size = patch_size
+    def forward(self, x):
+        # [b, c, h, w]
+        x = x.permute(0, 2, 3, 1)
+        x = self.norm(x)
+        x = x.permute(0, 3, 1, 2)
+        x = self.unpatch_conv(x)
+        x = self.pixel_shuffle(x)
+        return x
+class UVitBlock(nn.Module):
+    def __init__(
+        self,
+        channels,
+        out_channels,
+        num_res_blocks,
+        stride,
+        hidden_size,
+        hidden_dropout,
+        elementwise_affine,
+        norm_eps,
+        use_bias,
+        downsample: bool,
+        upsample: bool,
+        res_ffn_factor: int = 4,
+        seq_len=None,
+        concat_input=False,
+        original_input_channels=None,
+        use_zero=True,
+        norm_type="RMS",
+    ):
+        super().__init__()
+        self.res_blocks = nn.ModuleList()
+        for i in range(num_res_blocks):
+            conv_block = ConvNextBlock(
+                channels,
+                norm_eps,
+                elementwise_affine,
+                use_bias,
+                hidden_dropout,
+                hidden_size,
+                res_ffn_factor=res_ffn_factor,
+            )
+            self.res_blocks.append(conv_block)
+        if downsample:
+            self.downsample = Downsample2D(
+                channels=channels,
+                out_channels=out_channels,
+                use_conv=True,
+                name="Conv2d_0",
+                kernel_size=3,
+                padding=1,
+                stride=stride,
+                norm_type="rms_norm",
+                eps=norm_eps,
+                elementwise_affine=elementwise_affine,
+                bias=use_bias,
+            )
+        else:
+            self.downsample = None
+        if upsample:
+            self.upsample = Upsample2D(
+                channels=channels,
+                out_channels=out_channels,
+                use_conv_transpose=False,
+                use_conv=True,
+                kernel_size=3,
+                padding=1,
+                stride=stride,
+                name="conv",
+                norm_type="rms_norm",
+                eps=norm_eps,
+                elementwise_affine=elementwise_affine,
+                bias=use_bias,
+                interpolate=True,
+            )
+        else:
+            self.upsample = None
+    def forward(self, x, emb, recompute=False):
+        for res_block in self.res_blocks:
+            x = res_block(x, emb)
+        if self.downsample is not None:
+            x = self.downsample(x)
+        if self.upsample is not None:
+            x = self.upsample(x)
+        return x
+class ShallowUViTEncoder(nn.Module):
+    def __init__(
+        self,
+        input_channels=3,
+        stride=4,
+        kernel_size=7,
+        padding=None,
+        block_out_channels=(768,),
+        layers_in_middle=2,
+        hidden_size=2048,
+        elementwise_affine=True,
+        use_bias=True,
+        norm_eps=1e-6,
+        dropout=0.0,
+        use_mid_block=True,
+        **kwargs,
+    ):
+        super().__init__()
+        self.time_proj = Timesteps(
+            block_out_channels[0], flip_sin_to_cos=True, downscale_freq_shift=0
+        )
+        self.time_embed = TimestepEmbedding(
+            block_out_channels[0], hidden_size, sample_proj_bias=use_bias
+        )
+        if padding is None:
+            padding = math.ceil(kernel_size - stride)
+        self.in_conv = nn.Conv2d(
+            in_channels=input_channels,
+            out_channels=block_out_channels[0],
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+        )
+        if use_mid_block:
+            self.mid_block = UVitBlock(
+                block_out_channels[-1],
+                block_out_channels[-1],
+                num_res_blocks=layers_in_middle,
+                hidden_size=hidden_size,
+                hidden_dropout=dropout,
+                elementwise_affine=elementwise_affine,
+                norm_eps=norm_eps,
+                use_bias=use_bias,
+                downsample=False,
+                upsample=False,
+                stride=1,
+                res_ffn_factor=4,
+            )
+        else:
+            self.mid_block = None
+    def get_num_extra_tensors(self):
+        return 2
+    def forward(self, x, timesteps):
+        bs = x.shape[0]
+        dtype = x.dtype
+        t_emb = self.time_proj(timesteps.flatten()).view(bs, -1).to(dtype)
+        t_emb = self.time_embed(t_emb)
+        x_emb = self.in_conv(x)
+        if self.mid_block is not None:
+            x_emb = self.mid_block(x_emb, t_emb)
+        hs = [x_emb]
+        return x_emb, t_emb, hs
+class ShallowUViTDecoder(nn.Module):
+    def __init__(
+        self,
+        in_channels=768,
+        out_channels=3,
+        block_out_channels: Tuple[int] = (768,),
+        upsamples=2,
+        layers_in_middle=2,
+        hidden_size=2048,
+        elementwise_affine=True,
+        norm_eps=1e-6,
+        use_bias=True,
+        dropout=0.0,
+        use_mid_block=True,
+        **kwargs,
+    ):
+        super().__init__()
+        if use_mid_block:
+            self.mid_block = UVitBlock(
+                in_channels + block_out_channels[-1],
+                block_out_channels[
+                    -1
+                ],  # In fact, the parameter is not used because it has no effect when both downsample and upsample are set to false.
+                num_res_blocks=layers_in_middle,
+                hidden_size=hidden_size,
+                hidden_dropout=dropout,
+                elementwise_affine=elementwise_affine,
+                norm_eps=norm_eps,
+                use_bias=use_bias,
+                downsample=False,
+                upsample=False,
+                stride=1,
+                res_ffn_factor=4,
+            )
+        else:
+            self.mid_block = None
+        self.out_convs = nn.ModuleList()
+        for rank in range(upsamples):
+            if rank == upsamples - 1:
+                curr_out_channels = out_channels
+            else:
+                curr_out_channels = block_out_channels[-1]
+            if rank == 0:
+                curr_in_channels = block_out_channels[-1] + in_channels
+            else:
+                curr_in_channels = block_out_channels[-1]
+            self.out_convs.append(
+                Unpatchify(
+                    curr_in_channels,
+                    curr_out_channels,
+                    patch_size=2,
+                    bias=use_bias,
+                    elementwise_affine=elementwise_affine,
+                    eps=norm_eps,
+                )
+            )
+        self.input_norm = RMSNorm(in_channels, norm_eps)
+    def forward(self, x, hs, t_emb):
+        x = x.permute(0, 2, 3, 1)
+        x = self.input_norm(x)
+        x = x.permute(0, 3, 1, 2)
+        x = torch.cat([x, hs.pop()], dim=1)
+        if self.mid_block is not None:
+            x = self.mid_block(x, t_emb)
+        for out_conv in self.out_convs:
+            x = out_conv(x)
+        assert len(hs) == 0
+        return x

janus/models/__init__.py ADDED Viewed

	@@ -0,0 +1,28 @@

+# Copyright (c) 2023-2024 DeepSeek.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+from .image_processing_vlm import VLMImageProcessor
+from .modeling_vlm import MultiModalityCausalLM
+from .processing_vlm import VLChatProcessor
+__all__ = [
+    "VLMImageProcessor",
+    "VLChatProcessor",
+    "MultiModalityCausalLM",
+]