diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..5febc5f8c6599b10f1132897925c305eee622d5e 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +data/IT_data/T-T+X_data/audio_t2x.json filter=lfs diff=lfs merge=lfs -text +data/IT_data/T-T+X_data/image_t2x.json filter=lfs diff=lfs merge=lfs -text +data/IT_data/T-T+X_data/video_t2x.json filter=lfs diff=lfs merge=lfs -text +figures/demo.png filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..a03164857dacfceb0de215a15e40fa2612cd9fcd --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +__pycache__/ +.idea \ No newline at end of file diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 0000000000000000000000000000000000000000..7059d4adb9115845347d574009a98ecc914cb7ee --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,13 @@ +BSD 3-Clause License + +Copyright 2023 Shengqiong Wu All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. \ No newline at end of file diff --git a/README.md b/README.md index e7d90ddb789e264c5ea4a99dccd0b911883df4c5..69259f3750de7bd025535229075aba1c20992db5 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,417 @@ ---- -license: unknown ---- +# NExT-GPT: Any-to-Any Multimodal LLM +[Shengqiong Wu](https://chocowu.github.io/), [Hao Fei](http://haofei.vip/)*, [Leigang Qu](#), [Wei Ji](https://jiwei0523.github.io/), and [Tat-Seng Chua](https://www.chuatatseng.com/). 
+(*Correspondence ) + +**[NExT++](https://www.nextcenter.org/), School of Computing, National University of Singapore** + +----- + + + + +![License](https://img.shields.io/badge/License-BSD-blue.svg) +[![YouTube](https://badges.aleen42.com/src/youtube.svg)](https://www.youtube.com/watch?v=aqw2SCWeWD0) + + +This repository hosts the code, data and model weight of **NExT-GPT**, the first end-to-end MM-LLM that perceives input and generates output in arbitrary combinations (any-to-any) of text, image, video, and audio and beyond. + + + +----------- + +## 🎉 News + +- [x] [2023.09.15] 🚀🚀 Release the code of NExT-GPT in version `7b_tiva_v0`. +- [x] [2023.09.27] 🔨🧩 Added modality-blended batch sampler . +- [x] [2023.10.01] 📢📢 Release the T2M instruction dataset. +- [x] [2023.10.04] 👏👏 Release the checkpoint of NExT-GPT in version [7b_tiva_v0](https://huggingface.co/ChocoWu/nextgpt_7b_tiva_v0) . +- [x] [2023.10.15] 🔨🚀 Update of NExT-GPT in version [7b_tiva_v0](https://huggingface.co/ChocoWu/nextgpt_7b_tiva_v0) . + + +## 👉 TODO +- [ ] Release MosIT data. +- [ ] Updating NExT-GPT in more types&sizes of LLMs. +- [ ] Empowering NExT-GPT with more modalities of inputs&outputs. +- [ ] ... + + + +----------- + +## Example Demos +Here we showcase examples generated from NExT-GPT. +For more examples, kindly visit the [webpage](https://next-gpt.github.io/), or the online live [demo](https://acc414b22d6839d28f.gradio.live). + + +https://github.com/NExT-GPT/NExT-GPT/assets/18722770/0c2b3d88-a533-4899-ab44-65580fe54538 + + +https://github.com/NExT-GPT/NExT-GPT/assets/18722770/eb1319a6-38aa-4546-a96e-163207e7de93 + + +https://github.com/NExT-GPT/NExT-GPT/assets/18722770/36bec0ad-9bad-4bcf-bc37-92b028f1bc6a + + + + + +## Brief Introduction + + +NExt-GPT is built on top of existing pre-trained LLM, multimodal encoder and SoTA diffusion models, with sufficient end-to-end instruction tuning. + +

+*NExT-GPT framework overview figure.*

+ +- **Multimodal Encoding Stage.** Leveraging established encoders to encode inputs in various modalities, where these representations are projected into language-like representations comprehensible to the LLM through a projection layer. +- **LLM Understanding and Reasoning Stage.** Harnessing an existing open-sourced LLM as the core to process input information for semantic understanding and reasoning. The LLM not only directly generates text tokens but also produces unique “modality signal” tokens that serve as instructions to dictate the decoding layers whether & what modal content to output correspondingly. +- **Multimodal Generation Stage.** Receiving the multimodal signals with specific instructions from LLM (if any), the Transformer-based output projection layers map the signal token representations into the ones that are understandable to following multimodal decoders. + + +For more technical details, kindly refer to the [paper](https://arxiv.org/pdf/2309.05519.pdf). + + +----------- + + + + +## Getting Started + + + + + +### Table of Contents: +* 1. Code Structure +* 2. Environment Preparation +* 3. Training/Adapting NExt-GPT on Your Own + * 3.1. Preparing Pre-trained Checkpoint + * 3.2. Preparing Dataset + * 3.3. Precomputing Embeddings + * 3.4. Training NExT-GPT +* 4. Running NExT-GPT System + * 4.1. Preparing checkpoints + * 4.2. Deploying Demo System + +**** + + + + + + + +### 1. Code Structure + +``` +├── figures +├── data +│ ├── T-X_pair_data +│ │ ├── audiocap # text-autio pairs data +│ │ │ ├── audios # audio files +│ │ │ └── audiocap.json # the audio captions +│ │ ├── cc3m # text-image paris data +│ │ │ ├── images # image files +│ │ │ └── cc3m.json # the image captions +│ │ └── webvid # text-video pairs data +│ │ │ ├── videos # video files +│ │ │ └── webvid.json # the video captions +│ ├── IT_data # instruction data +│ │ ├── T+X-T_data # text+[image/audio/video] to text instruction data +│ │ │ ├── alpaca # textual instruction data +│ │ │ ├── llava # visual instruction data +│ │ ├── T-T+X # synthesized text to text+[image/audio/video] instruction data +│ │ └── MosIT # Modality-switching Instruction Tuning instruction data +├── code +│ ├── config +│ │ ├── base.yaml # the model configuration +│ │ ├── stage_1.yaml # enc-side alignment training configuration +│ │ ├── stage_2.yaml # dec-side alignment training configuration +│ │ └── stage_3.yaml # instruction-tuning configuration +│ ├── dsconfig +│ │ ├── stage_1.json # deepspeed configuration for enc-side alignment training +│ │ ├── stage_2.json # deepspeed configuration for dec-side alignment training +│ │ └── stage_3.json # deepspeed configuration for instruction-tuning training +│ ├── datast +│ │ ├── base_dataset.py +│ │ ├── catalog.py # the catalog information of the dataset +│ │ ├── cc3m_datast.py # process and load text-image pair dataset +│ │ ├── audiocap_datast.py # process and load text-audio pair dataset +│ │ ├── webvid_dataset.py # process and load text-video pair dataset +│ │ ├── T+X-T_instruction_dataset.py # process and load text+x-to-text instruction dataset +│ │ ├── T-T+X_instruction_dataset.py # process and load text-to-text+x instruction dataset +│ │ └── concat_dataset.py # process and load multiple dataset +│ ├── model +│ │ ├── ImageBind # the code from ImageBind Model +│ │ ├── common +│ │ ├── anyToImageVideoAudio.py # the main model file +│ │ ├── agent.py +│ │ ├── modeling_llama.py +│ │ ├── custom_ad.py # the audio diffusion +│ │ ├── custom_sd.py # the image diffusion +│ │ ├── custom_vd.py # the video 
diffusion +│ │ ├── layers.py # the output projection layers +│ │ └── ... +│ ├── scripts +│ │ ├── train.sh # training NExT-GPT script +│ │ └── app.sh # deploying demo script +│ ├── header.py +│ ├── process_embeddings.py # precompute the captions embeddings +│ ├── train.py # training +│ ├── inference.py # inference +│ ├── demo_app.py # deploy Gradio demonstration +│ └── ... +├── ckpt +│ ├── delta_ckpt # tunable NExT-GPT params +│ │ ├── nextgpt +│ │ │ ├── 7b_tiva_v0 # the directory to save the log file +│ │ │ │ ├── log # the logs +│ └── ... +│ ├── pretrained_ckpt # frozen params of pretrained modules +│ │ ├── imagebind_ckpt +│ │ │ ├──huge # version +│ │ │ │ └──imagebind_huge.pth +│ │ ├── vicuna_ckpt +│ │ │ ├── 7b_v0 # version +│ │ │ │ ├── config.json +│ │ │ │ ├── pytorch_model-00001-of-00002.bin +│ │ │ │ ├── tokenizer.model +│ │ │ │ └── ... +├── LICENCE.md +├── README.md +└── requirements.txt +``` + + + + + +### 2. Environment Preparation [Back to Top] +Please first clone the repo and install the required environment, which can be done by running the following commands: +``` +conda env create -n nextgpt python=3.8 + +conda activate nextgpt + +# CUDA 11.6 +conda install pytorch==1.13.1 torchvision==0.14.1 torchaudio==0.13.1 pytorch-cuda=11.6 -c pytorch -c nvidia + +git clone https://github.com/NExT-GPT/NExT-GPT.git +cd NExT-GPT + +pip install -r requirements.txt +``` + + + +### 3. Training/Adapting NExt-GPT on Your Own + +#### + + + + + +#### 3.1. Preparing Pre-trained Checkpoint [Back to Top] +NExT-GPT is trained based on following excellent existing models. +Please follow the instructions to prepare the checkpoints. + +- `ImageBind` +is the unified image/video/audio encoder. The pre-trained checkpoint can be downloaded from [here](https://dl.fbaipublicfiles.com/imagebind/imagebind_huge.pth) with version `huge`. Afterward, put the `imagebind_huge.pth` file at [[./ckpt/pretrained_ckpt/imagebind_ckpt/huge]](ckpt/pretrained_ckpt/imagebind_ckpt/). +- `Vicuna`: +first prepare the LLaMA by following the instructions [[here]](ckpt/pretrained_ckpt/prepare_vicuna.md). Then put the pre-trained model at [[./ckpt/pretrained_ckpt/vicuna_ckpt/]](ckpt/pretrained_ckpt/vicuna_ckpt/). +- `Image Diffusion` +is used to generate images. NExT-GPT uses [Stable Diffusion](https://huggingface.co/runwayml/stable-diffusion-v1-5) with version ` +v1-5`. (_will be automatically downloaded_) +- `Audio Diffusion` +for producing audio content. NExT-GPT employs [AudioLDM](https://github.com/haoheliu/AudioLDM) with version `l-full`. (_will be automatically downloaded_) +- `Video Diffusion` +for the video generation. We employ [ZeroScope](https://huggingface.co/cerspense/zeroscope_v2_576w) with version `v2_576w`. (_will be automatically downloaded_) + + + + + +#### 3.2. Preparing Dataset [Back to Top] +Please download the following datasets used for model training: + +A) T-X pairs data + - `CC3M` of ***text-image*** pairs, please follow this instruction [[here]](./data/T-X_pair_data/cc3m/prepare.md). Then put the data at [[./data/T-X_pair_data/cc3m]](./data/T-X_pair_data/cc3m). + - `WebVid` of ***text-video*** pairs, see the [[instruction]](./data/T-X_pair_data/webvid/prepare.md). The file should be saved at [[./data/T-X_pair_data/webvid]](./data/T-X_pair_data/webvid). + - `AudioCap` of ***text-audio*** pairs, see the [[instruction]](./data/T-X_pair_data/audiocap/prepare.md). Save the data in [[./data/T-X_pair_data/audiocap]](./data/T-X_pair_data/audiocap). 
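+
+ The loaders under [[./code/dataset]](./code/dataset) expect each of these caption files to be a JSON list of records with an `image_name` / `video_name` / `audio_name` field plus a `caption` field. As a quick, purely illustrative sanity check (assuming the default data layout from Section 1 and running from the repository root):
+
+```python
+# Illustrative check only; field names follow cc3m_dataset.py, webvid_dataset.py and audiocap_dataset.py.
+import json
+
+for path, key in [
+    ("data/T-X_pair_data/cc3m/cc3m.json", "image_name"),
+    ("data/T-X_pair_data/webvid/webvid.json", "video_name"),
+    ("data/T-X_pair_data/audiocap/audiocap.json", "audio_name"),
+]:
+    with open(path, "r", encoding="utf-8") as f:
+        rows = json.load(f)  # a list of {"<modality>_name": ..., "caption": ...} records
+    assert rows and key in rows[0] and "caption" in rows[0], f"unexpected schema in {path}"
+    print(f"{path}: {len(rows)} caption records")
+```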
+ +B) Instruction data + - T+X-T + - `LLaVA` of the ***visual instruction data***, download it from [here](https://github.com/haotian-liu/LLaVA/blob/main/docs/Data.md), and then put it at [[./data/IT_data/T+X-T_data/llava]](./data/IT_data/T+X-T_data/llava/). + - `Alpaca` of the ***textual instruction data***, download it from [here](https://github.com/tatsu-lab/stanford_alpaca), and then put it at [[./data/IT_data/T+X-T_data/alpaca/]](data/IT_data/T+X-T_data/alpaca/). + - `VideoChat`, download the ***video instruction data*** [here](https://github.com/OpenGVLab/InternVideo/tree/main/Data/instruction_data), and then put it at [[./data/IT_data/T+X-T_data/videochat/]](data/IT_data/T+X-T_data/videochat/). + + Side note:After downloading dataset, please run `preprocess_dataset.py` to preprocess the dataset into a unified format. + - T-X+T (T2M) + - The `T-X+T` instruction datasets (T2M) are saved at [[./data/IT_data/T-T+X_data]](./data/IT_data/T-T+X_data). + + - MosIT + - Download the file from [here](), put them in [[./data/IT_data/MosIT_data/]](./data/IT_data/MosIT_data/). (_We are in the process of finalizing the data and handling the copyright issue. Will release later._) + + + + +#### 3.3. Precomputing Embeddings [Back to Top] +In decoding-side alignment training, we minimize the distance between the representation of signal tokens and captions. +To save costs of time and memory, we precompute the text embeddings for image, audio and video captions using the text encoder within the respective diffusion models. + +Please run this command before the following training of NExT-GPT, where the produced `embedding` file will be saved at [[./data/embed]](./data/embed). +```angular2html +cd ./code/ +python process_embeddings.py ../data/T-X_pair_data/cc3m/cc3m.json image ../data/embed/ runwayml/stable-diffusion-v1-5 +``` + +Note of arguments: +- args[1]: path of caption file; +- args[2]: modality, which can be `image`, `video`, and `audio`; +- args[3]: saving path of embedding file; +- args[4]: corresponding pre-trained diffusion model name. + + + + + +#### 3.4. Training NExT-GPT [Back to Top] + +First of all, please refer to the base configuration file [[./code/config/base.yaml]](./code/config/base.yaml) for the basic system setting of overall modules. + +Then, the training of NExT-GPT starts with this script: +```angular2html +cd ./code +bash scripts/train.sh +``` +Specifying the command: +```angular2html +deepspeed --include localhost:0 --master_addr 127.0.0.1 --master_port 28459 train.py \ + --model nextgpt \ + --stage 1\ + --save_path ../ckpt/delta_ckpt/nextgpt/7b_tiva_v0/\ + --log_path ../ckpt/delta_ckpt/nextgpt/7b_tiva_v0/log/ +``` +where the key arguments are: +- `--include`: `localhost:0` indicating the GPT cuda number `0` of deepspeed. +- `--stage`: training stage. +- `--save_path`: the directory which saves the trained delta weights. This directory will be automatically created. +- `--log_path`: the directory which saves the log file. + + + + + + +The whole NExT-GPT training involves 3 steps: + +- **Step-1**: Encoding-side LLM-centric Multimodal Alignment. This stage trains the ***input projection layer*** while freezing the ImageBind, LLM, output projection layer. + + Just run the above `train.sh` script by setting: `--stage 1` + + Also refer to the running config file [[./code/config/stage_1.yaml]](./code/config/stage_1.yaml) and deepspeed config file [[./code/dsconfig/stage_1.yaml]](./code/dsconfig/stage_1.yaml) for more step-wise configurations. 
+ + Note that the dataset used for training in this step is included `dataset_name_list` and the dataset name must precisely match the definition in [[./code/dataset/catalog.py]](./code/dataset/catalog.py) + + + +- **Step-2**: Decoding-side Instruction-following Alignment. This stage trains the ***output projection layers*** while freezing the ImageBind, LLM, input projection layers. + + Just run the above `train.sh` script by setting: `--stage 2` + + Also refer to the running config file [[./code/config/stage_2.yaml]](./code/config/stage_2.yaml) and deepspeed config file [[./code/dsconfig/stage_2.yaml]](./code/dsconfig/stage_2.yaml) for more step-wise configurations. + + + + + +- **Step-3**: Instruction Tuning. This stage instruction-tune 1) the ***LLM*** via LoRA, 2) ***input projection layer*** and 3) ***output projection layer*** on the instruction dataset. + + Just run the above `train.sh` script by setting: `--stage 3` + + Also refer to the running config file [[./code/config/stage_3.yaml]](./code/config/stage_3.yaml) and deepspeed config file [[./code/dsconfig/stage_3.yaml]](./code/dsconfig/stage_3.yaml) for more step-wise configurations. + + + + + + +## 4. Running NExT-GPT System [Back to Top] + + + + + +#### 4.1. Preparing Checkpoints + +First, loading the pre-trained NExT-GPT system. +- **Step-1**: load `Frozen parameters`. Please refer to 3.1 Preparing Pre-trained Checkpoint. + +- **Step-2**: load `Tunable parameters`. Please put the NExT-GPT system at [[./ckpt/delta_ckpt/nextgpt/7b_tiva_v0]](./ckpt/delta_ckpt/nextgpt/7b_tiva_v0). You may either 1) use the params trained yourselves, or 2) download our checkpoints from [Huggingface](https://huggingface.co/ChocoWu/nextgpt_7b_tiva_v0). + + + + + +#### 4.2. Deploying Gradio Demo +Upon completion of the checkpoint loading, you can run the demo locally via: +```angular2html +cd ./code +bash scripts/app.sh +``` +Specifying the key arguments as: +- `--nextgpt_ckpt_path`: the path of pre-trained NExT-GPT params. + +--------- + + +## Contact + +For any questions or feedback, feel free to contact [Shengqiong Wu](mailto:swu@u.nus.edu) and [Hao Fei](mailto:haofei37@nus.edu.sg). + + +## Citation + +If you find NextGPT useful in your research or applications, please kindly cite: +``` +@articles{wu2023nextgpt, + title={NExT-GPT: Any-to-Any Multimodal LLM}, + author={Shengqiong Wu and Hao Fei and Leigang Qu and Wei Ji and Tat-Seng Chua}, + journal = {CoRR}, + volume = {abs/2309.05519}, + year={2023} +} +``` + + + + + +## Acknowledgements +You may refer to related work that serves as foundations for our framework and code repository, +[Vicuna](https://github.com/lm-sys/FastChat), +[ImageBind](https://github.com/facebookresearch/ImageBind), +[Stable Diffusion](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/text2img), +[AudioLDM](https://github.com/haoheliu/AudioLDM), and +[Zeroscope](https://huggingface.co/cerspense/zeroscope_v2_576w). +We also partially draw inspirations from +[PandaGPT](https://github.com/yxuansu/PandaGPT), +[VPGTrans](https://vpgtrans.github.io/), +[GILL](https://github.com/kohjingyu/gill/), +[CoDi](https://codi-gen.github.io/), +[Video-LLaMA](https://github.com/DAMO-NLP-SG/Video-LLaMA), +and [MiniGPT-4](https://github.com/Vision-CAIR/MiniGPT-4). +Thanks for their wonderful works. + + + + +## License Notices +This repository is under [BSD 3-Clause License](LICENSE.txt). +NExT-GPT is a research project intended for non-commercial use only. 
+One must NOT use the code of NExT-GPT for any illegal, harmful, violent, racist, or sexual purposes. +One is strictly prohibited from engaging in any activity that will potentially violate these guidelines. +Any potential commercial use of this code should be approved by the authors. diff --git a/ckpt/__init__.py b/ckpt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ckpt/delta_ckpt/nextgpt/7b_tiva_v0/__init__.py b/ckpt/delta_ckpt/nextgpt/7b_tiva_v0/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ckpt/pretrained_ckpt/__init__.py b/ckpt/pretrained_ckpt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ckpt/pretrained_ckpt/imagebind_ckpt/__init__.py b/ckpt/pretrained_ckpt/imagebind_ckpt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ckpt/pretrained_ckpt/imagebind_ckpt/huge/__init__.py b/ckpt/pretrained_ckpt/imagebind_ckpt/huge/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/ckpt/pretrained_ckpt/prepare_vicuna.md b/ckpt/pretrained_ckpt/prepare_vicuna.md new file mode 100644 index 0000000000000000000000000000000000000000..9b005136f0a47ea5f52f765a49518e17513202dc --- /dev/null +++ b/ckpt/pretrained_ckpt/prepare_vicuna.md @@ -0,0 +1,80 @@ +# 1. Prepare Vicuna Checkpoint + +The language decoder of NExT-GPT relies on Vicuna version 0 which is an open-source LLaMA-based LLM. +However, due to the distribution license of LLaMA, manual restoration of Vicuna's weights is required. +Below are the instructions for restoring these weights. +(These original instruction comes from the [PandaGPT](https://github.com/yxuansu/PandaGPT)). + + +## 1.1. Prepare LLaMA Weights +* Request the original weights of LLaMA from Meta by filling [this form](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform). +* After obtaining the weights of a specific LLaMA (e.g. 7B, 13B), following [instructions](https://huggingface.co/docs/transformers/main/model_doc/llama) provided by Huggingface to convert it into Huggingface format. + +> **** After conversion, the directory should look like: + + . + └── ./{path_to_llama_weights}/ + │ ├── config.json + │ ├── generation_config.json + │ ├── pytorch_model-00001-of-00002.bin + │ ├── pytorch_model-00002-of-00002.bin + │ ├── pytorch_model.bin.index.json + │ ├── special_tokens_map.json + │ ├── tokenizer.model + │ └── tokenizer_config.json + +`{path_to_llama_weights}` is where you store the checkpoints. + + +## 1.2. Prepare the Delta Weights of Vicuna + +Then, you should download the delta weights of Vicuna provided by the original authors. You can find the corresponding links to 7B/13B Vicuna models in the table below. + +|**Model Size**|**Delta Weights Address**|**Version**| +|:-------------:|:-------------:|:-------------:| +|7B|[[Link]](https://huggingface.co/lmsys/vicuna-7b-delta-v0)|0| +|13B|[[Link]](https://huggingface.co/lmsys/vicuna-13b-delta-v0)|0| + + + +> **** After conversion, the directory should look like: + + . 
+ └── ./{path_to_delta_vicuna_weights}/ + ├── config.json + ├── generation_config.json + ├── pytorch_model-00001-of-00002.bin + ├── pytorch_model-00002-of-00002.bin + ├── pytorch_model.bin.index.json + ├── special_tokens_map.json + ├── tokenizer.model + └── tokenizer_config.json + +`{path_to_delta_vicuna_weights}` is where you store the delta weights of Vicuna. + +## 1.3. Combine the Weights: + +When the two sets of weights are ready, you can combine them using tools from the Vicuna team. + +First, install the required library. +```yaml +pip install git+https://github.com/lm-sys/FastChat.git@v0.1.10 +``` + +Then, run the following command. +```yaml +python -m fastchat.model.apply_delta --base {path_to_llama_weights} --target ./vicuna_ckpt/7b_v0/ --delta {path_to_delta_vicuna_weights} +``` + +> **** Now, the final weights are ready as: + + . + └── ./vicuna_ckpt/7b_v0/ + ├── config.json + ├── generation_config.json + ├── pytorch_model-00001-of-00002.bin + ├── pytorch_model-00002-of-00002.bin + ├── pytorch_model.bin.index.json + ├── special_tokens_map.json + ├── tokenizer.model + └── tokenizer_config.json \ No newline at end of file diff --git a/ckpt/pretrained_ckpt/vicuna_ckpt/__init__.py b/ckpt/pretrained_ckpt/vicuna_ckpt/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/__init__.py b/code/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/code/bot.png b/code/bot.png new file mode 100644 index 0000000000000000000000000000000000000000..0047bf66e24ff259b7ea02081316c3d881854856 Binary files /dev/null and b/code/bot.png differ diff --git a/code/config/__init__.py b/code/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..6cfacbf97b6ec4c64fb5086589887c9e2cd1e966 --- /dev/null +++ b/code/config/__init__.py @@ -0,0 +1,41 @@ +import yaml + + +def load_model_config(stage, mode): + # load special config for each model + config_path = f'config/stage_{stage}.yaml' + print(f'[!] load configuration from {config_path}') + with open(config_path) as f: + configuration = yaml.load(f, Loader=yaml.FullLoader) + new_config = {} + for key, value in configuration.items(): + if key in ['train', 'test', 'validation']: + if mode == key: + new_config.update(value) + else: + new_config[key] = value + configuration = new_config + return configuration + + +def load_config(args): + '''the configuration of each model can rewrite the base configuration''' + # base config + base_configuration = load_base_config() + + # load stage config + # if args.get('mode'): + stage_configuration = load_model_config(args['stage'], args['mode']) + + # update and append the stage config for base config + base_configuration.update(stage_configuration) + configuration = base_configuration + return configuration + + +def load_base_config(): + config_path = f'config/base.yaml' + with open(config_path) as f: + configuration = yaml.load(f, Loader=yaml.FullLoader) + print(f'[!] 
load base configuration: {config_path}') + return configuration diff --git a/code/config/base.yaml b/code/config/base.yaml new file mode 100644 index 0000000000000000000000000000000000000000..eb4c3b4c4c110877cb11670c24feab14726b82a4 --- /dev/null +++ b/code/config/base.yaml @@ -0,0 +1,45 @@ +# ========= system global ========== # +models: + nextgpt: + model_name: NextGPTModel + agent_name: DeepSpeedAgent + +seed: 13 +max_length: 512 # max length of the user input prompt +logging_step: 5 +num_clip_tokens: 77 +gen_emb_dim: 768 +pretrained_ckpt_path: ../ckpt/pretrained_ckpt/ + +# ========= LLM ========== # +vicuna_version: 7b_v0 # [7b_v0, ] + +# ========= multimodal encoder ========== # +imagebind_version: huge + +# ========= text-to-image alignment tuning ========== # +n_img_tokens: 4 +text_emb_to_img_layers: [-1] +num_gen_img_tokens: 4 +text_fc_to_img_mode: transformer # [qformer, transformer] + +# ========= text-to-video alignment tuning ========== # +n_video_tokens: 24 +text_emb_to_video_layers: [-1] +num_gen_video_tokens: 24 +text_fc_to_video_mode: transformer # [qformer, transformer] + +# ========= text-to-audio alignment tuning ========== # +n_audio_tokens: 8 +text_emb_to_audio_layers: [-1] +num_gen_audio_tokens: 8 +text_fc_to_audio_mode: transformer # [qformer, transformer] + +# ========= image diffusion model ========== # +image_diffusion: runwayml/stable-diffusion-v1-5 # [runwayml/stable-diffusion-v1-5, stabilityai/stable-diffusion-2] + +# ========= video diffusion model ========== # +video_diffusion: cerspense/zeroscope_v2_576w + +# ========= audio diffusion model ========== # +audio_diffusion: cvssp/audioldm-l-full # [cvssp/audioldm-l-full, cvssp/audioldm-s-full-v2] diff --git a/code/config/stage_1.yaml b/code/config/stage_1.yaml new file mode 100644 index 0000000000000000000000000000000000000000..d47328bd44d8d25fe440ef35d06a4cab6778252a --- /dev/null +++ b/code/config/stage_1.yaml @@ -0,0 +1,10 @@ +freeze_lm: true +freeze_input_proj: false +freeze_output_proj: true +prompt: 'generate a caption' # the prompting information for the enc-side alignment. +train: + warmup_rate: 0.1 + epochs: 1 + max_length: 512 + max_shard_size: 10GB + dataset_name_list: ['cc3m_enc', 'webvid_enc', 'audiocap_enc'] diff --git a/code/config/stage_2.yaml b/code/config/stage_2.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c360fba38ad11c16785f68d4bbac7986fb33a7dc --- /dev/null +++ b/code/config/stage_2.yaml @@ -0,0 +1,10 @@ +freeze_lm: true +freeze_input_proj: true +freeze_output_proj: false +prompt: '' # the prompting information for the enc-side alignment. +train: + warmup_rate: 0.1 + epochs: 1 + max_length: 512 + max_shard_size: 10GB + dataset_name_list: ['cc3m_dec', 'webvid_dec', 'audiocap_dec'] \ No newline at end of file diff --git a/code/config/stage_3.yaml b/code/config/stage_3.yaml new file mode 100644 index 0000000000000000000000000000000000000000..242bb7ae0b8a770eac79db3d242659001524f34c --- /dev/null +++ b/code/config/stage_3.yaml @@ -0,0 +1,18 @@ +# ========= lora hyper-params ========== # +lora_r: 32 +lora_alpha: 32 +lora_dropout: 0.1 + +freeze_lm: false +freeze_input_proj: false +freeze_output_proj: false +prompt: '' # the prompting information for the enc-side alignment. 
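+# Note: with the three freeze_* flags above set to false, stage 3 instruction-tunes the LLM via LoRA
+# (using lora_r / lora_alpha / lora_dropout) while both the input and output projection layers stay trainable.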
+ +train: + warmup_rate: 0.1 + epochs: 1 + max_length: 512 + max_shard_size: 10GB + dataset_name_list: ['audio_instruction', 'video_instruction', 'image_instruction', 'llava_instruction', 'alpaca_instruction'] + + diff --git a/code/dataset/T+X-T_instruction_dataset.py b/code/dataset/T+X-T_instruction_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..496ab7c7cf4f996c45d4878b73eef1f3dfd4f66b --- /dev/null +++ b/code/dataset/T+X-T_instruction_dataset.py @@ -0,0 +1,63 @@ +import json +import os.path + +from torch.utils.data import Dataset +from tqdm import tqdm +import pandas as pd +import re +import random +import numpy as np +import torch + + +# from .base_dataset import BaseDataset + + +class TX2TInstructionDataset(Dataset): + """ + T + X - T instruction Dataset + """ + def __init__(self, data_path: str, mm_root_path: str = None, dataset_type: str='ImageToText'): + super(TX2TInstructionDataset, self).__init__() + + self.mm_root_path = mm_root_path + self.instruction_list = [] + self.mm_path_list = [] + self.dataset_category = 't2t' if mm_root_path is None else 'tx2t' + with open(data_path, 'r', encoding='utf-8') as f: + res = json.load(f) + for instance in tqdm(res, total=len(res)): + self.instruction_list.append(instance['conversation']) + if self.dataset_category == 'tx2t': + # Text + X -> Text dataset + self.mm_path_list.append(os.path.join(mm_root_path, instance['image_name'])) + self.dataset_type_list = [dataset_type for _ in range(len(self.instruction_list))] + + def __len__(self): # number of instances + return len(self.instruction_list) + + def __getitem__(self, i): + if self.dataset_category == 'tx2t': + # Text + X -> Text dataset + return dict(mm_paths=self.mm_path_list[i], output_texts=self.instruction_list[i], + dataset_types=self.dataset_type_list[i]) + else: + # Text -> Text dataset + return dict(output_texts=self.instruction_list[i], dataset_types=self.dataset_type_list[i]) + + def collate(self, instances): + if self.dataset_category == 'tx2t': + mm_paths, output_texts, dataset_types = tuple( + [instance[key] for instance in instances] for key in ("mm_paths", "output_texts", "dataset_types")) + return dict( + mm_paths=mm_paths, + output_texts=output_texts, + dataset_types=dataset_types + ) + else: + output_texts, dataset_types = tuple( + [instance[key] for instance in instances] for key in ("output_texts", "dataset_types")) + return dict( + output_texts=output_texts, + dataset_types=dataset_types + ) diff --git a/code/dataset/T-T+X_instruction_dataset.py b/code/dataset/T-T+X_instruction_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..40d34ddb42e6e7cd310f3e4106eaa63068552ea3 --- /dev/null +++ b/code/dataset/T-T+X_instruction_dataset.py @@ -0,0 +1,49 @@ +import json +import os.path + +from torch.utils.data import Dataset +from tqdm import tqdm +import pandas as pd +import re +import random +import numpy as np +import torch + + +# from .base_dataset import BaseDataset + + +class T2XTInstructionDataset(Dataset): + """ + T - T + X instruction Dataset + """ + def __init__(self, data_path: str, embed_path: str, dataset_type: str = "TextToImage"): + super(T2XTInstructionDataset, self).__init__() + + self.embed_path = embed_path + self.instruction_list = [] + self.mm_path_list = [] + with open(data_path, 'r', encoding='utf-8') as f: + res = json.load(f) + for instance in tqdm(res, total=len(res)): + self.instruction_list.append(instance['conversation']) + self.mm_path_list.append(instance['mm_name']) + 
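+        # Note: the basename of `mm_name` also serves as the key of the precomputed caption
+        # embedding (`<mm_name>.npy` under `embed_path`) that is loaded in `__getitem__` below.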
self.dataset_type_list = [dataset_type for _ in range(len(self.instruction_list))] + + def __len__(self): # number of instances + return len(self.instruction_list) + + def __getitem__(self, i): + with open(os.path.join(self.embed_path, str(os.path.basename(self.mm_path_list[i])) + '.npy'), 'rb') as f: + caption_embs = torch.from_numpy(np.load(f, allow_pickle=True)) # (num_clip_tokens, 768) + + return dict(output_texts=self.instruction_list[i], caption_embs=caption_embs, dataset_types=self.dataset_type_list[i]) + + def collate(self, instances): + output_texts, caption_embs, dataset_types = tuple( + [instance[key] for instance in instances] for key in ("output_texts", "caption_embs", "dataset_types")) + return dict( + output_texts=output_texts, + caption_embs=caption_embs, + dataset_types=dataset_types + ) diff --git a/code/dataset/__init__.py b/code/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..90b9faa69c0562038a5dd39b3cbcb96ee628e1a4 --- /dev/null +++ b/code/dataset/__init__.py @@ -0,0 +1,37 @@ +from header import * +from .samplers import DistributedBatchSampler, DistributedMultiDatasetBatchSampler +from .catalog import DatasetCatalog +from .utils import instantiate_from_config +import torch +from torch.utils.data import ConcatDataset +from .concat_dataset import MyConcatDataset + + +def load_dataset(args, dataset_name_list): + """ + Args: + args: + dataset_name_list: List[str] + repeats: List[int], the training epochs for each dataset + + """ + # concat_data = get_concat_dataset(dataset_name_list) + concat_data = MyConcatDataset(dataset_name_list) + world_size = torch.distributed.get_world_size() + rank = torch.distributed.get_rank() + batch_size = args['world_size'] * args['dschf'].config['train_micro_batch_size_per_gpu'] + sampler = torch.utils.data.RandomSampler(concat_data) + batch_sampler = DistributedMultiDatasetBatchSampler(dataset=concat_data, + sampler=sampler, + batch_size=batch_size, + drop_last=True, + rank=rank, + world_size=world_size) + iter_ = DataLoader( + concat_data, + batch_sampler=batch_sampler, + num_workers=1, + collate_fn=concat_data.collate, + pin_memory=True + ) + return concat_data, iter_, sampler diff --git a/code/dataset/audiocap_dataset.py b/code/dataset/audiocap_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..b5e3cc0b2ce3ba8063f97d01e46918333665d37c --- /dev/null +++ b/code/dataset/audiocap_dataset.py @@ -0,0 +1,55 @@ +# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import os +import json +from tqdm import tqdm +import ipdb +import random +from torch.nn.utils.rnn import pad_sequence +from dataclasses import dataclass, field +from typing import Callable, Dict, Sequence + +import torch +import torch.distributed as dist +import transformers +import numpy as np +from torch.utils.data import Dataset +from .base_dataset import BaseDataset +from tqdm import tqdm +import pandas as pd +from .utils import process_caption + + +class AudioCapDataset(BaseDataset): + """Dataset for supervised fine-tuning.""" + + def __init__(self, data_path: str, mm_root_path: str, embed_path: str, dataset_type: str): + super(AudioCapDataset, self).__init__(data_path, mm_root_path, embed_path, dataset_type) + self.embed_path = embed_path + + print('Load Audiocap dataset ...') + self.mm_path_list, self.caption_list = [], [] + with open(data_path, 'r', encoding='utf-8') as f: + data = json.load(f) + for row in tqdm(data, total=len(data)): + audio_id, one_caption = row["audio_name"], row["caption"] + self.mm_path_list.append(os.path.join(mm_root_path, audio_id)) + self.caption_list.append(process_caption(one_caption)) + + print(f'[!] collect {len(self.mm_path_list)} samples for training') + self.dataset_type_list = [dataset_type for _ in range(len(self.caption_list))] + + diff --git a/code/dataset/base_dataset.py b/code/dataset/base_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..7461e822c5bafafe81d951cc0710b73c2ec7fc38 --- /dev/null +++ b/code/dataset/base_dataset.py @@ -0,0 +1,55 @@ +# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import os +import torch +import numpy as np +import json +from torch.utils.data import Dataset +from tqdm import tqdm +import pandas as pd +from .utils import process_caption + + +class BaseDataset(Dataset): + """Dataset for supervised fine-tuning.""" + + def __init__(self, data_path: str, mm_root_path: str, embed_path: str, dataset_type: str): + super(BaseDataset, self).__init__() + self.embed_path = embed_path + self.mm_path_list, self.caption_list = [], [] + self.dataset_type_list = [] + + def __len__(self): # number of instances + return len(self.mm_path_list) + + def __getitem__(self, i): + with open(os.path.join(self.embed_path, str(os.path.basename(self.mm_path_list[i])) + '.npy'), 'rb') as f: + caption_embs = torch.from_numpy(np.load(f, allow_pickle=True)) # (num_clip_tokens, 768) + + return dict(mm_paths=self.mm_path_list[i], output_texts=self.caption_list[i], caption_embs=caption_embs, + dataset_types=self.dataset_type_list[i]) + + def collate(self, instances): + mm_paths, output_texts, caption_embs, dataset_types = tuple( + [instance[key] for instance in instances] for key in + ("mm_paths", "output_texts", "caption_embs", "dataset_types")) + return dict( + mm_paths=mm_paths, + output_texts=output_texts, + caption_embs=caption_embs, + dataset_types=dataset_types + ) + diff --git a/code/dataset/catalog.py b/code/dataset/catalog.py new file mode 100644 index 0000000000000000000000000000000000000000..9a8564ae0d07683fa716e5e15a5c4b9ea01abef7 --- /dev/null +++ b/code/dataset/catalog.py @@ -0,0 +1,125 @@ +import os + + +class DatasetCatalog: + def __init__(self): + # the following dataset utilized for encoding-side alignment learning + self.audiocap_enc = { + "target": "dataset.audiocap_dataset.AudioCapDataset", + "params": dict( + data_path="../data/T-X_pair_data/audiocap/audiocap.json", + mm_root_path="../data/T-X_pair_data/audiocap/audios", + embed_path="../data/embed/", + dataset_type="AudioToText", + ), + } + + self.webvid_enc = { + "target": "dataset.webvid_dataset.WebvidDataset", + "params": dict( + data_path="../data/T-X_pair_data/webvid/webvid.json", + mm_root_path="../data/T-X_pair_data/webvid/videos", + embed_path="../data/embed/", + dataset_type="VideoToText", + ), + } + + self.cc3m_enc = { + "target": "dataset.cc3m_dataset.CC3MDataset", + "params": dict( + data_path="../data/T-X_pair_data/cc3m/cc3m.json", + mm_root_path="../data/T-X_pair_data/cc3m/images", + embed_path="../data/embed/", + dataset_type="ImageToText", + ), + } + + # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # + + # the following dataset utilized for decoding-side alignment learning. 
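+        # Note: the attribute names defined here must exactly match the `dataset_name_list` entries in
+        # ./code/config/stage_*.yaml. The *_dec variants below reuse the same T-X pair data as the *_enc
+        # ones above, but with Text-to-X dataset types so that they drive the decoding-side alignment training.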
+ + self.audiocap_dec = { + "target": "dataset.audiocap_dataset.AudioCapDataset", + "params": dict( + data_path="../data/T-X_pair_data/audiocap/audiocap.json", + mm_root_path="../data/T-X_pair_data/audiocap/audios", + embed_path="../data/embed/", + dataset_type="TextToAudio", + ), + } + + self.webvid_dec = { + "target": "dataset.webvid_dataset.WebvidDataset", + "params": dict( + data_path="../data/T-X_pair_data/webvid/webvid.json", + mm_root_path="../data/T-X_pair_data/webvid/videos", + embed_path="../data/embed/", + dataset_type="TextToVideo", + ), + } + + self.cc3m_dec = { + "target": "dataset.cc3m_dataset.CC3MDataset", + "params": dict( + data_path="../data/T-X_pair_data/cc3m/cc3m.json", + mm_root_path="../data/T-X_pair_data/cc3m/images", + embed_path="../data/embed/", + dataset_type="TextToImage", + ), + } + + # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # + + # the following dataset utilized for instruction tuning, so they are instruction dataset. + self.audio_instruction = { + "target": "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset", + "params": dict( + data_path="../data/IT_data/T-T+X_data/audio_t2x.json", + embed_path="./embed/", + dataset_type="TextToAudio", + ), + } + + self.video_instruction = { + "target": "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset", + "params": dict( + data_path="../data/IT_data/T-T+X_data/video_t2x.json", + embed_path="./embed/", + dataset_type="TextToVideo", + ), + } + + self.image_instruction = { + "target": "dataset.T-T+X_instruction_dataset.T2XTInstructionDataset", + "params": dict( + data_path="../data/IT_data/T-T+X_data/image_t2x.json", + embed_path="./embed/", + dataset_type="TextToImage", + + ), + } + + self.llava_instruction = { + "target": "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset", + "params": dict( + data_path="../data/IT_data/T+X-T_data/llava/llava.json", + mm_root_path="../data/IT_data/T+X-T_data/llava/images", + dataset_type="ImageToText", + ), + } + + self.alpaca_instruction = { + "target": "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset", + "params": dict( + data_path="../data/IT_data/T+X-T_data/alpaca/alpaca.json", + dataset_type="TextToText", + ), + } + + self.videochat_instruction = { + "target": "dataset.T+X-T_instruction_dataset.TX2TInstructionDataset", + "params": dict( + data_path="../data/IT_data/T+X-T_data/videochat/videochat.json", + dataset_type="VideoToText", + ), + } diff --git a/code/dataset/cc3m_dataset.py b/code/dataset/cc3m_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..03f54e83229061abe6524a27a7179d443f78b1fb --- /dev/null +++ b/code/dataset/cc3m_dataset.py @@ -0,0 +1,45 @@ +# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import copy +import os +import torch +import numpy as np +import json +from .base_dataset import BaseDataset +from torch.utils.data import Dataset +from tqdm import tqdm +import pandas as pd +from .utils import process_caption + + +class CC3MDataset(BaseDataset): + """Dataset for supervised fine-tuning.""" + + def __init__(self, data_path: str, mm_root_path: str, embed_path: str, dataset_type: str): + super(CC3MDataset, self).__init__(data_path, mm_root_path, embed_path, dataset_type) + self.embed_path = embed_path + + print('Load CC3M dataset ...') + self.mm_path_list, self.caption_list = [], [] + with open(data_path, 'r', encoding='utf-8') as f: + data = json.load(f) + for row in tqdm(data, total=len(data)): + image_id, one_caption = row["image_name"], row["caption"] + self.mm_path_list.append(os.path.join(mm_root_path, image_id)) + self.caption_list.append(process_caption(one_caption)) + + print(f'[!] collect {len(self.mm_path_list)} samples for training') + self.dataset_type_list = [dataset_type for _ in range(len(self.caption_list))] + diff --git a/code/dataset/concat_dataset.py b/code/dataset/concat_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..eaeaa189d10a2c8e8f1b94bb830597a4727c6065 --- /dev/null +++ b/code/dataset/concat_dataset.py @@ -0,0 +1,38 @@ +from torch.utils.data import ConcatDataset, Dataset +from .catalog import DatasetCatalog +from .utils import instantiate_from_config + + +class MyConcatDataset(Dataset): + def __init__(self, dataset_name_list): + super(MyConcatDataset, self).__init__() + + _datasets = [] + + catalog = DatasetCatalog() + for dataset_idx, dataset_name in enumerate(dataset_name_list): + dataset_dict = getattr(catalog, dataset_name) + + target = dataset_dict['target'] + params = dataset_dict['params'] + print(target) + print(params) + dataset = instantiate_from_config(dict(target=target, params=params)) + + _datasets.append(dataset) + self.datasets = ConcatDataset(_datasets) + + def __len__(self): + return self.datasets.__len__() + + def __getitem__(self, item): + return self.datasets.__getitem__(item) + + def collate(self, instances): + data = {key: [] for key in instances[0].keys()} if instances else {} + + for instance in instances: + for key, value in instance.items(): + data[key].append(value) + + return data diff --git a/code/dataset/preprocess_dataset.py b/code/dataset/preprocess_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..6caec9c899b02847e4ca4a3ef4265bdf94f8ec3f --- /dev/null +++ b/code/dataset/preprocess_dataset.py @@ -0,0 +1,144 @@ +import json +import os.path + +from torch.utils.data import Dataset +from tqdm import tqdm +import pandas as pd +import re +import random +import numpy as np +import torch + + +def load_alpaca(data_path, sample_data=False, sample_numer=1000, save_dir=''): + """ + sample and process the alpaca dataset in to the following format: + [ + { + "image_name": "00000000000", + "output_modality": "text", + "conversation": [ + { + "from": "human", + "value": "Give three tips for staying healthy.", + "input_modality": "text" + }, + { + "from": "gpt", + "value": "1. Eat a balanced and nutritious diet: ...", + "caption": "", + "output_modality": "text" + } + ] + }, + ... 
+ ] + """ + with open(data_path, 'r') as f: + data = json.load(f) + print('the total instance is {}'.format(len(data))) + if sample_data and sample_numer > 0: + data = random.sample(data, sample_numer) + res = [] + for d in data: + _temp = dict() + _temp['image_name'] = '00000000000' + _temp['output_modality'] = 'text' + conversation = [] + + conversation.append( + {'from': 'human', + 'value': d['instruction'] + d['input'], + 'input_modality': 'text'} + ) + conversation.append( + {'from': 'gpt', + 'value': d['output'], + 'caption': '', + 'output_modality': 'text'} + ) + _temp['conversation'] = conversation + res.append(_temp) + if not os.path.exists(save_dir): + os.makedirs(save_dir) + save_path = os.path.join(save_dir, os.path.basename(data_path)) + with open(save_path, 'w', encoding='utf-8') as f: + json.dump(res, f, indent=4) + return res + + +def load_llava(data_path, sample_data=False, sample_numer=1000, save_dir=''): + """ + sample and process the llava instruction dataset into the following format: + [ + { + "image_name": "00000000000.jpg", + "output_modality": "text", + "conversation": [ + { + "from": "human", + "value": "Give three tips for staying healthy.", + "input_modality": "image" + }, + { + "from": "gpt", + "value": "1. Eat a balanced and nutritious diet: ...", + "caption": "", + "output_modality": "text" + } + ] + }, + ... + ] + """ + with open(data_path, 'r') as f: + data = json.load(f) + print('the total instance is {}'.format(len(data))) + if sample_data and sample_numer > 0: + res = random.sample(data, sample_numer) + else: + res = data + # res = data + save_path = os.path.join(save_dir, os.path.basename(data_path)) + for x in res: + i = 0 + x['output_modality'] = 'text' + for j in x['conversation']: + if j['from'] == 'gpt': + j['caption'] = '' + j['output_modality'] = 'text' + elif j['from'] == 'human': + if i == 0: + j['input_modality'] = 'image' + i += 1 + else: + j['input_modality'] = 'text' + with open(save_path, 'w', encoding='utf-8') as f: + json.dump(res, f, indent=4) + return res + + +def load_t2x(data_path): + with open(data_path, 'r', encoding='utf-8') as f: + data = json.load(f) + return data + + +if __name__ == '__main__': + save_dir = '../../data/IT_data/T+X-T_data' + res = [] + + # audios = load_t2x(os.path.join(save_dir, 'audio_t2x.json')) + # videos = load_t2x(os.path.join(save_dir, 'video_t2x.json')) + # images = load_t2x(os.path.join(save_dir, 'image_t2x.json')) + # sample_number = max(len(audios), len(videos), len(images)) + # + # print(sample_number) + sample_number = 1000 + + print('Load aplaca dataset ...') + text = load_alpaca('../../data/IT_data/T+X-T_data/alpaca/alpaca.json', False, sample_number, save_dir) + res.extend(text) + + print('Load llava dataset ...') + data = load_llava('../../data/IT_data/T+X-T_data/llava/llava.json', False, sample_number, save_dir) diff --git a/code/dataset/samplers.py b/code/dataset/samplers.py new file mode 100644 index 0000000000000000000000000000000000000000..cc1bb1aa72a65df314a2763468788c5090e5131a --- /dev/null +++ b/code/dataset/samplers.py @@ -0,0 +1,221 @@ +# coding=utf-8 +# Copyright (c) 2019, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""batch samplers that work with either random or sequential data samplers""" +import math +import os +import sys + +import torch +from torch.utils import data +import numpy as np + + +class RandomSampler(data.sampler.Sampler): + r""" + Based off of pytorch RandomSampler and DistributedSampler. Essentially a RandomSampler, + but this class lets the user set an epoch like DistributedSampler + Samples elements randomly. If without replacement, then sample from a shuffled dataset. + If with replacement, then user can specify ``num_samples`` to draw. + Arguments: + data_source (Dataset): dataset to sample from + num_samples (int): number of samples to draw, default=len(dataset) + replacement (bool): samples are drawn with replacement if ``True``, default=False + """ + + def __init__(self, data_source, replacement=False, num_samples=None): + super(RandomSampler, self).__init__(data_source) + self.data_source = data_source + self.replacement = replacement + self._num_samples = num_samples + self.epoch = -1 + + if self._num_samples is not None and replacement is False: + raise ValueError("With replacement=False, num_samples should not be specified, " + "since a random permute will be performed.") + + if not isinstance(self.num_samples, int) or self.num_samples <= 0: + raise ValueError("num_samples should be a positive integer " + "value, but got num_samples={}".format(self.num_samples)) + if not isinstance(self.replacement, bool): + raise ValueError("replacement should be a boolean value, but got " + "replacement={}".format(self.replacement)) + + @property + def num_samples(self): + # dataset size might change at runtime + if self._num_samples is None: + return len(self.data_source) + return self._num_samples + + def __iter__(self): + n = len(self.data_source) + g = torch.Generator() + if self.epoch >= 0: + g.manual_seed(self.epoch) + if self.replacement: + for _ in range(self.num_samples // 32): + yield from torch.randint(high=n, size=(32,), dtype=torch.int64, generator=g).tolist() + yield from torch.randint(high=n, size=(self.num_samples % 32,), dtype=torch.int64, + generator=g).tolist() + else: + yield from torch.randperm(n, generator=self.generator).tolist() + + def __len__(self): + return self.num_samples + + def set_epoch(self, epoch): + self.epoch = epoch + + +class DistributedSequentialSampler(data.sampler.Sampler): + def __init__(self, num_samples, train_iters, batch_size, rank=-1, world_size=2): + super().__init__(num_samples) + if rank == -1: + rank = 0 + world_size = 1 + self.num_samples = num_samples + self.rank = rank + self.world_size = world_size + self.start_iter = 0 + self.train_iters = train_iters + self.batch_size = batch_size + self.batch_bias = [i * (num_samples // batch_size) for i in range(batch_size)] + + def __iter__(self): + for idx in range(self.start_iter, self.train_iters * 10): + batch = [(idx + bias) % self.num_samples for bias in self.batch_bias] + tbatch = self._batch(batch) + yield tbatch + + def __len__(self): + return self.train_iters + + def _batch(self, batch): + """extracts samples only pertaining to this worker's batch""" + start = 
self.rank*self.batch_size//self.world_size + end = (self.rank+1)*self.batch_size//self.world_size + return batch[start:end] + + +class DistributedBatchSampler(data.sampler.BatchSampler): + """ + similar to normal implementation of distributed sampler, except implementation is at the + batch sampler level, instead of just the sampler level. This allows wrapping of arbitrary + data samplers (sequential, random, WeightedRandomSampler, etc.) with this batch sampler. + """ + def __init__(self, sampler, batch_size, drop_last, rank=-1, world_size=2, wrap_last=False, gradient_accumulation_steps=None): + super(DistributedBatchSampler, self).__init__(sampler, batch_size, drop_last) + if rank == -1: + assert False, 'should not be here' + self.rank = rank + self.world_size = world_size + self.sampler.wrap_around = 0 + self.wrap_around = 0 + self.wrap_last = wrap_last + self.start_iter = 0 + self.effective_batch_size = batch_size if gradient_accumulation_steps is None else batch_size * gradient_accumulation_steps + + def __iter__(self): + batch = [] + i = 0 + for idx in self.data_iterator(self.sampler, wrap_around=False): + batch.append(idx) + if len(batch) == self.batch_size: + tbatch = self._batch(batch) + if i >= self.start_iter * self.effective_batch_size: + yield tbatch + self.start_iter = 0 + i += len(batch) + batch = [] + batch_len = len(batch) + if batch_len > 0 and not self.drop_last: + if self.wrap_last: + self.sampler.wrap_around -= (self.batch_size) + self.wrap_around += (len(batch)) + self.wrap_around %= self.batch_size + yield self._batch(batch) + if self.wrap_last: + self.sampler.wrap_around += self.batch_size + + def data_iterator(self, _iter, wrap_around=False): + """iterates through data and handles wrap around""" + for i, idx in enumerate(_iter): + if i < self.wrap_around%self.batch_size: + continue + if wrap_around: + self.wrap_around += 1 + self.wrap_around %= self.batch_size + yield idx + + def _batch(self, batch): + """extracts samples only pertaining to this worker's batch""" + start = self.rank*self.batch_size//self.world_size + end = (self.rank+1)*self.batch_size//self.world_size + return batch[start:end] + + +class DistributedMultiDatasetBatchSampler(data.sampler.BatchSampler): + """ + This is a modality-blended batch sampler which allows to sample a batch data from different dataset alternatively. 
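+    Each yielded batch contains indices from a single sub-dataset only; batches cycle over the sub-datasets
+    in turn, and smaller datasets are re-sampled (their iterators are restarted) until roughly
+    `largest_dataset_size * number_of_datasets` samples have been drawn per epoch.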
+ """ + def __init__(self, sampler, batch_size, dataset, drop_last, rank=-1, world_size=2, wrap_last=False, gradient_accumulation_steps=None): + super(DistributedMultiDatasetBatchSampler, self).__init__(sampler, batch_size, drop_last) + if rank == -1: + assert False, 'should not be here' + self.rank = rank + self.world_size = world_size + self.wrap_last = wrap_last + self.drop_last = drop_last + self.gradient_accumulation_steps = gradient_accumulation_steps + self.dataset = dataset + self.batch_size = batch_size + self.number_of_datasets = len(dataset.datasets.datasets) + self.largest_dataset_size = max([_cur_dataset.__len__() for _cur_dataset in dataset.datasets.datasets]) + + def __iter__(self): + samplers_list = [] + sampler_iterators = [] + for dataset_idx in range(self.number_of_datasets): + cur_dataset = self.dataset.datasets.datasets[dataset_idx] + sampler = torch.utils.data.RandomSampler(cur_dataset) + batch_sampler = DistributedBatchSampler(sampler, self.batch_size, self.drop_last, self.rank, + self.world_size, self.wrap_last, self.gradient_accumulation_steps) + samplers_list.append(batch_sampler) + cur_sampler_iterator = batch_sampler.__iter__() + sampler_iterators.append(cur_sampler_iterator) + + push_index_val = [0] + self.dataset.datasets.cumulative_sizes[:-1] + step = self.batch_size * self.number_of_datasets + samples_to_grab = self.batch_size + # for this case we want to get all samples in dataset, this force us to resample from the smaller datasets + epoch_samples = self.largest_dataset_size * self.number_of_datasets + + for _ in range(0, epoch_samples, step): + for i in range(self.number_of_datasets): + # for j in range(self.world_size): + cur_batch_sampler = sampler_iterators[i] + try: + cur_sample_org = cur_batch_sampler.__next__() + cur_samples = [x + push_index_val[i] for x in cur_sample_org] + yield cur_samples + except StopIteration: + # got to the end of iterator - restart the iterator and continue to get samples + # until reaching "epoch_samples" + sampler_iterators[i] = samplers_list[i].__iter__() + cur_batch_sampler = sampler_iterators[i] + cur_sample_org = cur_batch_sampler.__next__() + cur_samples = [x + push_index_val[i] for x in cur_sample_org] + yield cur_samples + diff --git a/code/dataset/utils.py b/code/dataset/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..4ce1ac14375dec2eb1f34d378deff0fd958c84a9 --- /dev/null +++ b/code/dataset/utils.py @@ -0,0 +1,37 @@ +from header import * +import importlib + + +def process_caption(caption): + caption = re.sub( + r"([\"()*#:;~])", + " ", + caption.lower(), + ) + caption = re.sub( + r"\s{2,}", + " ", + caption, + ) + caption = caption.rstrip("\n") + caption = caption.strip(" ") + + return caption + + +def instantiate_from_config(config): + if not "target" in config: + if config == '__is_first_stage__': + return None + elif config == "__is_unconditional__": + return None + raise KeyError("Expected key `target` to instantiate.") + return get_obj_from_str(config["target"])(**config.get("params", dict())) + + +def get_obj_from_str(string, reload=False): + module, cls = string.rsplit(".", 1) + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package=None), cls) diff --git a/code/dataset/webvid_dataset.py b/code/dataset/webvid_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..3d50d8c62993df3e1a0ad365063c203b431d0c7a --- /dev/null +++ b/code/dataset/webvid_dataset.py @@ -0,0 
+1,44 @@ +# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import os +import json +import numpy as np +from torch.utils.data import Dataset +from .base_dataset import BaseDataset +from tqdm import tqdm +import pandas as pd +from .utils import process_caption +import torch + + +class WebvidDataset(BaseDataset): + """webvid Dataset with video-text pairs.""" + + def __init__(self, data_path: str, mm_root_path: str, embed_path: str, dataset_type: str): + super(WebvidDataset, self).__init__(data_path, mm_root_path, embed_path, dataset_type) + self.embed_path = embed_path + + print('Load WebVid dataset ...') + self.mm_path_list, self.caption_list = [], [] + with open(data_path, 'r', encoding='utf-8') as f: + data = json.load(f) + for row in tqdm(data, total=len(data)): + video_id, one_caption = row["video_name"], row["caption"] + self.mm_path_list.append(os.path.join(mm_root_path, video_id)) + self.caption_list.append(process_caption(one_caption)) + + print(f'[!] collect {len(self.mm_path_list)} samples for training') + self.dataset_type_list = [dataset_type for _ in range(len(self.caption_list))] diff --git a/code/demo_app.py b/code/demo_app.py new file mode 100644 index 0000000000000000000000000000000000000000..3d3e9ce7121df09ddc6f87b7ea04a5dcdef80f72 --- /dev/null +++ b/code/demo_app.py @@ -0,0 +1,516 @@ +from transformers import AutoModel, AutoTokenizer +from copy import deepcopy +import os +import ipdb +import gradio as gr +import mdtex2html +from model.anyToImageVideoAudio import NextGPTModel +import torch +import json +import tempfile +from PIL import Image +import scipy +from config import * +import imageio +import argparse +import re + +# init the model + +parser = argparse.ArgumentParser(description='train parameters') +parser.add_argument('--model', type=str, default='nextgpt') +parser.add_argument('--nextgpt_ckpt_path', type=str) # the delta parameters trained in each stages +parser.add_argument('--stage', type=int, default=3) +args = parser.parse_args() +args = vars(args) +args.update(load_config(args)) +model = NextGPTModel(**args) +delta_ckpt = torch.load(os.path.join(args['nextgpt_ckpt_path'], f'pytorch_model.pt'), map_location=torch.device('cpu')) +model.load_state_dict(delta_ckpt, strict=False) +model = model.eval().half().cuda() +print(f'[!] 
init the 7b model over ...') + +g_cuda = torch.Generator(device='cuda').manual_seed(13) + +filter_value = -float('Inf') +min_word_tokens = 10 +gen_scale_factor = 4.0 +stops_id = [[835]] +ENCOUNTERS = 1 +load_sd = True +generator = g_cuda + +max_num_imgs = 1 +max_num_vids = 1 +height = 320 +width = 576 + +max_num_auds = 1 +max_length = 246 + +"""Override Chatbot.postprocess""" + + +def postprocess(self, y): + if y is None: + return [] + for i, (message, response) in enumerate(y): + y[i] = ( + None if message is None else mdtex2html.convert((message)), + None if response is None else mdtex2html.convert(response), + ) + return y + + +gr.Chatbot.postprocess = postprocess + + +def parse_text(text, image_path, video_path, audio_path): + """copy from https://github.com/GaiZhenbiao/ChuanhuChatGPT/""" + outputs = text + lines = text.split("\n") + lines = [line for line in lines if line != ""] + count = 0 + for i, line in enumerate(lines): + if "```" in line: + count += 1 + items = line.split('`') + if count % 2 == 1: + lines[i] = f'
<pre><code class="language-{items[-1]}">'
+            else:
+                lines[i] = f'<br></code></pre>'
+        else:
+            if i > 0:
+                if count % 2 == 1:
+                    line = line.replace("`", "\`")
+                    line = line.replace("<", "&lt;")
+                    line = line.replace(">", "&gt;")
+                    line = line.replace(" ", "&nbsp;")
+                    line = line.replace("*", "&ast;")
+                    line = line.replace("_", "&lowbar;")
+                    line = line.replace("-", "&#45;")
+                    line = line.replace(".", "&#46;")
+                    line = line.replace("!", "&#33;")
+                    line = line.replace("(", "&#40;")
+                    line = line.replace(")", "&#41;")
+                    line = line.replace("$", "&#36;")
+                lines[i] = "<br>" + line
+    text = "".join(lines) + "<br>"
+    res_text = ''
+    split_text = re.split(r' <|> ', text)
+    image_path_list, video_path_list, audio_path_list = [], [], []
+    for st in split_text:
+        if st.startswith('<Image>'):
+            pattern = r'Image>(.*?)<\/Image'
+            matches = re.findall(pattern, text)
+            for m in matches:
+                image_path_list.append(m)
+        elif st.startswith('