diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index b865f6c33d..5cab55b826 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -57,50 +57,50 @@ body: description: | Your issue will be replied to more quickly if you can figure out the right person to tag with @. If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**. - + All issues are read by one of the core maintainers, so if you don't know who to tag, just leave this blank and a core maintainer will ping the right person. - + Please tag a maximum of 2 people. Questions on DiffusionPipeline (Saving, Loading, From pretrained, ...): Questions on pipelines: - - Stable Diffusion @yiyixuxu @DN6 @sayakpaul - - Stable Diffusion XL @yiyixuxu @sayakpaul @DN6 - - Kandinsky @yiyixuxu - - ControlNet @sayakpaul @yiyixuxu @DN6 - - T2I Adapter @sayakpaul @yiyixuxu @DN6 - - IF @DN6 - - Text-to-Video / Video-to-Video @DN6 @sayakpaul - - Wuerstchen @DN6 + - Stable Diffusion @yiyixuxu @DN6 @sayakpaul + - Stable Diffusion XL @yiyixuxu @sayakpaul @DN6 + - Kandinsky @yiyixuxu + - ControlNet @sayakpaul @yiyixuxu @DN6 + - T2I Adapter @sayakpaul @yiyixuxu @DN6 + - IF @DN6 + - Text-to-Video / Video-to-Video @DN6 @sayakpaul + - Wuerstchen @DN6 - Other: @yiyixuxu @DN6 Questions on models: - - UNet @DN6 @yiyixuxu @sayakpaul - - VAE @sayakpaul @DN6 @yiyixuxu - - Transformers/Attention @DN6 @yiyixuxu @sayakpaul @DN6 + - UNet @DN6 @yiyixuxu @sayakpaul + - VAE @sayakpaul @DN6 @yiyixuxu + - Transformers/Attention @DN6 @yiyixuxu @sayakpaul @DN6 - Questions on Schedulers: @yiyixuxu + Questions on Schedulers: @yiyixuxu - Questions on LoRA: @sayakpaul + Questions on LoRA: @sayakpaul - Questions on Textual Inversion: @sayakpaul + Questions on Textual Inversion: @sayakpaul - Questions on Training: - - DreamBooth @sayakpaul - - Text-to-Image Fine-tuning @sayakpaul - - Textual Inversion @sayakpaul - - ControlNet @sayakpaul + Questions on Training: + - DreamBooth @sayakpaul + - Text-to-Image Fine-tuning @sayakpaul + - Textual Inversion @sayakpaul + - ControlNet @sayakpaul - Questions on Tests: @DN6 @sayakpaul @yiyixuxu + Questions on Tests: @DN6 @sayakpaul @yiyixuxu Questions on Documentation: @stevhliu Questions on JAX- and MPS-related things: @pcuenca - Questions on audio pipelines: @DN6 - + Questions on audio pipelines: @DN6 + + - placeholder: "@Username ..." diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index a0337eaaaa..12d3e03d0d 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -38,9 +38,9 @@ members/contributors who may be interested in your PR. Core library: -- Schedulers: @yiyixuxu +- Schedulers: @yiyixuxu - Pipelines: @sayakpaul @yiyixuxu @DN6 -- Training examples: @sayakpaul +- Training examples: @sayakpaul - Docs: @stevhliu and @sayakpaul - JAX and MPS: @pcuenca - Audio: @sanchit-gandhi diff --git a/.github/workflows/mirror_community_pipeline.yml b/.github/workflows/mirror_community_pipeline.yml index 8886df8510..a20c95ba95 100644 --- a/.github/workflows/mirror_community_pipeline.yml +++ b/.github/workflows/mirror_community_pipeline.yml @@ -36,7 +36,7 @@ jobs: # If ref is 'refs/heads/main' => set 'main' # Else it must be a tag => set {tag} - name: Set checkout_ref and path_in_repo - run: | + run: | if [ "${{ github.event_name }}" == "workflow_dispatch" ]; then if [ -z "${{ github.event.inputs.ref }}" ]; then echo "Error: Missing ref input" diff --git a/.github/workflows/notify_slack_about_release.yml b/.github/workflows/notify_slack_about_release.yml index 95f2d0f917..f33e917f09 100644 --- a/.github/workflows/notify_slack_about_release.yml +++ b/.github/workflows/notify_slack_about_release.yml @@ -11,12 +11,12 @@ jobs: steps: - uses: actions/checkout@v3 - + - name: Setup Python uses: actions/setup-python@v4 with: python-version: '3.8' - + - name: Notify Slack about the release env: SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} diff --git a/.github/workflows/pr_dependency_test.yml b/.github/workflows/pr_dependency_test.yml index f21f09ef87..c4bd86112f 100644 --- a/.github/workflows/pr_dependency_test.yml +++ b/.github/workflows/pr_dependency_test.yml @@ -33,4 +33,3 @@ jobs: run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" pytest tests/others/test_dependencies.py - \ No newline at end of file diff --git a/.github/workflows/pr_test_peft_backend.yml b/.github/workflows/pr_test_peft_backend.yml index 2e2f2201e7..9c5492a744 100644 --- a/.github/workflows/pr_test_peft_backend.yml +++ b/.github/workflows/pr_test_peft_backend.yml @@ -115,14 +115,14 @@ jobs: -s -v \ --make-reports=tests_models_lora_${{ matrix.config.report }} \ tests/models/ -k "lora" - - + + - name: Failure short reports if: ${{ failure() }} run: | cat reports/tests_${{ matrix.config.report }}_failures_short.txt cat reports/tests_models_lora_${{ matrix.config.report }}_failures_short.txt - + - name: Test suite reports artifacts if: ${{ always() }} uses: actions/upload-artifact@v2 diff --git a/.github/workflows/pypi_publish.yaml b/.github/workflows/pypi_publish.yaml index 54e9afe6d9..6b09f60a35 100644 --- a/.github/workflows/pypi_publish.yaml +++ b/.github/workflows/pypi_publish.yaml @@ -29,7 +29,7 @@ jobs: LATEST_BRANCH=$(python utils/fetch_latest_release_branch.py) echo "Latest branch: $LATEST_BRANCH" echo "latest_branch=$LATEST_BRANCH" >> $GITHUB_ENV - + - name: Set latest branch output id: set_latest_branch run: echo "::set-output name=latest_branch::${{ env.latest_branch }}" @@ -43,27 +43,27 @@ jobs: uses: actions/checkout@v3 with: ref: ${{ needs.find-and-checkout-latest-branch.outputs.latest_branch }} - + - name: Setup Python uses: actions/setup-python@v4 with: python-version: "3.8" - + - name: Install dependencies run: | python -m pip install --upgrade pip pip install -U setuptools wheel twine pip install -U torch --index-url https://download.pytorch.org/whl/cpu pip install -U transformers - + - name: Build the dist files run: python setup.py bdist_wheel && python setup.py sdist - + - name: Publish to the test PyPI env: TWINE_USERNAME: ${{ secrets.TEST_PYPI_USERNAME }} TWINE_PASSWORD: ${{ secrets.TEST_PYPI_PASSWORD }} - run: twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/ + run: twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/ - name: Test installing diffusers and importing run: | diff --git a/.github/workflows/run_tests_from_a_pr.yml b/.github/workflows/run_tests_from_a_pr.yml index 782c0db417..75cb496362 100644 --- a/.github/workflows/run_tests_from_a_pr.yml +++ b/.github/workflows/run_tests_from_a_pr.yml @@ -7,7 +7,7 @@ on: default: 'diffusers/diffusers-pytorch-cuda' description: 'Name of the Docker image' required: true - branch: + branch: description: 'PR Branch to test on' required: true test: @@ -34,19 +34,19 @@ jobs: steps: - name: Validate test files input id: validate_test_files - env: + env: PY_TEST: ${{ github.event.inputs.test }} run: | if [[ ! "$PY_TEST" =~ ^tests/ ]]; then echo "Error: The input string must start with 'tests/'." exit 1 fi - + if [[ ! "$PY_TEST" =~ ^tests/(models|pipelines) ]]; then echo "Error: The input string must contain either 'models' or 'pipelines' after 'tests/'." exit 1 fi - + if [[ "$PY_TEST" == *";"* ]]; then echo "Error: The input string must not contain ';'." exit 1 @@ -60,14 +60,14 @@ jobs: repository: ${{ github.event.pull_request.head.repo.full_name }} - - name: Install pytest - run: | + - name: Install pytest + run: | python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH" python -m uv pip install -e [quality,test] python -m uv pip install peft - + - name: Run tests - env: + env: PY_TEST: ${{ github.event.inputs.test }} run: | pytest "$PY_TEST" \ No newline at end of file diff --git a/docs/source/en/api/models/pixart_transformer2d.md b/docs/source/en/api/models/pixart_transformer2d.md index 5ddfabc618..1d392f4e7c 100644 --- a/docs/source/en/api/models/pixart_transformer2d.md +++ b/docs/source/en/api/models/pixart_transformer2d.md @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # PixArtTransformer2DModel -A Transformer model for image-like data from [PixArt-Alpha](https://huggingface.co/papers/2310.00426) and [PixArt-Sigma](https://huggingface.co/papers/2403.04692). +A Transformer model for image-like data from [PixArt-Alpha](https://huggingface.co/papers/2310.00426) and [PixArt-Sigma](https://huggingface.co/papers/2403.04692). ## PixArtTransformer2DModel diff --git a/docs/source/en/api/models/sd3_transformer2d.md b/docs/source/en/api/models/sd3_transformer2d.md index 1f599b93e3..feef87db3a 100644 --- a/docs/source/en/api/models/sd3_transformer2d.md +++ b/docs/source/en/api/models/sd3_transformer2d.md @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # SD3 Transformer Model -The Transformer model introduced in [Stable Diffusion 3](https://hf.co/papers/2403.03206). Its novelty lies in the MMDiT transformer block. +The Transformer model introduced in [Stable Diffusion 3](https://hf.co/papers/2403.03206). Its novelty lies in the MMDiT transformer block. ## SD3Transformer2DModel diff --git a/docs/source/en/api/pipelines/animatediff.md b/docs/source/en/api/pipelines/animatediff.md index b21650aa2a..aa1264d8cb 100644 --- a/docs/source/en/api/pipelines/animatediff.md +++ b/docs/source/en/api/pipelines/animatediff.md @@ -78,7 +78,6 @@ output = pipe( ) frames = output.frames[0] export_to_gif(frames, "animation.gif") - ``` Here are some sample outputs: @@ -303,7 +302,6 @@ output = pipe( ) frames = output.frames[0] export_to_gif(frames, "animation.gif") - ``` @@ -378,7 +376,6 @@ output = pipe( ) frames = output.frames[0] export_to_gif(frames, "animation.gif") - ```
diff --git a/docs/source/en/api/pipelines/audioldm2.md b/docs/source/en/api/pipelines/audioldm2.md index ac4459c607..9f2b7529d4 100644 --- a/docs/source/en/api/pipelines/audioldm2.md +++ b/docs/source/en/api/pipelines/audioldm2.md @@ -20,8 +20,8 @@ The abstract of the paper is the following: *Although audio generation shares commonalities across different types of audio, such as speech, music, and sound effects, designing models for each type requires careful consideration of specific objectives and biases that can significantly differ from those of other types. To bring us closer to a unified perspective of audio generation, this paper proposes a framework that utilizes the same learning method for speech, music, and sound effect generation. Our framework introduces a general representation of audio, called "language of audio" (LOA). Any audio can be translated into LOA based on AudioMAE, a self-supervised pre-trained representation learning model. In the generation process, we translate any modalities into LOA by using a GPT-2 model, and we perform self-supervised audio generation learning with a latent diffusion model conditioned on LOA. The proposed framework naturally brings advantages such as in-context learning abilities and reusable self-supervised pretrained AudioMAE and latent diffusion models. Experiments on the major benchmarks of text-to-audio, text-to-music, and text-to-speech demonstrate state-of-the-art or competitive performance against previous approaches. Our code, pretrained model, and demo are available at [this https URL](https://audioldm.github.io/audioldm2).* -This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-gandhi) and [Nguyễn Công Tú Anh](https://github.com/tuanh123789). The original codebase can be -found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2). +This pipeline was contributed by [sanchit-gandhi](https://huggingface.co/sanchit-gandhi) and [Nguyễn Công Tú Anh](https://github.com/tuanh123789). The original codebase can be +found at [haoheliu/audioldm2](https://github.com/haoheliu/audioldm2). ## Tips diff --git a/docs/source/en/api/pipelines/blip_diffusion.md b/docs/source/en/api/pipelines/blip_diffusion.md index ada47ca8c4..b4504f6d6b 100644 --- a/docs/source/en/api/pipelines/blip_diffusion.md +++ b/docs/source/en/api/pipelines/blip_diffusion.md @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # BLIP-Diffusion -BLIP-Diffusion was proposed in [BLIP-Diffusion: Pre-trained Subject Representation for Controllable Text-to-Image Generation and Editing](https://arxiv.org/abs/2305.14720). It enables zero-shot subject-driven generation and control-guided zero-shot generation. +BLIP-Diffusion was proposed in [BLIP-Diffusion: Pre-trained Subject Representation for Controllable Text-to-Image Generation and Editing](https://arxiv.org/abs/2305.14720). It enables zero-shot subject-driven generation and control-guided zero-shot generation. The abstract from the paper is: diff --git a/docs/source/en/api/pipelines/hunyuandit.md b/docs/source/en/api/pipelines/hunyuandit.md index 607d0d9542..c63c3bf0a9 100644 --- a/docs/source/en/api/pipelines/hunyuandit.md +++ b/docs/source/en/api/pipelines/hunyuandit.md @@ -36,7 +36,7 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.m ## Optimization -You can optimize the pipeline's runtime and memory consumption with torch.compile and feed-forward chunking. To learn about other optimization methods, check out the [Speed up inference](../../optimization/fp16) and [Reduce memory usage](../../optimization/memory) guides. +You can optimize the pipeline's runtime and memory consumption with torch.compile and feed-forward chunking. To learn about other optimization methods, check out the [Speed up inference](../../optimization/fp16) and [Reduce memory usage](../../optimization/memory) guides. ### Inference @@ -46,7 +46,7 @@ First, load the pipeline: ```python from diffusers import HunyuanDiTPipeline -import torch +import torch pipeline = HunyuanDiTPipeline.from_pretrained( "Tencent-Hunyuan/HunyuanDiT-Diffusers", torch_dtype=torch.float16 @@ -78,7 +78,7 @@ Without torch.compile(): Average inference time: 20.570 seconds. ### Memory optimization -By loading the T5 text encoder in 8 bits, you can run the pipeline in just under 6 GBs of GPU VRAM. Refer to [this script](https://gist.github.com/sayakpaul/3154605f6af05b98a41081aaba5ca43e) for details. +By loading the T5 text encoder in 8 bits, you can run the pipeline in just under 6 GBs of GPU VRAM. Refer to [this script](https://gist.github.com/sayakpaul/3154605f6af05b98a41081aaba5ca43e) for details. Furthermore, you can use the [`~HunyuanDiT2DModel.enable_forward_chunking`] method to reduce memory usage. Feed-forward chunking runs the feed-forward layers in a transformer block in a loop instead of all at once. This gives you a trade-off between memory consumption and inference runtime. @@ -92,4 +92,4 @@ Furthermore, you can use the [`~HunyuanDiT2DModel.enable_forward_chunking`] meth [[autodoc]] HunyuanDiTPipeline - all - __call__ - + diff --git a/docs/source/en/api/pipelines/stable_diffusion/k_diffusion.md b/docs/source/en/api/pipelines/stable_diffusion/k_diffusion.md index 07e34bd4d3..77e77f8ede 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/k_diffusion.md +++ b/docs/source/en/api/pipelines/stable_diffusion/k_diffusion.md @@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License. # K-Diffusion -[k-diffusion](https://github.com/crowsonkb/k-diffusion) is a popular library created by [Katherine Crowson](https://github.com/crowsonkb/). We provide `StableDiffusionKDiffusionPipeline` and `StableDiffusionXLKDiffusionPipeline` that allow you to run Stable DIffusion with samplers from k-diffusion. +[k-diffusion](https://github.com/crowsonkb/k-diffusion) is a popular library created by [Katherine Crowson](https://github.com/crowsonkb/). We provide `StableDiffusionKDiffusionPipeline` and `StableDiffusionXLKDiffusionPipeline` that allow you to run Stable DIffusion with samplers from k-diffusion. Note that most the samplers from k-diffusion are implemented in Diffusers and we recommend using existing schedulers. You can find a mapping between k-diffusion samplers and schedulers in Diffusers [here](https://huggingface.co/docs/diffusers/api/schedulers/overview) diff --git a/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.md b/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.md index 64cfdde54b..23830462c2 100644 --- a/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.md +++ b/docs/source/en/api/pipelines/stable_diffusion/ldm3d_diffusion.md @@ -12,11 +12,11 @@ specific language governing permissions and limitations under the License. # Text-to-(RGB, depth) -LDM3D was proposed in [LDM3D: Latent Diffusion Model for 3D](https://huggingface.co/papers/2305.10853) by Gabriela Ben Melech Stan, Diana Wofk, Scottie Fox, Alex Redden, Will Saxton, Jean Yu, Estelle Aflalo, Shao-Yen Tseng, Fabio Nonato, Matthias Muller, and Vasudev Lal. LDM3D generates an image and a depth map from a given text prompt unlike the existing text-to-image diffusion models such as [Stable Diffusion](./overview) which only generates an image. With almost the same number of parameters, LDM3D achieves to create a latent space that can compress both the RGB images and the depth maps. +LDM3D was proposed in [LDM3D: Latent Diffusion Model for 3D](https://huggingface.co/papers/2305.10853) by Gabriela Ben Melech Stan, Diana Wofk, Scottie Fox, Alex Redden, Will Saxton, Jean Yu, Estelle Aflalo, Shao-Yen Tseng, Fabio Nonato, Matthias Muller, and Vasudev Lal. LDM3D generates an image and a depth map from a given text prompt unlike the existing text-to-image diffusion models such as [Stable Diffusion](./overview) which only generates an image. With almost the same number of parameters, LDM3D achieves to create a latent space that can compress both the RGB images and the depth maps. Two checkpoints are available for use: - [ldm3d-original](https://huggingface.co/Intel/ldm3d). The original checkpoint used in the [paper](https://arxiv.org/pdf/2305.10853.pdf) -- [ldm3d-4c](https://huggingface.co/Intel/ldm3d-4c). The new version of LDM3D using 4 channels inputs instead of 6-channels inputs and finetuned on higher resolution images. +- [ldm3d-4c](https://huggingface.co/Intel/ldm3d-4c). The new version of LDM3D using 4 channels inputs instead of 6-channels inputs and finetuned on higher resolution images. The abstract from the paper is: @@ -44,7 +44,7 @@ Make sure to check out the Stable Diffusion [Tips](overview#tips) section to lea # Upscaler -[LDM3D-VR](https://arxiv.org/pdf/2311.03226.pdf) is an extended version of LDM3D. +[LDM3D-VR](https://arxiv.org/pdf/2311.03226.pdf) is an extended version of LDM3D. The abstract from the paper is: *Latent diffusion models have proven to be state-of-the-art in the creation and manipulation of visual outputs. However, as far as we know, the generation of depth maps jointly with RGB is still limited. We introduce LDM3D-VR, a suite of diffusion models targeting virtual reality development that includes LDM3D-pano and LDM3D-SR. These models enable the generation of panoramic RGBD based on textual prompts and the upscaling of low-resolution inputs to high-resolution RGBD, respectively. Our models are fine-tuned from existing pretrained models on datasets containing panoramic/high-resolution RGB images, depth maps and captions. Both models are evaluated in comparison to existing related methods* diff --git a/docs/source/en/api/pipelines/text_to_video_zero.md b/docs/source/en/api/pipelines/text_to_video_zero.md index 1f8688a722..375592bb34 100644 --- a/docs/source/en/api/pipelines/text_to_video_zero.md +++ b/docs/source/en/api/pipelines/text_to_video_zero.md @@ -155,28 +155,28 @@ To generate a video from prompt with additional pose control imageio.mimsave("video.mp4", result, fps=4) ``` - #### SDXL Support - + Since our attention processor also works with SDXL, it can be utilized to generate a video from prompt using ControlNet models powered by SDXL: ```python import torch from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor - + controlnet_model_id = 'thibaud/controlnet-openpose-sdxl-1.0' model_id = 'stabilityai/stable-diffusion-xl-base-1.0' - + controlnet = ControlNetModel.from_pretrained(controlnet_model_id, torch_dtype=torch.float16) pipe = StableDiffusionControlNetPipeline.from_pretrained( model_id, controlnet=controlnet, torch_dtype=torch.float16 ).to('cuda') - + # Set the attention processor pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2)) - + # fix latents for all frames latents = torch.randn((1, 4, 128, 128), device="cuda", dtype=torch.float16).repeat(len(pose_images), 1, 1, 1) - + prompt = "Darth Vader dancing in a desert" result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images imageio.mimsave("video.mp4", result, fps=4) diff --git a/docs/source/en/api/schedulers/tcd.md b/docs/source/en/api/schedulers/tcd.md index 3df7390391..27fc111d64 100644 --- a/docs/source/en/api/schedulers/tcd.md +++ b/docs/source/en/api/schedulers/tcd.md @@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o specific language governing permissions and limitations under the License. --> -# TCDScheduler +# TCDScheduler [Trajectory Consistency Distillation](https://huggingface.co/papers/2402.19159) by Jianbin Zheng, Minghui Hu, Zhongyi Fan, Chaoyue Wang, Changxing Ding, Dacheng Tao and Tat-Jen Cham introduced a Strategic Stochastic Sampling (Algorithm 4) that is capable of generating good samples in a small number of steps. Distinguishing it as an advanced iteration of the multistep scheduler (Algorithm 1) in the [Consistency Models](https://huggingface.co/papers/2303.01469), Strategic Stochastic Sampling specifically tailored for the trajectory consistency function. diff --git a/docs/source/en/training/adapt_a_model.md b/docs/source/en/training/adapt_a_model.md index 57bc1a37e0..f3429d8c24 100644 --- a/docs/source/en/training/adapt_a_model.md +++ b/docs/source/en/training/adapt_a_model.md @@ -26,7 +26,7 @@ pipeline.unet.config["in_channels"] 9 ``` -To adapt your text-to-image model for inpainting, you'll need to change the number of `in_channels` from 4 to 9. +To adapt your text-to-image model for inpainting, you'll need to change the number of `in_channels` from 4 to 9. Initialize a [`UNet2DConditionModel`] with the pretrained text-to-image model weights, and change `in_channels` to 9. Changing the number of `in_channels` means you need to set `ignore_mismatched_sizes=True` and `low_cpu_mem_usage=False` to avoid a size mismatch error because the shape is different now. diff --git a/docs/source/en/training/create_dataset.md b/docs/source/en/training/create_dataset.md index f215d3eb2c..0ec521f01c 100644 --- a/docs/source/en/training/create_dataset.md +++ b/docs/source/en/training/create_dataset.md @@ -9,7 +9,7 @@ This guide will show you two ways to create a dataset to finetune on: -💡 Learn more about how to create an image dataset for training in the [Create an image dataset](https://huggingface.co/docs/datasets/image_dataset) guide. +💡 Learn more about how to create an image dataset for training in the [Create an image dataset](https://huggingface.co/docs/datasets/image_dataset) guide. @@ -39,7 +39,7 @@ accelerate launch train_unconditional.py \ -Start by creating a dataset with the [`ImageFolder`](https://huggingface.co/docs/datasets/image_load#imagefolder) feature, which creates an `image` column containing the PIL-encoded images. +Start by creating a dataset with the [`ImageFolder`](https://huggingface.co/docs/datasets/image_load#imagefolder) feature, which creates an `image` column containing the PIL-encoded images. You can use the `data_dir` or `data_files` parameters to specify the location of the dataset. The `data_files` parameter supports mapping specific files to dataset splits like `train` or `test`: diff --git a/docs/source/en/training/distributed_inference.md b/docs/source/en/training/distributed_inference.md index 40876a26e6..26b79cf09b 100644 --- a/docs/source/en/training/distributed_inference.md +++ b/docs/source/en/training/distributed_inference.md @@ -55,7 +55,7 @@ To learn more, take a look at the [Distributed Inference with 🤗 Accelerate](h ### Device placement > [!WARNING] -> This feature is experimental and its APIs might change in the future. +> This feature is experimental and its APIs might change in the future. With Accelerate, you can use the `device_map` to determine how to distribute the models of a pipeline across multiple devices. This is useful in situations where you have more than one GPU. @@ -90,8 +90,8 @@ import torch max_memory = {0:"1GB", 1:"1GB"} pipeline = DiffusionPipeline.from_pretrained( "runwayml/stable-diffusion-v1-5", - torch_dtype=torch.float16, - use_safetensors=True, + torch_dtype=torch.float16, + use_safetensors=True, device_map="balanced", + max_memory=max_memory ) @@ -99,7 +99,7 @@ image = pipeline("a dog").images[0] image ``` -If a device is not present in `max_memory`, then it will be completely ignored and will not participate in the device placement. +If a device is not present in `max_memory`, then it will be completely ignored and will not participate in the device placement. By default, Diffusers uses the maximum memory of all devices. If the models don't fit on the GPUs, they are offloaded to the CPU. If the CPU doesn't have enough memory, then you might see an error. In that case, you could defer to using [`~DiffusionPipeline.enable_sequential_cpu_offload`] and [`~DiffusionPipeline.enable_model_cpu_offload`]. diff --git a/docs/source/en/training/dreambooth.md b/docs/source/en/training/dreambooth.md index 4c6955f58a..28412fe957 100644 --- a/docs/source/en/training/dreambooth.md +++ b/docs/source/en/training/dreambooth.md @@ -533,7 +533,7 @@ python train_dreambooth_lora.py \ --resolution=256 \ --train_batch_size=4 \ --gradient_accumulation_steps=1 \ - --learning_rate=1e-6 \ + --learning_rate=1e-6 \ --max_train_steps=2000 \ --validation_prompt="a sks dog" \ --validation_epochs=100 \ diff --git a/docs/source/en/tutorials/fast_diffusion.md b/docs/source/en/tutorials/fast_diffusion.md index f827d118ca..9338bb0a0f 100644 --- a/docs/source/en/tutorials/fast_diffusion.md +++ b/docs/source/en/tutorials/fast_diffusion.md @@ -222,7 +222,7 @@ First, configure all the compiler tags: ```python from diffusers import StableDiffusionXLPipeline -import torch +import torch # Notice the two new flags at the end. torch._inductor.config.conv_1x1_as_mm = True diff --git a/docs/source/en/using-diffusers/controlnet.md b/docs/source/en/using-diffusers/controlnet.md index 2a1295d14d..20b7f94290 100644 --- a/docs/source/en/using-diffusers/controlnet.md +++ b/docs/source/en/using-diffusers/controlnet.md @@ -506,7 +506,7 @@ make_image_grid([original_image, canny_image], rows=1, cols=2) For human pose estimation, install [controlnet_aux](https://github.com/patrickvonplaten/controlnet_aux): - + ```py # uncomment to install the necessary library in Colab #!pip install -q controlnet-aux diff --git a/docs/source/en/using-diffusers/custom_pipeline_overview.md b/docs/source/en/using-diffusers/custom_pipeline_overview.md index ef26e546e4..341a98a5c8 100644 --- a/docs/source/en/using-diffusers/custom_pipeline_overview.md +++ b/docs/source/en/using-diffusers/custom_pipeline_overview.md @@ -147,11 +147,11 @@ prompt = "cat, hiding in the leaves, ((rain)), zazie rainyday, beautiful eyes, m neg_prompt = "(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers:1.4), (deformed, distorted, disfigured:1.3), poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation" generator = torch.Generator(device="cpu").manual_seed(20) out_lpw = pipe_lpw( - prompt, - negative_prompt=neg_prompt, + prompt, + negative_prompt=neg_prompt, width=512, height=512, - max_embeddings_multiples=3, + max_embeddings_multiples=3, num_inference_steps=50, generator=generator, ).images[0] diff --git a/docs/source/en/using-diffusers/inference_with_lcm.md b/docs/source/en/using-diffusers/inference_with_lcm.md index 19fb349c54..ff436a655f 100644 --- a/docs/source/en/using-diffusers/inference_with_lcm.md +++ b/docs/source/en/using-diffusers/inference_with_lcm.md @@ -235,7 +235,7 @@ image = pipe( mask_image=mask_image, generator=generator, num_inference_steps=4, - guidance_scale=4, + guidance_scale=4, ).images[0] image ``` @@ -497,7 +497,7 @@ pipe = StableDiffusionXLAdapterPipeline.from_pretrained( unet=unet, adapter=adapter, torch_dtype=torch.float16, - variant="fp16", + variant="fp16", ).to("cuda") pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) @@ -512,7 +512,7 @@ image = pipe( image=canny_image, num_inference_steps=4, guidance_scale=5, - adapter_conditioning_scale=0.8, + adapter_conditioning_scale=0.8, adapter_conditioning_factor=1, generator=generator, ).images[0] @@ -554,10 +554,10 @@ canny_image = Image.fromarray(image).resize((1024, 1024)) adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda") pipe = StableDiffusionXLAdapterPipeline.from_pretrained( - "stabilityai/stable-diffusion-xl-base-1.0", + "stabilityai/stable-diffusion-xl-base-1.0", adapter=adapter, torch_dtype=torch.float16, - variant="fp16", + variant="fp16", ).to("cuda") pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config) @@ -573,8 +573,8 @@ image = pipe( negative_prompt=negative_prompt, image=canny_image, num_inference_steps=4, - guidance_scale=1.5, - adapter_conditioning_scale=0.8, + guidance_scale=1.5, + adapter_conditioning_scale=0.8, adapter_conditioning_factor=1, generator=generator, ).images[0] diff --git a/docs/source/en/using-diffusers/ip_adapter.md b/docs/source/en/using-diffusers/ip_adapter.md index 02fb0c34aa..0c49ac2aa1 100644 --- a/docs/source/en/using-diffusers/ip_adapter.md +++ b/docs/source/en/using-diffusers/ip_adapter.md @@ -445,8 +445,8 @@ generator = torch.Generator(device="cpu").manual_seed(42) images = pipeline( prompt="A photo of a girl", - ip_adapter_image_embeds=[id_embeds], - negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", + ip_adapter_image_embeds=[id_embeds], + negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", num_inference_steps=20, num_images_per_prompt=1, generator=generator ).images @@ -661,7 +661,7 @@ image ### Style & layout control -[InstantStyle](https://arxiv.org/abs/2404.02733) is a plug-and-play method on top of IP-Adapter, which disentangles style and layout from image prompt to control image generation. This way, you can generate images following only the style or layout from image prompt, with significantly improved diversity. This is achieved by only activating IP-Adapters to specific parts of the model. +[InstantStyle](https://arxiv.org/abs/2404.02733) is a plug-and-play method on top of IP-Adapter, which disentangles style and layout from image prompt to control image generation. This way, you can generate images following only the style or layout from image prompt, with significantly improved diversity. This is achieved by only activating IP-Adapters to specific parts of the model. By default IP-Adapters are inserted to all layers of the model. Use the [`~loaders.IPAdapterMixin.set_ip_adapter_scale`] method with a dictionary to assign scales to IP-Adapter at different layers. diff --git a/docs/source/en/using-diffusers/loading.md b/docs/source/en/using-diffusers/loading.md index b2f254349f..dca5e71edd 100644 --- a/docs/source/en/using-diffusers/loading.md +++ b/docs/source/en/using-diffusers/loading.md @@ -81,14 +81,14 @@ pipeline = StableDiffusionImg2ImgPipeline.from_pretrained("runwayml/stable-diffu Use the Space below to gauge a pipeline's memory requirements before you download and load it to see if it runs on your hardware.
-