Support InstantStyle #7668

Merged
14 commits, merged Apr 22, 2024
68 changes: 58 additions & 10 deletions src/diffusers/loaders/ip_adapter.py
@@ -27,6 +27,7 @@
is_transformers_available,
logging,
)
from .unet_loader_utils import _maybe_expand_lora_scales


if is_transformers_available():
@@ -228,27 +229,74 @@ def load_ip_adapter(
unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
unet._load_ip_adapter_weights(state_dicts, low_cpu_mem_usage=low_cpu_mem_usage)

def set_ip_adapter_scale(self, scale):
def set_ip_adapter_scale(self, scale_configs: Union[float, Dict, List[Union[float, Dict]]], default_scale=0.0):
"""
Sets the conditioning scale between text and image.
Set IP-Adapter scales per transformer block. `scale_configs` can be a single config or a list of configs for
granular control over each IP-Adapter's behavior. A config can be a float or a dictionary.

Example:

```py
pipeline.set_ip_adapter_scale(0.5)
# To use original IP-Adapter
scale_configs = 1.0
pipeline.set_ip_adapter_scale(scale_configs)

# To use style block only
scale_configs = {
"up": {
"block_0": [0.0, 1.0, 0.0]
},
}
pipeline.set_ip_adapter_scale(scale_configs)

# To use style+layout blocks
scale_configs = {
"down": {
"block_2": [0.0, 1.0]
},
"up": {
"block_0": [0.0, 1.0, 0.0]
},
}
pipeline.set_ip_adapter_scale(scale_configs)

# To use style and layout from 2 reference images
scale_configs = [
{
"down": {
"block_2": [0.0, 1.0]
}
},
{
"up": {
"block_0": [0.0, 1.0, 0.0]
}
}
]
pipeline.set_ip_adapter_scale(scale_configs)
```
"""
unet = getattr(self, self.unet_name) if not hasattr(self, "unet") else self.unet
for attn_processor in unet.attn_processors.values():
if not isinstance(scale_configs, list):
scale_configs = [scale_configs]
scale_configs = _maybe_expand_lora_scales(unet, scale_configs, default_scale=default_scale)

for attn_name, attn_processor in unet.attn_processors.items():
if isinstance(attn_processor, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0)):
if not isinstance(scale, list):
scale = [scale] * len(attn_processor.scale)
if len(attn_processor.scale) != len(scale):
if len(scale_configs) > 1 and len(scale_configs) != len(attn_processor.scale):
raise ValueError(
f"`scale` should be a list of same length as the number if ip-adapters "
f"Expected {len(attn_processor.scale)} but got {len(scale)}."
f"Cannot assign {len(scale_configs)} scale_configs to "
f"{len(attn_processor.scale)} IP-Adapter."
)
attn_processor.scale = scale
elif len(scale_configs) == 1:
scale_configs = scale_configs * len(attn_processor.scale)
for i, scale_config in enumerate(scale_configs):
if isinstance(scale_config, dict):
for key, scale in scale_config.items():
if attn_name.startswith(key):
attn_processor.scale[i] = scale
else:
attn_processor.scale[i] = scale_config

def unload_ip_adapter(self):
"""
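For context on how the new per-block API is meant to be used, here is a minimal style-only example assembled from the docstring above. It is a sketch, not part of this diff: the SDXL checkpoint, adapter weight name, prompts, and sampler settings follow the usual IP-Adapter setup and are only illustrative, and the reference image URL is reused from the test script quoted later in this thread.

```py
import torch

from diffusers import StableDiffusionXLPipeline
from diffusers.utils import load_image

pipeline = StableDiffusionXLPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
).to("cuda")
pipeline.load_ip_adapter(
    "h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin"
)

# Style-only transfer: activate the style block (up.block_0) and let every
# block that is not listed fall back to default_scale=0.0.
pipeline.set_ip_adapter_scale({"up": {"block_0": [0.0, 1.0, 0.0]}})

style_image = load_image(
    "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter__20240321125625.png"
)
image = pipeline(
    prompt="a cat, masterpiece, best quality, high quality",
    negative_prompt="lowres, bad anatomy, worst quality, low quality",
    ip_adapter_image=style_image,
    guidance_scale=5.0,
    num_inference_steps=30,
).images[0]
```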
17 changes: 12 additions & 5 deletions src/diffusers/loaders/unet_loader_utils.py
@@ -38,7 +38,7 @@ def _translate_into_actual_layer_name(name):
return ".".join((updown, block, attn))


def _maybe_expand_lora_scales(unet: "UNet2DConditionModel", weight_scales: List[Union[float, Dict]]):
def _maybe_expand_lora_scales(unet: "UNet2DConditionModel", weight_scales: List[Union[float, Dict]], default_scale=1.0):
blocks_with_transformer = {
"down": [i for i, block in enumerate(unet.down_blocks) if hasattr(block, "attentions")],
"up": [i for i, block in enumerate(unet.up_blocks) if hasattr(block, "attentions")],
@@ -47,7 +47,7 @@ def _maybe_expand_lora_scales(unet: "UNet2DConditionModel", weight_scales: List[

expanded_weight_scales = [
_maybe_expand_lora_scales_for_one_adapter(
weight_for_adapter, blocks_with_transformer, transformer_per_block, unet.state_dict()
weight_for_adapter, blocks_with_transformer, transformer_per_block, unet.state_dict(), default_scale=default_scale
)
for weight_for_adapter in weight_scales
]
@@ -60,6 +60,7 @@ def _maybe_expand_lora_scales_for_one_adapter(
blocks_with_transformer: Dict[str, int],
transformer_per_block: Dict[str, int],
state_dict: None,
default_scale: float = 1.0,
):
"""
Expands the inputs into a more granular dictionary. See the example below for more details.
@@ -108,21 +109,27 @@
scales = copy.deepcopy(scales)

if "mid" not in scales:
scales["mid"] = 1
scales["mid"] = default_scale

for updown in ["up", "down"]:
if updown not in scales:
scales[updown] = 1
scales[updown] = default_scale

# eg {"down": 1} to {"down": {"block_1": 1, "block_2": 1}}
if not isinstance(scales[updown], dict):
scales[updown] = {f"block_{i}": scales[updown] for i in blocks_with_transformer[updown]}

# eg {"down": "block_1": 1}} to {"down": "block_1": [1, 1]}}
# eg {"down": {"block_1": 1}} to {"down": {"block_1": [1, 1]}}
for i in blocks_with_transformer[updown]:
block = f"block_{i}"
# set not assigned blocks to default scale
if block not in scales[updown]:
scales[updown][block] = default_scale
if not isinstance(scales[updown][block], list):
scales[updown][block] = [scales[updown][block] for _ in range(transformer_per_block[updown])]
else:
assert len(scales[updown][block]) == transformer_per_block[updown], \
f"Expected {transformer_per_block[updown]} scales for {updown}.{block}, got {len(scales[updown][block])}."

# eg {"down": "block_1": [1, 1]}} to {"down.block_1.0": 1, "down.block_1.1": 1}
for i in blocks_with_transformer[updown]:
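To make the expansion step concrete, here is a small standalone sketch of what `_maybe_expand_lora_scales_for_one_adapter` produces. The block layout (attention in down blocks 1-2 with two transformer layers each, and in up blocks 0-1 with three each) is hardcoded here to mimic SDXL for illustration; the real helper reads it from the UNet and also validates list lengths.

```py
from typing import Dict, Union


def expand_scales(scales: Union[float, Dict], default_scale: float = 0.0) -> Dict:
    # Toy stand-in for an SDXL-like UNet: which blocks have attention, and how many layers each.
    blocks_with_transformer = {"down": [1, 2], "up": [0, 1]}
    transformer_per_block = {"down": 2, "up": 3}

    if isinstance(scales, dict):
        scales = dict(scales)  # shallow copy so the caller's config is not mutated
    else:
        scales = {"down": scales, "mid": scales, "up": scales}

    expanded = {"mid": scales.get("mid", default_scale)}
    for updown in ["down", "up"]:
        value = scales.get(updown, default_scale)
        # eg {"down": 1} -> {"down": {"block_1": 1, "block_2": 1}}
        if not isinstance(value, dict):
            value = {f"block_{i}": value for i in blocks_with_transformer[updown]}
        for i in blocks_with_transformer[updown]:
            block = f"block_{i}"
            per_block = value.get(block, default_scale)  # blocks not assigned get the default scale
            # eg {"block_1": 1} -> {"block_1": [1, 1]}
            if not isinstance(per_block, list):
                per_block = [per_block] * transformer_per_block[updown]
            expanded[f"{updown}.{block}"] = per_block
    return expanded


# Style-only config from the docstring: only the middle layer of up.block_0 stays active.
print(expand_scales({"up": {"block_0": [0.0, 1.0, 0.0]}}, default_scale=0.0))
# {'mid': 0.0, 'down.block_1': [0.0, 0.0], 'down.block_2': [0.0, 0.0],
#  'up.block_0': [0.0, 1.0, 0.0], 'up.block_1': [0.0, 0.0, 0.0]}
```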
140 changes: 71 additions & 69 deletions src/diffusers/models/attention_processor.py
@@ -2229,44 +2229,45 @@ def __call__(
for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip(
ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks
):
if mask is not None:
if not isinstance(scale, list):
scale = [scale]
if scale > 0:
Contributor:

scale can be a list when using image masking, no?

Contributor Author:

Thanks for your careful inspection! This issue is resolved by adding an additional conditional statement.

if mask is not None:
if not isinstance(scale, list):
scale = [scale]

current_num_images = mask.shape[1]
for i in range(current_num_images):
ip_key = to_k_ip(current_ip_hidden_states[:, i, :, :])
ip_value = to_v_ip(current_ip_hidden_states[:, i, :, :])

ip_key = attn.head_to_batch_dim(ip_key)
ip_value = attn.head_to_batch_dim(ip_value)

ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
_current_ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
_current_ip_hidden_states = attn.batch_to_head_dim(_current_ip_hidden_states)

mask_downsample = IPAdapterMaskProcessor.downsample(
mask[:, i, :, :],
batch_size,
_current_ip_hidden_states.shape[1],
_current_ip_hidden_states.shape[2],
)

mask_downsample = mask_downsample.to(dtype=query.dtype, device=query.device)

current_num_images = mask.shape[1]
for i in range(current_num_images):
ip_key = to_k_ip(current_ip_hidden_states[:, i, :, :])
ip_value = to_v_ip(current_ip_hidden_states[:, i, :, :])
hidden_states = hidden_states + scale[i] * (_current_ip_hidden_states * mask_downsample)
else:
ip_key = to_k_ip(current_ip_hidden_states)
ip_value = to_v_ip(current_ip_hidden_states)

ip_key = attn.head_to_batch_dim(ip_key)
ip_value = attn.head_to_batch_dim(ip_value)

ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
_current_ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
_current_ip_hidden_states = attn.batch_to_head_dim(_current_ip_hidden_states)

mask_downsample = IPAdapterMaskProcessor.downsample(
mask[:, i, :, :],
batch_size,
_current_ip_hidden_states.shape[1],
_current_ip_hidden_states.shape[2],
)

mask_downsample = mask_downsample.to(dtype=query.dtype, device=query.device)

hidden_states = hidden_states + scale[i] * (_current_ip_hidden_states * mask_downsample)
else:
ip_key = to_k_ip(current_ip_hidden_states)
ip_value = to_v_ip(current_ip_hidden_states)
current_ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
current_ip_hidden_states = attn.batch_to_head_dim(current_ip_hidden_states)

ip_key = attn.head_to_batch_dim(ip_key)
ip_value = attn.head_to_batch_dim(ip_value)

ip_attention_probs = attn.get_attention_scores(query, ip_key, None)
current_ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
current_ip_hidden_states = attn.batch_to_head_dim(current_ip_hidden_states)

hidden_states = hidden_states + scale * current_ip_hidden_states

# linear proj
hidden_states = attn.to_out[0](hidden_states)
@@ -2439,57 +2440,58 @@ def __call__(
for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip(
ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks
):
if mask is not None:
if not isinstance(scale, list):
scale = [scale]
if scale > 0:
Contributor:

as before: when using image masking, the user can provide a list of masks for each IP Adapter; each mask will have its own scale

code would break here, as you cannot compare a (possible) list with an integer
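One way to make that guard safe when `scale` can be a list is to reduce it to a single skip decision first. A minimal sketch of the idea; the guard actually adopted in the PR may be written differently.

```py
def should_skip(scale) -> bool:
    """`scale` is a float, or a list with one entry per masked image when masking is used.
    Skip the adapter only when every entry is zero."""
    if isinstance(scale, list):
        return all(s == 0 for s in scale)
    return scale == 0


assert should_skip(0.0) and should_skip([0.0, 0.0])
assert not should_skip([0.0, 0.6])  # a partially active adapter must still run
```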

Member:

I didn't test this, this part doesn't have a unit test? If you need help to test this, just tell me and I can help too.

To be honest I haven't tested any of the multiple images with multiple masks and scales per image but I plan to do it soon to make another guide with them.

Contributor:

there is just a slow test (test_ip_adapter_multiple_masks_one_adapter in tests/pipelines/ip_adapters/test_ip_adapter_stable_diffusion.py) for the combination multiple images + masks + scales for IP Adapter, unfortunately there are no fast tests

Collaborator:

is this the test case we are talking about?

import torch
from transformers import CLIPVisionModelWithProjection

from diffusers import DPMSolverMultistepScheduler, StableDiffusionXLPipeline
from diffusers.image_processor import IPAdapterMaskProcessor
from diffusers.utils import load_image, logging
from diffusers.utils.logging import set_verbosity


set_verbosity(logging.ERROR)  # to not show cross_attention_kwargs...by AttnProcessor2_0 warnings


# load & process masks
composition_mask = load_image(
    "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/1024_whole_mask.png"
)
female_mask = load_image(
    "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter_None_20240321125641_mask.png"
)
male_mask = load_image(
    "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter_None_20240321125344_mask.png"
)
background_mask = load_image(
    "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter_6_20240321130722_mask.png"
)
processor = IPAdapterMaskProcessor()
masks1 = processor.preprocess([composition_mask], height=1024, width=1024)
masks2 = processor.preprocess([female_mask, male_mask, background_mask], height=1024, width=1024)
masks2 = masks2.reshape(1, masks2.shape[0], masks2.shape[2], masks2.shape[3])  # output -> (1, 3, 1024, 1024)
masks = [masks1, masks2]

# load images
ip_composition_image = load_image(
    "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter__20240321125152.png"
)
ip_female_style = load_image(
    "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter__20240321125625.png"
)
ip_male_style = load_image(
    "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter__20240321125329.png"
)
ip_background = load_image(
    "https://huggingface.co/datasets/OzzyGT/testing-resources/resolve/main/ip_adapter__20240321130643.png"
)


image_encoder = CLIPVisionModelWithProjection.from_pretrained(
    "h94/IP-Adapter", subfolder="models/image_encoder", torch_dtype=torch.float16
).to("cuda")

pipeline = StableDiffusionXLPipeline.from_pretrained(
    "RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.float16, image_encoder=image_encoder, variant="fp16"
).to("cuda")

pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
pipeline.scheduler.config.use_karras_sigmas = True

pipeline.load_ip_adapter(
    ["ostris/ip-composition-adapter", "h94/IP-Adapter"],
    subfolder=["", "sdxl_models"],
    weight_name=[
        "ip_plus_composition_sdxl.safetensors",
        "ip-adapter_sdxl_vit-h.safetensors",
    ],
    image_encoder_folder=None,
)
pipeline.set_ip_adapter_scale([1.0, [0.75, 0.75, 0.3]])



prompt = "high quality, cinematic photo, cinemascope, 35mm, film grain, highly detailed"
negative_prompt = "anime, cartoon"


image = pipeline(
    prompt=prompt,
    negative_prompt="",
    ip_adapter_image=[ip_composition_image, [ip_female_style, ip_male_style, ip_background]],
    cross_attention_kwargs={"ip_adapter_masks": masks},
    guidance_scale=6.5,
    num_inference_steps=25,
).images[0]

image.save("yiyi_test_mask_multi_out.png")

Contributor Author:

Thanks for your test case. I will test with it and update the test code once the result is as expected.

Member:

Won't it be better to have a fast test for this?

Contributor:

> is this the test case we are talking about?

exactly

Contributor:

> Won't it be better to have a fast test for this?

I can create a new PR to add a fast test and update the documentation.

Member:

That should work. Thanks as always.

if mask is not None:
if not isinstance(scale, list):
scale = [scale]

current_num_images = mask.shape[1]
for i in range(current_num_images):
ip_key = to_k_ip(current_ip_hidden_states[:, i, :, :])
ip_value = to_v_ip(current_ip_hidden_states[:, i, :, :])

ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

# the output of sdp = (batch, num_heads, seq_len, head_dim)
# TODO: add support for attn.scale when we move to Torch 2.1
_current_ip_hidden_states = F.scaled_dot_product_attention(
query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
)

_current_ip_hidden_states = _current_ip_hidden_states.transpose(1, 2).reshape(
batch_size, -1, attn.heads * head_dim
)
_current_ip_hidden_states = _current_ip_hidden_states.to(query.dtype)

current_num_images = mask.shape[1]
for i in range(current_num_images):
ip_key = to_k_ip(current_ip_hidden_states[:, i, :, :])
ip_value = to_v_ip(current_ip_hidden_states[:, i, :, :])
mask_downsample = IPAdapterMaskProcessor.downsample(
mask[:, i, :, :],
batch_size,
_current_ip_hidden_states.shape[1],
_current_ip_hidden_states.shape[2],
)

mask_downsample = mask_downsample.to(dtype=query.dtype, device=query.device)
hidden_states = hidden_states + scale[i] * (_current_ip_hidden_states * mask_downsample)
else:
ip_key = to_k_ip(current_ip_hidden_states)
ip_value = to_v_ip(current_ip_hidden_states)

ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

# the output of sdp = (batch, num_heads, seq_len, head_dim)
# TODO: add support for attn.scale when we move to Torch 2.1
_current_ip_hidden_states = F.scaled_dot_product_attention(
current_ip_hidden_states = F.scaled_dot_product_attention(
query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
)

_current_ip_hidden_states = _current_ip_hidden_states.transpose(1, 2).reshape(
current_ip_hidden_states = current_ip_hidden_states.transpose(1, 2).reshape(
batch_size, -1, attn.heads * head_dim
)
_current_ip_hidden_states = _current_ip_hidden_states.to(query.dtype)

mask_downsample = IPAdapterMaskProcessor.downsample(
mask[:, i, :, :],
batch_size,
_current_ip_hidden_states.shape[1],
_current_ip_hidden_states.shape[2],
)

mask_downsample = mask_downsample.to(dtype=query.dtype, device=query.device)
hidden_states = hidden_states + scale[i] * (_current_ip_hidden_states * mask_downsample)
else:
ip_key = to_k_ip(current_ip_hidden_states)
ip_value = to_v_ip(current_ip_hidden_states)

ip_key = ip_key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
ip_value = ip_value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)

# the output of sdp = (batch, num_heads, seq_len, head_dim)
# TODO: add support for attn.scale when we move to Torch 2.1
current_ip_hidden_states = F.scaled_dot_product_attention(
query, ip_key, ip_value, attn_mask=None, dropout_p=0.0, is_causal=False
)

current_ip_hidden_states = current_ip_hidden_states.transpose(1, 2).reshape(
batch_size, -1, attn.heads * head_dim
)
current_ip_hidden_states = current_ip_hidden_states.to(query.dtype)

hidden_states = hidden_states + scale * current_ip_hidden_states

# linear proj
hidden_states = attn.to_out[0](hidden_states)