JamePeng2023 committed on
Commit 98dd963 · 1 Parent(s): 6fcf8b4

Add streaming support for text generation


- Implemented streaming functionality for real-time text output.
- Added `_decode_stream` method to handle text streaming.
- Updated `chat` method to support streaming mode.
- Adjusted code to process and yield text in chunks for better responsiveness.

This update enhances the user experience by allowing incremental text generation and display.
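
For orientation, here is a minimal usage sketch of the updated `chat` interface. It is not part of this commit; the checkpoint id is a placeholder, and loading via `AutoModelForCausalLM`/`AutoTokenizer` with `trust_remote_code=True` (so this custom `chat` method is picked up) is an assumption for illustration:

# Usage sketch only; the repo id below is illustrative, not taken from this commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

path = "openbmb/MiniCPM3-4B"  # placeholder checkpoint id, adjust to the actual repository
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(path, torch_dtype=torch.bfloat16,
                                             device_map="auto", trust_remote_code=True)

# Non-streaming: chat() now returns just the response string.
response = model.chat(tokenizer, "Explain what a streamer does.", stream=False)
print(response)

# Streaming: chat(stream=True) returns a generator that yields text chunks
# as the background generate() call produces them.
for chunk in model.chat(tokenizer, "Explain what a streamer does.", stream=True):
    print(chunk, end="", flush=True)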

Files changed (1)
  1. modeling_minicpm.py +64 -8
modeling_minicpm.py CHANGED
@@ -22,12 +22,14 @@ import math
 import warnings
 from typing import List, Optional, Tuple, Union, Dict
 
+from threading import Thread
 import torch
 import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
+from transformers import TextIteratorStreamer
 from transformers.activations import ACT2FN
 from transformers.cache_utils import Cache, DynamicCache
 from transformers.modeling_attn_mask_utils import (
@@ -1248,6 +1250,9 @@ class MiniCPM3ForCausalLM(MiniCPM3PreTrainedModel):
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
+        # List of terminator tokens used to indicate the end of a sequence or conversation.
+        self.terminators = ['</s>', '<|im_end|>']
+
         # Initialize weights and apply final processing
         self.post_init()
 
@@ -1426,11 +1431,52 @@ class MiniCPM3ForCausalLM(MiniCPM3PreTrainedModel):
                 tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
             )
         return reordered_past
+
+    # Internal function to handle streaming of generated text using TextIteratorStreamer.
+    def _decode_stream(self, input_ids, tokenizer, **kwargs):
+        # Convert terminators to token IDs
+        terminators_ids = [tokenizer.convert_tokens_to_ids(i) for i in self.terminators]
+        # Initialize TextIteratorStreamer for handling streaming output
+        streamer = TextIteratorStreamer(tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True)
+        # Set up generation parameters, including input IDs, eos token IDs, and streamer
+        generation_kwargs = {
+            'input_ids': input_ids,
+            'eos_token_id': terminators_ids,
+            'streamer': streamer
+        }
+        generation_kwargs.update(kwargs)
+        # Run the generation task in a separate thread to enable streaming output
+        thread = Thread(target=self.generate, kwargs=generation_kwargs)
+        thread.start()
+        # Return the streamer instance for later access to streamed text
+        return streamer
+
 
     @torch.inference_mode()
-    def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user",
-             max_length: int = 4096, num_beams=1, do_sample=True, top_p=0.8, temperature=0.3, logits_processor=None,
-             **kwargs):
+    def chat(self, tokenizer, query: str, history: List[Dict] = None, role: str = "user", max_length: int = 4096, num_beams=1,
+             do_sample=True, logits_processor=None, stream=False, top_p=0.8, temperature=0.3, **kwargs):
+        """
+        Main function for handling dialogue generation based on the input query and history.
+
+        Parameters:
+        - tokenizer: Tokenizer instance used for encoding and decoding.
+        - query: The user input query string.
+        - history: Dialogue history, a list of dictionaries where each dictionary contains role and content.
+        - role: The current role, default is "user".
+        - max_length: Maximum length of the generated text.
+        - num_beams: Number of beams for beam search.
+        - do_sample: Whether to use sampling for generation.
+        - logits_processor: Function for processing logits (if any).
+        - stream: Whether to use streaming output.
+        - top_p: Nucleus sampling parameter.
+        - temperature: Temperature parameter for generation.
+        - **kwargs: Additional arguments for generation.
+
+        Returns:
+        - If stream is True, returns a generator function to get the generated text incrementally.
+        - If stream is False, returns the complete generated response string.
+        """
+
         if history is None:
             history = []
         if logits_processor:
@@ -1443,12 +1489,22 @@ class MiniCPM3ForCausalLM(MiniCPM3PreTrainedModel):
         history.append({"role": role, "content": query})
         history_str = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=True)
         inputs = tokenizer(history_str, return_tensors='pt').to(self.device)
-        outputs = self.generate(**inputs, **gen_kwargs)
-        outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
-        response = tokenizer.decode(outputs)
-        history.append({"role": "assistant", "content": response})
-        return response, history
 
+        if stream:
+            res = self._decode_stream(inputs["input_ids"], tokenizer, **gen_kwargs)
+            def stream_gen():
+                for text in res:
+                    # Remove terminators from the text
+                    for term in self.terminators:
+                        text = text.replace(term, '')
+                    yield text
+            return stream_gen()
+
+        else:
+            outputs = self.generate(**inputs, **gen_kwargs)
+            outputs = outputs.tolist()[0][len(inputs["input_ids"][0]):-1]
+            response = tokenizer.decode(outputs)
+            return response
 
 @add_start_docstrings(
     """