Online chat formatter #8

Merged: 23 commits, merged on Jun 20, 2024
20 changes: 9 additions & 11 deletions README.ENG.md
@@ -93,9 +93,8 @@ mweb = lazyllm.WebModule(ppl, port=23456).start().wait()

```python
import lazyllm
-from lazyllm import pipeline, parallel, Identity, warp, package
-import time
-import re, json
+from lazyllm import pipeline, warp, bind
+from lazyllm.components.formatter import JsonFormatter

toc_prompt="""
You are now an intelligent assistant. Your task is to understand the user's input and convert the outline into a list of nested dictionaries. Each dictionary contains a `title` and a `describe`, where the `title` should clearly indicate the level using Markdown format, and the `describe` is a description and writing guide for that section.
@@ -134,19 +133,18 @@ This is the expanded content for writing.
Receive as follows:

"""

+writer_prompt = {"system": completion_prompt, "user": '{"title": {title}, "describe": {describe}}'}
```
</details>

```python
-t1 = lazyllm.OnlineChatModule(source="openai", stream=False, prompter=ChatPrompter(instruction=toc_prompt))
-t2 = lazyllm.OnlineChatModule(source="openai", stream=False, prompter=ChatPrompter(instruction=completion_prompt))
-
-spliter = lambda s: tuple(eval(re.search(r'\[\s*\{.*\}\s*\]', s['message']['content'], re.DOTALL).group()))
-writter = pipeline(lambda d: json.dumps(d, ensure_ascii=False), t2, lambda d : d['message']['content'])
-collector = lambda dict_tuple, repl_tuple: "\n".join([v for d in [{**d, "describe": repl_tuple[i]} for i, d in enumerate(dict_tuple)] for v in d.values()])
-m = pipeline(t1, spliter, parallel(Identity, warp(writter)), collector)
+with pipeline() as ppl:
+    ppl.outline_writer = lazyllm.OnlineChatModule(source="openai", stream=False).formatter(JsonFormatter()).prompt(toc_prompt)
+    ppl.story_generater = warp(lazyllm.OnlineChatModule(source="openai", stream=False).prompt(writer_prompt))
+    ppl.synthesizer = (lambda *storys, outlines: "\n".join([f"{o['title']}\n{s}" for s, o in zip(storys, outlines)])) | bind(outlines=ppl.outline_writer)

-print(m({'query': 'Please help me write an article about the application of artificial intelligence in the medical field.'}))
+print(ppl({'query': 'Please help me write an article about the application of artificial intelligence in the medical field.'}))
```
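
For reference, here is a minimal sketch (not part of this diff, with illustrative stage names) of the `bind` pattern used above: referencing an earlier stage inside `bind(...)` forwards that stage's output into a later stage as a keyword argument.

```python
from lazyllm import pipeline, bind

with pipeline() as demo:
    demo.double = lambda x: x * 2
    demo.inc = lambda x: x + 1
    # demo.report receives demo.inc's output positionally, plus demo.double's
    # output as the keyword argument `doubled` via bind.
    demo.report = (lambda y, doubled: f"doubled={doubled}, final={y}") | bind(doubled=demo.double)

print(demo(3))  # expected: "doubled=6, final=7"
```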

## What can LazyLLM do
20 changes: 9 additions & 11 deletions README.md
@@ -90,9 +90,8 @@ mweb = lazyllm.WebModule(ppl, port=23456).start().wait()

```python
import lazyllm
-from lazyllm import pipeline, parallel, Identity, warp, package
-import time
-import re, json
+from lazyllm import pipeline, warp, bind
+from lazyllm.components.formatter import JsonFormatter

toc_prompt="""
You are now an intelligent assistant. Your task is to understand the user's input and convert the outline into a list of nested dictionaries. Each dictionary contains a `title` and a `describe`, where the `title` should clearly indicate the level using Markdown format, and the `describe` is a description and writing guide for that section.

@@ -129,19 +128,18 @@ completion_prompt="""
Receive as follows:

"""

+writer_prompt = {"system": completion_prompt, "user": '{"title": {title}, "describe": {describe}}'}
```
</details>

```python
-t1 = lazyllm.OnlineChatModule(source="openai", stream=False, prompter=ChatPrompter(instruction=toc_prompt))
-t2 = lazyllm.OnlineChatModule(source="openai", stream=False, prompter=ChatPrompter(instruction=completion_prompt))
-
-spliter = lambda s: tuple(eval(re.search(r'\[\s*\{.*\}\s*\]', s['message']['content'], re.DOTALL).group()))
-writter = pipeline(lambda d: json.dumps(d, ensure_ascii=False), t2, lambda d : d['message']['content'])
-collector = lambda dict_tuple, repl_tuple: "\n".join([v for d in [{**d, "describe": repl_tuple[i]} for i, d in enumerate(dict_tuple)] for v in d.values()])
-m = pipeline(t1, spliter, parallel(Identity, warp(writter)), collector)
+with pipeline() as ppl:
+    ppl.outline_writer = lazyllm.OnlineChatModule(source="openai", stream=False).formatter(JsonFormatter()).prompt(toc_prompt)
+    ppl.story_generater = warp(lazyllm.OnlineChatModule(source="openai", stream=False).prompt(writer_prompt))
+    ppl.synthesizer = (lambda *storys, outlines: "\n".join([f"{o['title']}\n{s}" for s, o in zip(storys, outlines)])) | bind(outlines=ppl.outline_writer)

-print(ppl({'query': 'Please help me write an article about the application of artificial intelligence in the medical field.'}))
+print(ppl({'query': 'Please help me write an article about the application of artificial intelligence in the medical field.'}))
```

## Features
15 changes: 15 additions & 0 deletions docs/source/api/components.rst
@@ -60,3 +60,18 @@ ModelDownloader
.. autoclass:: lazyllm.components.ModelDownloader
   :members:
   :exclude-members:
+
+Formatter
+==========
+
+.. autoclass:: lazyllm.components.formatter.LazyLLMFormatterBase
+   :members:
+   :exclude-members:
+
+.. autoclass:: lazyllm.components.JsonFormatter
+   :members:
+   :exclude-members:
+
+.. autoclass:: lazyllm.components.EmptyFormatter
+   :members:
+   :exclude-members:
16 changes: 9 additions & 7 deletions docs/source/best_practice/prompt.rst
@@ -53,18 +53,19 @@ Design philosophy of the LazyLLM Prompter

- The optional fields in the PrompterTemplate are:
  - system: The system prompt; it generally reads the model's ownership information and is set accordingly. If not set, it defaults to ``You are an AI-Agent developed by LazyLLM.``
-  - instruction: The task instruction, produced by the ``InstructionTemplate`` combined with the user's input. This is the field application developers need to understand most.
+  - instruction: The task instruction, produced by the ``InstructionTemplate`` combined with the user's input. This is the field application developers need to understand most. If the instruction is a string, it is treated as a system-level instruction by default; if it is a dict, its keys may only be ``system`` and ``user``, where ``system`` specifies the system-level instruction and ``user`` specifies the user-level instruction.
  - history: The conversation history, taken from the user's input, in the format ``[[a, b], [c, d]]`` or ``[{"role": "user", "content": ""}, {"role": "assistant", "content": ""}]``
  - tools: The tools the model may use, passed in when constructing the ``prompter`` or by the user at call time; once tools are defined at construction time, the user is forbidden from passing them again at call time. Format: ``[{"type": "function", "function": {"name": "", "description": "", "parameters": {}, "required": []}}]``
+  - user: The user-level instruction; optional, specified by the developer through the instruction field.
  - sos: ``start of system``, marks the beginning of the system prompt. This token is filled in by the model; neither developers nor users need to deal with it.
  - eos: ``end of system``, marks the end of the system prompt. This token is filled in by the model; neither developers nor users need to deal with it.
  - soh: ``start of human``, marks the beginning of user input and is often used as a separator in multi-turn conversations. This token is filled in by the model; neither developers nor users need to deal with it.
  - eoh: ``end of human``, marks the end of user input and is often used as a separator in multi-turn conversations. This token is filled in by the model; neither developers nor users need to deal with it.
  - soa: ``start of assistant``, marks the beginning of the model's output and is often used as a separator in multi-turn conversations. This token is filled in by the model; neither developers nor users need to deal with it.
  - eoa: ``end of assistant``, marks the end of the model's output and is often used as a separator in multi-turn conversations. This token is filled in by the model; neither developers nor users need to deal with it.
- The assembly rules for the prompts built into ``TrainableModule`` are as follows:
-  - AlpacaPrompter: ``{system}\n{instruction}\n{tools}### Response:\n``
-  - ChatPrompter: ``{sos}{system}{instruction}{tools}{eos}\n\n{history}\n{soh}\n{input}\n{eoh}{soa}\n``
+  - AlpacaPrompter: ``{system}\n{instruction}\n{tools}\n{user}### Response:\n``
+  - ChatPrompter: ``{sos}{system}{instruction}{tools}{eos}\n\n{history}\n{soh}\n{user}{input}\n{eoh}{soa}\n``
- The output format of ``OnlineChatModule`` is: ``dict(messages=[{"role": "system", "content": ""}, {"role": "user", "content": ""}, ...], tools=[])``
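
For instance, a minimal sketch of how ``AlpacaPrompter`` fills its template (illustrative instruction text; assumes the ``generate_prompt`` entry point):

```python
from lazyllm import AlpacaPrompter

# {context} and {input} are slots filled from the dict passed at call time;
# the result is then wrapped into the AlpacaPrompter template shown above:
# {system}\n{instruction}\n{tools}\n{user}### Response:\n
prompter = AlpacaPrompter(instruction='The context is {context}, and the user question is {input}. Please answer.')
print(prompter.generate_prompt(dict(context='LazyLLM documentation', input='What is a Prompter?')))
```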

.. note::
@@ -74,7 +75,7 @@

**InstructionTemplate**: The template built into each Prompter, used to combine the user-supplied ``instruction`` and produce the final ``instruction``. The two fields used in the ``InstructionTemplate`` are:

-- ``instruction``: Passed in by the developer when constructing the ``Prompter``; it may carry several slots to be filled with the user's input.
+- ``instruction``: Passed in by the developer when constructing the ``Prompter``; it may carry several slots to be filled with the user's input. Alternatively, it may specify system-level and user-level instructions; when a user-level instruction is specified, a dict must be used, with the keys ``user`` and ``system``.
- ``extro_keys``: Extra information the user needs to provide when calling the large model; passed in by the developer when constructing the ``Prompter`` and automatically converted into slots in the ``instruction``.
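
A small sketch of ``extro_keys`` under the same assumptions (the extra key becomes one more slot the caller has to fill):

```python
from lazyllm import AlpacaPrompter

# 'knowledge' is declared through extro_keys, so it is appended to the
# instruction as an extra slot and must be supplied together with the input.
prompter = AlpacaPrompter(instruction='Answer the user based on the given knowledge.',
                          extro_keys=['knowledge'])
prompter.generate_prompt(dict(input='What is LazyLLM?',
                              knowledge='LazyLLM is a framework for building LLM applications.'))
```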

.. note::
@@ -105,11 +106,11 @@ Anatomy of the prompt-generation process
"Below is an instruction that describes a task, paired with extra messages such as input that provides "
"further context if possible. Write a response that appropriately completes the request.\\n\\n ### "
"Instruction:\\n 你是一个由LazyLLM开发的知识问答助手,你的任务是根据提供的上下文信息来回答用户的问题。上下文信息是背景,"
"用户的问题是输入, 现在请你做出回答。### Response:\\n}"
"用户的问题是问题, 现在请你做出回答。### Response:\\n}"

4. ``AlpacaPrompter`` reads the ``system`` and ``tools`` fields; ``system`` is set by the ``Module``, while ``tools`` is introduced later in the :ref:`bestpractice.prompt.tools` section.
5. If the ``prompter`` result is used with an online model (``OnlineChatModule``), the ``PromptTemplate`` is not applied any further; a dict is obtained directly, i.e. ``{'messages': [{'role': 'system', 'content': 'You are an AI-Agent developed by LazyLLM.\nBelow is an instruction that describes a task, paired with extra messages such as input that provides further context if possible. Write a response that appropriately completes the request.\n\n ### Instruction:\nYou are a knowledge Q&A assistant developed by LazyLLM. Your task is to answer the user's question based on the provided context. The context information is the background, and the user's question is the input; now please give your answer.\n\n'}, {'role': 'user', 'content': ''}]}``
-6. If the ``prompter`` result is used with a local model (``TrainableModule``), the ``PromptTemplate`` is applied to produce the final result: ``You are an AI-Agent developed by LazyLLM.\nBelow is an instruction that describes a task, paired with extra messages such as input that provides further context if possible. Write a response that appropriately completes the request.\n\n ### Instruction:\nYou are a knowledge Q&A assistant developed by LazyLLM. Your task is to answer the user's question based on the provided context. The context information is the background, and the user's question is the input; now please give your answer.\n\n\n### Response:\n``
+6. If the ``prompter`` result is used with a local model (``TrainableModule``), the ``PromptTemplate`` is applied to produce the final result: ``You are an AI-Agent developed by LazyLLM.\nBelow is an instruction that describes a task, paired with extra messages such as input that provides further context if possible. Write a response that appropriately completes the request.\n\n ### Instruction:\nYou are a knowledge Q&A assistant developed by LazyLLM. Your task is to answer the user's question based on the provided context. The context information is the background, and the user's question is the question; now please give your answer.\n\n\n### Response:\n``

Defining and using a Prompter
-----------------------------
@@ -153,6 +154,7 @@ Query is a string, not a dict
- When using ``ChatPrompter``, unlike ``AlpacaPrompter``, defining slots in the ``instruction`` is not required.
- If no slot is defined, the input is placed into the conversation as the user's input, between ``<soh>`` and ``<eoh>``.
- If a slot is defined, as with ``AlpacaPrompter``, it may be given any name, and the input is then placed into the ``<system>`` field.
+- If the ``instruction`` specifies both system-level and user-level instructions, then after assembly the system-level instruction is placed at the ``{instruction}`` position of the prompt template and the user-level instruction at the ``{user}`` position, as in the sketch below.
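
A minimal sketch of the dict-form ``instruction`` described above (illustrative text; mirrors the ``writer_prompt`` usage in the README):

```python
from lazyllm import ChatPrompter

# The "system" part fills the {instruction} slot of the template; the "user"
# part fills the {user} slot and may itself contain slots such as {title}.
prompter = ChatPrompter(instruction={
    'system': 'You are a writing assistant developed by LazyLLM.',
    'user': 'Expand this outline item: {title}',
})
prompter.generate_prompt(dict(title='# Introduction'))
```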

.. _bestpractice.prompt.tools:

@@ -286,4 +288,4 @@ Query is a string, not a dict
- ``TrainableModule`` requires a manual call to ``start`` to launch the service; to learn more about ``TrainableModule``, see :ref:`api.module`

Scenario prompts built into LazyLLM
-----------------------------------
4 changes: 3 additions & 1 deletion lazyllm/__init__.py
@@ -7,7 +7,8 @@
                  Loop as loop, Switch as switch, IFS as ifs, Warp as warp)
from .components import (LazyLLMDataprocBase, LazyLLMFinetuneBase, LazyLLMDeployBase,
                         LazyLLMValidateBase, register as component_register, Prompter,
-                        AlpacaPrompter, ChatPrompter, FastapiApp)
+                        AlpacaPrompter, ChatPrompter, FastapiApp, JsonFormatter)
+
from .module import (ModuleBase, UrlModule, TrainableModule, ActionModule,
                     ServerModule, TrialModule, register as module_register,
                     OnlineChatModule, OnlineEmbeddingModule)
Expand All @@ -33,6 +34,7 @@
    'AlpacaPrompter',
    'ChatPrompter',
    'FastapiApp',
+    'JsonFormatter',

# flow
'LazyLLMFlowsBase', # pipeline, parallel
6 changes: 3 additions & 3 deletions lazyllm/common/common.py
@@ -334,19 +334,19 @@ class LazyLlmRequest(struct):

    def split(self, flag=None):
        if flag is None:
-            assert len(self.kwargs) == 0 and isinstance(self.input, tuple), (
+            assert len(self.kwargs) == 0 and isinstance(self.input, (tuple, list)), (
                f'Only tuple input can be split automatically, your input is {self.input} <{type(self.input)}>')
            return [LazyLlmRequest(input=inp, global_parameters=self.global_parameters) for inp in self.input]
        elif isinstance(flag, int):
-            assert len(self.kwargs) == 0 and isinstance(self.input, tuple), (
+            assert len(self.kwargs) == 0 and isinstance(self.input, (tuple, list)), (
                f'Only tuple input can be split automatically, your input is {self.input} <{type(self.input)}>')
            assert flag == len(self.input), 'input size mismatch with split number'
            return [LazyLlmRequest(input=inp, global_parameters=self.global_parameters) for inp in self.input]
        elif isinstance(flag, list):
            if isinstance(self.input, dict):
                assert len(self.kwargs) == 0, 'Cannot provived input and kwargs at the same time for split'
                d = self.input
-            elif isinstance(self.input, tuple):
+            elif isinstance(self.input, (tuple, list)):
                return self.split(len(flag))
            else:
                assert not self.input, 'Cannot provived input and kwargs at the same time for split'
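
An illustrative sketch of what the relaxed check enables, assuming ``LazyLlmRequest`` can be constructed directly like this:

```python
from lazyllm.common.common import LazyLlmRequest

# With isinstance(self.input, (tuple, list)), list inputs now split the same
# way tuple inputs always have: one sub-request per element.
req = LazyLlmRequest(input=[1, 2, 3])
assert len(req.split()) == 3
```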
2 changes: 1 addition & 1 deletion lazyllm/common/logger.py
@@ -18,7 +18,7 @@
"log_format",
str,
"{process}: <green>{time:YYYY-MM-DD HH:mm:ss}</green> {extra[name]} "
"<level>{level}</level>: ({name}) <cyan>{message}</cyan>",
"<level>{level}</level>: ({name}:{line}) <cyan>{message}</cyan>",
"LOG_FORMAT",
)
lazyllm.config.add("log_dir", str, "~/.lazyllm", "LOG_DIR")
6 changes: 5 additions & 1 deletion lazyllm/components/__init__.py
@@ -6,6 +6,7 @@
from .validate import LazyLLMValidateBase
from .auto import AutoDeploy, AutoFinetune
from .utils import ModelDownloader
+from .formatter import FormatterBase, EmptyFormatter, JsonFormatter

__all__ = [
'register',
@@ -19,5 +20,8 @@
'FastapiApp',
'AutoDeploy',
'AutoFinetune',
-    'ModelDownloader'
+    'ModelDownloader',
+    'FormatterBase',
+    'EmptyFormatter',
+    'JsonFormatter'
]
10 changes: 10 additions & 0 deletions lazyllm/components/formatter/__init__.py
@@ -0,0 +1,10 @@
from .formatterBase import LazyLLMFormatterBase, LazyLLMFormatterBase as FormatterBase, EmptyFormatter
from .jsonFormatter import JsonFormatter


__all__ = [
'LazyLLMFormatterBase',
'FormatterBase',
'EmptyFormatter',
'JsonFormatter'
]
50 changes: 50 additions & 0 deletions lazyllm/components/formatter/formatterBase.py
@@ -0,0 +1,50 @@
from ...common import LazyLLMRegisterMetaClass

def is_number(s: str):
    # Returns True for an integer string, False for "None" or an empty string,
    # and raises for anything else.
    try:
        int(s)
        return True
    except ValueError:
        if s == "None" or len(s) == 0:
            return False
        else:
            raise ValueError("Invalid number: " + s + ". You can enter an integer, None or an empty string.")

class LazyLLMFormatterBase(metaclass=LazyLLMRegisterMetaClass):
    def __init__(self, formatter: str = None):
        self._formatter = formatter
        if self._formatter:
            self._parse_formatter()
        else:
            self._slices = None

    def _parse_formatter(self):
        # Remove the surrounding brackets
        slice_str = self._formatter.strip()[1:-1]
        dimensions = slice_str.split(",")
        slices = []

        for dim in dimensions:
            if ":" in dim:
                parts = dim.split(":")
                start = int(parts[0]) if is_number(parts[0]) else None
                end = int(parts[1]) if len(parts) > 1 and is_number(parts[1]) else None
                step = int(parts[2]) if len(parts) > 2 and is_number(parts[2]) else None
                slices.append(slice(start, end, step))
            else:
                slices.append(dim.strip())
        self._slices = slices

    def _load(self, msg: str):
        raise NotImplementedError("This parse str function is not implemented.")

    def _parse_py_data_by_formatter(self, py_data):
        raise NotImplementedError("This data parse function is not implemented.")

    def format(self, msg):
        if isinstance(msg, str): msg = self._load(msg)
        return self._parse_py_data_by_formatter(msg)

class EmptyFormatter(LazyLLMFormatterBase):
    def format(self, msg):
        return msg
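
A minimal sketch of how the slice string is parsed (illustrative subclass, not part of this diff):

```python
from lazyllm.components.formatter import LazyLLMFormatterBase

class IdentityFormatter(LazyLLMFormatterBase):
    def _load(self, msg: str):
        return msg

    def _parse_py_data_by_formatter(self, py_data):
        return py_data

# "[0:2, title]" is parsed into [slice(0, 2, None), 'title']: a slice for the
# list dimension followed by a dict key.
f = IdentityFormatter("[0:2, title]")
print(f._slices)  # [slice(0, 2, None), 'title']
```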
57 changes: 57 additions & 0 deletions lazyllm/components/formatter/jsonFormatter.py
@@ -0,0 +1,57 @@
import json
from .formatterBase import LazyLLMFormatterBase as FormatterBase
import lazyllm

class JsonFormatter(FormatterBase):
    def _extract_json_from_string(self, mixed_str: str):
        # Scan the text and collect every complete top-level JSON object.
        json_objects = []
        brace_level = 0
        current_json = ""
        in_string = False

        for char in mixed_str:
            if char == '"' and (len(current_json) == 0 or current_json[-1] != '\\'):
                in_string = not in_string

            if not in_string:
                if char == '{':
                    if brace_level == 0:
                        current_json = ""
                    brace_level += 1
                elif char == '}':
                    brace_level -= 1

            if brace_level > 0 or (brace_level == 0 and char == '}'):
                current_json += char

            if brace_level == 0 and current_json:
                try:
                    json.loads(current_json)
                    json_objects.append(current_json)
                    current_json = ""
                except json.JSONDecodeError:
                    continue

        return json_objects

    def _load(self, msg: str):
        # Convert str to json format
        assert msg.count("{") == msg.count("}"), f"{msg} is not a valid json string."
        try:
            json_strs = self._extract_json_from_string(msg)
            if len(json_strs) == 0:
                raise TypeError(f"{msg} is not a valid json string.")
            res = []
            for json_str in json_strs:
                res.append(json.loads(json_str))
            return res if len(res) > 1 else res[0]
        except Exception as e:
            lazyllm.LOG.info(f"Error: {e}")
            return ""

    def _parse_py_data_by_formatter(self, data, *, slices=None):
        if slices is None: slices = self._slices
        if not slices: return data
        if isinstance(slices[0], slice):
            return [self._parse_py_data_by_formatter(d, slices=slices[1:]) for d in data[slices[0]]]
        else:
            return self._parse_py_data_by_formatter(data[slices[0]], slices=slices[1:])
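
An end-to-end usage sketch; the expected outputs follow from the parsing logic above:

```python
from lazyllm.components.formatter import JsonFormatter

# "[:, title]" means: for every JSON object found in the text, take its "title".
f = JsonFormatter("[:, title]")
mixed = 'Here is the outline: [{"title": "# Intro", "describe": "..."}, {"title": "## Background", "describe": "..."}]'
print(f.format(mixed))  # expected: ['# Intro', '## Background']
```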