feat: refactor construct knowledge graph task (#29)
* refactor construct knowledge graph task

* Update test_disambiguate_data.py
simon824 authored Feb 25, 2024
1 parent f767a5e commit e598c0b
Showing 13 changed files with 674 additions and 665 deletions.
63 changes: 60 additions & 3 deletions hugegraph-llm/README.md
@@ -19,8 +19,65 @@ graph systems and large language models.

## Examples (knowledge graph construction by llm)

> 1. Start the HugeGraph database (you can run it via Docker). Refer to this [link](https://hub.docker.com/r/hugegraph/hugegraph) for guidance.
> 2. Run an example, e.g. `python hugegraph-llm/examples/build_kg_test.py`
>
> Note: If you need a proxy to access OpenAI's API, please set your HTTP proxy in `build_kg_test.py`.

The `KgBuilder` class is used to construct a knowledge graph. Here is a brief usage guide:

1. **Initialization**: The `KgBuilder` class is initialized with an instance of a language model. This can be obtained from the `LLMs` class.

```python
from hugegraph_llm.llms.init_llm import LLMs
from hugegraph_llm.operators.kg_construction_task import KgBuilder

TEXT = ""
builder = KgBuilder(LLMs().get_llm())
(
    builder
    .import_schema(from_hugegraph="talent_graph").print_result()
    .extract_triples(TEXT).print_result()
    .disambiguate_word_sense().print_result()
    .commit_to_hugegraph()
    .run()
)
```

2. **Import Schema**: The `import_schema` method imports a schema from a source: a HugeGraph instance, a user-defined schema, or an extraction result. The `print_result` method can be chained to print the result. (A sketch of a user-defined schema dict is shown at the end of this guide.)

```python
# Import schema from a HugeGraph instance
import_schema(from_hugegraph="xxx").print_result()
# Import schema from an extraction result
import_schema(from_extraction="xxx").print_result()
# Import schema from user-defined schema
import_schema(from_user_defined="xxx").print_result()
```

3. **Extract Triples**: The `extract_triples` method is used to extract triples from a text. The text should be passed as a string argument to the method.

```python
TEXT = "Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010."
extract_triples(TEXT).print_result()
```

4. **Disambiguate Word Sense**: The `disambiguate_word_sense` method is used to disambiguate the sense of words in the extracted triples.

```python
disambiguate_word_sense().print_result()
```

5. **Commit to HugeGraph**: The `commit_to_hugegraph` method is used to commit the constructed knowledge graph to a HugeGraph instance.

```python
commit_to_hugegraph().print_result()
```

6. **Run**: The `run` method is used to execute the chained operations.

```python
run()
```

The methods of the `KgBuilder` class can be chained together to perform a sequence of operations.
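
For a complete end-to-end sketch, the snippet below chains the same steps with a user-defined schema. The schema layout mirrors the `schema` dict in `examples/build_kg_test.py` (and the structure validated by the new `CheckSchema` operator). `MY_SCHEMA` is an illustrative name, and passing the dict directly to `import_schema(from_user_defined=...)` is an assumption here, since the example script only shows a placeholder at that spot.

```python
from hugegraph_llm.llms.init_llm import LLMs
from hugegraph_llm.operators.kg_construction_task import KgBuilder

# Schema layout mirrors the dict defined in examples/build_kg_test.py and the
# structure checked by CheckSchema ("vertices"/"edges" with label fields).
MY_SCHEMA = {
    "vertices": [
        {"vertex_label": "person", "properties": ["name", "age", "occupation"]},
        {"vertex_label": "webpage", "properties": ["name", "url"]},
    ],
    "edges": [
        {
            "edge_label": "roommate",
            "source_vertex_label": "person",
            "target_vertex_label": "person",
            "properties": {},
        }
    ],
}

TEXT = "Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010."

builder = KgBuilder(LLMs().get_llm())
(
    builder
    # Assumption: from_user_defined accepts the schema dict itself; the example
    # script only shows a placeholder variable for this argument.
    .import_schema(from_user_defined=MY_SCHEMA).print_result()
    .extract_triples(TEXT).print_result()
    .disambiguate_word_sense().print_result()
    .commit_to_hugegraph()
    .run()
)
```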
64 changes: 24 additions & 40 deletions hugegraph-llm/examples/build_kg_test.py
@@ -21,7 +21,8 @@

 if __name__ == "__main__":
-    default_llm = LLMs().get_llm()
+    builder = KgBuilder(LLMs().get_llm())

     TEXT = (
         "Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with"
         " since 2010. James, in his professional life, works as a journalist. Additionally, Sarah"
@@ -31,47 +32,30 @@
         " their distinctive digital presence through their respective webpages, showcasing their"
         " varied interests and experiences."
     )
-    builder = KgBuilder(default_llm)

-    # spo triple extract
-    builder.extract_spo_triple(TEXT).print_result().commit_to_hugegraph(spo=True).run()
-    # build kg with only text
-    builder.extract_nodes_relationships(TEXT).disambiguate_word_sense().commit_to_hugegraph().run()
-    # build kg with text and schemas
-    nodes_schemas = [
-        {
-            "label": "Person",
-            "primary_key": "name",
-            "properties": {
-                "age": "int",
-                "name": "text",
-                "occupation": "text",
-            },
-        },
-        {
-            "label": "Webpage",
-            "primary_key": "name",
-            "properties": {"name": "text", "url": "text"},
-        },
-    ]
-    relationships_schemas = [
-        {
-            "start": "Person",
-            "end": "Person",
-            "type": "roommate",
-            "properties": {"start": "int"},
-        },
-        {
-            "start": "Person",
-            "end": "Webpage",
-            "type": "owns",
-            "properties": {},
-        },
-    ]
+    schema = {
+        "vertices": [
+            {"vertex_label": "person", "properties": ["name", "age", "occupation"]},
+            {"vertex_label": "webpage", "properties": ["name", "url"]},
+        ],
+        "edges": [
+            {
+                "edge_label": "roommate",
+                "source_vertex_label": "person",
+                "target_vertex_label": "person",
+                "properties": {},
+            }
+        ],
+    }

     (
-        builder.parse_text_to_data_with_schemas(TEXT, nodes_schemas, relationships_schemas)
-        .disambiguate_data_with_schemas()
-        .commit_data_to_kg()
+        builder.import_schema(from_hugegraph="xxx")
+        .print_result()
+        # .import_schema(from_extraction="xxx").print_result()
+        # .import_schema(from_user_defined=xxx).print_result()
+        .extract_triples(TEXT)
+        .print_result()
+        .disambiguate_word_sense()
+        .commit_to_hugegraph()
         .run()
     )
2 changes: 1 addition & 1 deletion hugegraph-llm/src/config/config.ini
@@ -27,6 +27,6 @@ graph = hugegraph
 type = openai
 api_key = xxx
 secret_key = xxx
-ernie_url = https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/eb-instant?access_token=
+ernie_url = https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/completions_pro?access_token=
 model_name = gpt-3.5-turbo-16k
 max_token = 4000
4 changes: 2 additions & 2 deletions hugegraph-llm/src/hugegraph_llm/llms/ernie_bot.py
@@ -54,9 +54,9 @@ def generate(
         messages = [{"role": "user", "content": prompt}]
         url = self.base_url + self.get_access_token()
         # parameter check failed, temperature range is (0, 1.0]
-        payload = json.dumps({"messages": messages, "temperature": 0.00000000001})
+        payload = json.dumps({"messages": messages, "temperature": 0.1})
         headers = {"Content-Type": "application/json"}
-        response = requests.request("POST", url, headers=headers, data=payload, timeout=10)
+        response = requests.request("POST", url, headers=headers, data=payload, timeout=30)
         if response.status_code != 200:
             raise Exception(
                 f"Request failed with code {response.status_code}, message: {response.text}"
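To see how the config change and this method fit together, here is a minimal standalone sketch of the request the snippet above builds: the configured `ernie_url` plus an access token form the endpoint, and the payload carries the 0.1 temperature and 30-second timeout from this commit. The `access_token` value and the example prompt are placeholders, not values from the repository.

```python
import json

import requests

# Placeholder values: ernie_url comes from config.ini, and the access token
# would normally be returned by the client's get_access_token() call.
ernie_url = (
    "https://aip.baidubce.com/rpc/2.0/ai_custom/v1/wenxinworkshop/chat/"
    "completions_pro?access_token="
)
access_token = "<your-access-token>"

url = ernie_url + access_token
messages = [{"role": "user", "content": "Extract triples from: Sarah is an attorney."}]
# Temperature must stay within (0, 1.0] per the API's parameter check.
payload = json.dumps({"messages": messages, "temperature": 0.1})
headers = {"Content-Type": "application/json"}

response = requests.request("POST", url, headers=headers, data=payload, timeout=30)
if response.status_code != 200:
    raise Exception(
        f"Request failed with code {response.status_code}, message: {response.text}"
    )
print(response.json())
```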
@@ -0,0 +1,63 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


from typing import Any


class CheckSchema:
    """Validate a knowledge-graph schema dict of the form {"vertices": [...], "edges": [...]}."""

    def __init__(self, data):
        self.result = None
        self.data = data

    def run(self, schema=None) -> Any:
        # Prefer the schema supplied at construction time, falling back to the
        # one passed to run().
        schema = self.data or schema
        if not isinstance(schema, dict):
            raise ValueError("Input data is not a dictionary.")
        if "vertices" not in schema or "edges" not in schema:
            raise ValueError("Input data does not contain 'vertices' or 'edges'.")
        if not isinstance(schema["vertices"], list) or not isinstance(schema["edges"], list):
            raise ValueError("'vertices' or 'edges' in input data is not a list.")
        for vertex in schema["vertices"]:
            if not isinstance(vertex, dict):
                raise ValueError("Vertex in input data is not a dictionary.")
            if "vertex_label" not in vertex:
                raise ValueError("Vertex in input data does not contain 'vertex_label'.")
            if not isinstance(vertex["vertex_label"], str):
                raise ValueError("'vertex_label' in vertex is not of correct type.")
        for edge in schema["edges"]:
            if not isinstance(edge, dict):
                raise ValueError("Edge in input data is not a dictionary.")
            if (
                "edge_label" not in edge
                or "source_vertex_label" not in edge
                or "target_vertex_label" not in edge
            ):
                raise ValueError(
                    "Edge in input data does not contain "
                    "'edge_label', 'source_vertex_label', 'target_vertex_label'."
                )
            if (
                not isinstance(edge["edge_label"], str)
                or not isinstance(edge["source_vertex_label"], str)
                or not isinstance(edge["target_vertex_label"], str)
            ):
                raise ValueError(
                    "'edge_label', 'source_vertex_label', 'target_vertex_label' "
                    "in edge is not of correct type."
                )
        return schema
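
A quick usage sketch of the validator above, using the schema dict from `build_kg_test.py`. Calling `run()` directly is just for illustration (in the pipeline it is presumably driven by the builder), and the example assumes `CheckSchema` is already in scope, since the new file's path is not shown in this diff.

```python
schema = {
    "vertices": [
        {"vertex_label": "person", "properties": ["name", "age", "occupation"]},
        {"vertex_label": "webpage", "properties": ["name", "url"]},
    ],
    "edges": [
        {
            "edge_label": "roommate",
            "source_vertex_label": "person",
            "target_vertex_label": "person",
            "properties": {},
        }
    ],
}

# Valid input is returned unchanged.
checked = CheckSchema(schema).run()
assert checked is schema

# Structural problems raise ValueError with a descriptive message.
try:
    CheckSchema({"vertices": []}).run()
except ValueError as e:
    print(e)  # Input data does not contain 'vertices' or 'edges'.
```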