feat: introduce pylint for code analysis (#22)
simon824 authored Oct 23, 2023
1 parent 22f02a2 commit 815dac5
Showing 35 changed files with 1,136 additions and 376 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/pylint.yml
@@ -22,6 +22,11 @@ jobs:
      run: |
        python -m pip install --upgrade pip
        pip install pylint
+        pip install -r ./hugegraph-llm/requirements.txt
+        pip install -r ./hugegraph-python-client/requirements.txt
    - name: Analysing the code with pylint
      run: |
-        pylint $(git ls-files '*.py')
+        export PYTHONPATH=$(pwd)/hugegraph-llm/src:$(pwd)/hugegraph-python-client/src
+        echo ${PYTHONPATH}
+        pylint --rcfile=./pylint.conf hugegraph-llm
+        pylint --rcfile=./pylint.conf hugegraph-python-client
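The lint step changes from running pylint over every tracked *.py file to linting the two modules against the shared pylint.conf, with PYTHONPATH set so imports resolve across both source trees. A rough local reproduction of the CI step (a sketch, assuming it is run from the repository root after installing both requirements files):

import os
import subprocess
import sys

os.environ["PYTHONPATH"] = (
    f"{os.getcwd()}/hugegraph-llm/src:{os.getcwd()}/hugegraph-python-client/src"
)
for target in ("hugegraph-llm", "hugegraph-python-client"):
    # python -m pylint mirrors the workflow's pylint invocation;
    # check=True fails loudly if pylint reports problems
    subprocess.run(
        [sys.executable, "-m", "pylint", "--rcfile=./pylint.conf", target],
        check=True,
    )
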
42 changes: 27 additions & 15 deletions hugegraph-llm/examples/build_kg_test.py
@@ -14,6 +14,8 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


import os
from hugegraph_llm.operators.build_kg_operator import KgBuilder
from hugegraph_llm.llms.openai_llm import OpenAIChat
@@ -22,28 +22,35 @@
# If you need a proxy to access OpenAI's API, please set your HTTP proxy here
os.environ["http_proxy"] = ""
os.environ["https_proxy"] = ""
-api_key = ""
+API_KEY = ""

default_llm = OpenAIChat(
-api_key=api_key, model_name="gpt-3.5-turbo-16k", max_tokens=4000
+api_key=API_KEY,
+model_name="gpt-3.5-turbo-16k",
+max_tokens=4000,
)
-text = (
-    "Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. James, "
-    "in his professional life, works as a journalist. Additionally, Sarah is the proud owner of the website "
-    "www.sarahsplace.com, while James manages his own webpage, though the specific URL is not mentioned here. "
-    "These two individuals, Sarah and James, have not only forged a strong personal bond as roommates but have "
-    "also carved out their distinctive digital presence through their respective webpages, showcasing their "
-    "varied interests and experiences."
+TEXT = (
+    "Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with"
+    " since 2010. James, in his professional life, works as a journalist. Additionally, Sarah"
+    " is the proud owner of the website www.sarahsplace.com, while James manages his own"
+    " webpage, though the specific URL is not mentioned here. These two individuals, Sarah and"
+    " James, have not only forged a strong personal bond as roommates but have also carved out"
+    " their distinctive digital presence through their respective webpages, showcasing their"
+    " varied interests and experiences."
)
builder = KgBuilder(default_llm)
# build kg with only text
-builder.parse_text_to_data(text).disambiguate_data().commit_data_to_kg().run()
+builder.parse_text_to_data(TEXT).disambiguate_data().commit_data_to_kg().run()
# build kg with text and schemas
nodes_schemas = [
{
"label": "Person",
"primary_key": "name",
"properties": {"age": "int", "name": "text", "occupation": "text"},
"properties": {
"age": "int",
"name": "text",
"occupation": "text",
},
},
{
"label": "Webpage",
@@ -58,12 +67,15 @@
"type": "roommate",
"properties": {"start": "int"},
},
{"start": "Person", "end": "Webpage", "type": "owns", "properties": {}},
{
"start": "Person",
"end": "Webpage",
"type": "owns",
"properties": {},
},
]
(
-builder.parse_text_to_data_with_schemas(
-    text, nodes_schemas, relationships_schemas
-)
+builder.parse_text_to_data_with_schemas(TEXT, nodes_schemas, relationships_schemas)
.disambiguate_data_with_schemas()
.commit_data_to_kg()
.run()
3 changes: 3 additions & 0 deletions hugegraph-llm/requirements.txt
@@ -0,0 +1,3 @@
+openai==0.28.1
+retry==0.9.2
+tiktoken==0.5.1
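
Note: openai is pinned to the pre-1.0 series, which is why openai_llm.py below can still catch openai.error.AuthenticationError; openai>=1.0 removed the openai.error module.
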
2 changes: 2 additions & 0 deletions hugegraph-llm/src/hugegraph_llm/llms/base.py
@@ -14,6 +14,8 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


from abc import ABC, abstractmethod
from typing import Any, List, Optional, Callable

10 changes: 6 additions & 4 deletions hugegraph-llm/src/hugegraph_llm/llms/openai_llm.py
@@ -14,6 +14,8 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


from typing import Callable, List, Optional
import openai
import tiktoken
@@ -56,10 +58,10 @@ def generate(
return str(f"Error: {e}")
# catch authorization errors / do not retry
except openai.error.AuthenticationError as e:
return "Error: The provided OpenAI API key is invalid"
return f"Error: The provided OpenAI API key is invalid, {e}"
except Exception as e:
print(f"Retrying LLM call {e}")
-raise Exception()
+raise Exception() from e

async def generate_streaming(
self,
@@ -86,11 +88,11 @@ async def generate_streaming(
await on_token_callback(message)
return result

-def num_tokens_from_string(self, string: str) -> int:
+async def num_tokens_from_string(self, string: str) -> int:
encoding = tiktoken.encoding_for_model(self.model)
num_tokens = len(encoding.encode(string))
return num_tokens

-def max_allowed_token_length(self) -> int:
+async def max_allowed_token_length(self) -> int:
# TODO: list all models and their max tokens from api
return 2049
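
Two pylint-driven fixes appear in this file: the re-raise now chains the original exception (raise ... from e, satisfying raise-missing-from), and the two token helpers become coroutines, which means synchronous call sites such as split_string_to_fit_token_space() further down would now need to await them. For reference, a minimal sketch of the tiktoken pattern these helpers wrap, assuming the pinned tiktoken==0.5.1 and an illustrative prompt string:

import asyncio

import tiktoken

async def num_tokens_from_string(model: str, string: str) -> int:
    # encoding_for_model resolves the tokenizer for a model family;
    # gpt-3.5-turbo-* models map to the cl100k_base encoding
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(string))

print(asyncio.run(num_tokens_from_string("gpt-3.5-turbo-16k", "Hello, HugeGraph!")))
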
14 changes: 5 additions & 9 deletions hugegraph-llm/src/hugegraph_llm/operators/build_kg_operator.py
@@ -14,6 +14,8 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


from hugegraph_llm.operators.hugegraph_op.commit_data_to_kg import CommitDataToKg
from hugegraph_llm.operators.llm_op.disambiguate_data import DisambiguateData
from hugegraph_llm.operators.llm_op.parse_text_to_data import (
@@ -33,9 +35,7 @@ def parse_text_to_data(self, text: str):
self.parse_text_to_kg.append(ParseTextToData(llm=self.llm, text=text))
return self

-def parse_text_to_data_with_schemas(
-    self, text: str, nodes_schemas, relationships_schemas
-):
+def parse_text_to_data_with_schemas(self, text: str, nodes_schemas, relationships_schemas):
self.parse_text_to_kg.append(
ParseTextToDataWithSchemas(
llm=self.llm,
@@ -47,15 +47,11 @@ def parse_text_to_data_with_schemas(
return self

def disambiguate_data(self):
-self.parse_text_to_kg.append(
-    DisambiguateData(llm=self.llm, is_user_schema=False)
-)
+self.parse_text_to_kg.append(DisambiguateData(llm=self.llm, is_user_schema=False))
return self

def disambiguate_data_with_schemas(self):
-self.parse_text_to_kg.append(
-    DisambiguateData(llm=self.llm, is_user_schema=True)
-)
+self.parse_text_to_kg.append(DisambiguateData(llm=self.llm, is_user_schema=True))
return self

def commit_data_to_kg(self):
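
KgBuilder is a small pipeline builder: each method appends an operator to self.parse_text_to_kg and returns self, so calls chain fluently and run() executes the queued steps in order. A minimal sketch of that pattern (illustrative names, not the real operator classes):

class Pipeline:
    def __init__(self):
        self.steps = []

    def add(self, step):
        self.steps.append(step)
        return self  # returning self is what makes a().b().c() chaining work

    def run(self, data=None):
        for step in self.steps:
            data = step(data)  # each operator consumes the previous result
        return data

print(Pipeline().add(lambda _: "parsed").add(lambda d: d + " -> committed").run())
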
hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_data_to_kg.py
@@ -14,22 +14,23 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


import os
from pyhugegraph.client import PyHugeClient


def generate_new_relationships(nodes_schemas_data, relationships_data):
-label_id = dict()
+label_id = {}
i = 1
old_label = []
for item in nodes_schemas_data:
label = item["label"]
if label in old_label:
continue
-else:
-    label_id[label] = i
-    i += 1
-    old_label.append(label)
+label_id[label] = i
+i += 1
+old_label.append(label)
new_relationships_data = []
for relationship in relationships_data:
start = relationship["start"]
@@ -45,7 +46,7 @@ def generate_new_relationships(nodes_schemas_data, relationships_data):
for key1, value1 in end.items():
if key1 == key:
new_end = f"{value}" + ":" + f"{value1}"
-relationships_data = dict()
+relationships_data = {}
relationships_data["start"] = new_start
relationships_data["end"] = new_end
relationships_data["type"] = relationships_type
@@ -91,7 +92,7 @@ def generate_schema_nodes(data):
properties = item["properties"]
schema_statement = f"schema.vertexLabel('{label}').properties("
schema_statement += ", ".join(f"'{prop}'" for prop in properties.keys())
schema_statement += f").nullableKeys("
schema_statement += ").nullableKeys("
schema_statement += ", ".join(
f"'{prop}'" for prop in properties.keys() if prop != primary_key
)
@@ -109,11 +110,14 @@ def generate_schema_relationships(data):
end = item["end"]
schema_relationships_type = item["type"]
properties = item["properties"]
schema_statement = f"schema.edgeLabel('{schema_relationships_type}').sourceLabel('{start}').targetLabel('{end}').properties("
schema_statement = (
f"schema.edgeLabel('{schema_relationships_type}')"
f".sourceLabel('{start}').targetLabel('{end}').properties("
)
schema_statement += ", ".join(f"'{prop}'" for prop in properties.keys())
schema_statement += f").nullableKeys("
schema_statement += ").nullableKeys("
schema_statement += ", ".join(f"'{prop}'" for prop in properties.keys())
schema_statement += f").ifNotExist().create()"
schema_statement += ").ifNotExist().create()"
schema_relationships_statements.append(schema_statement)
return schema_relationships_statements
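
The dropped f-prefixes above fix pylint's f-string-without-interpolation warning, and the long edgeLabel line is split to satisfy line-too-long. For a concrete sense of the output, this is the statement generate_schema_relationships() would build for the roommate schema in build_kg_test.py (hand-traced from the f-strings in this diff):

item = {"start": "Person", "end": "Person", "type": "roommate", "properties": {"start": "int"}}
schema_statement = (
    f"schema.edgeLabel('{item['type']}')"
    f".sourceLabel('{item['start']}').targetLabel('{item['end']}').properties("
)
schema_statement += ", ".join(f"'{prop}'" for prop in item["properties"])
schema_statement += ").nullableKeys("
schema_statement += ", ".join(f"'{prop}'" for prop in item["properties"])
schema_statement += ").ifNotExist().create()"
print(schema_statement)
# schema.edgeLabel('roommate').sourceLabel('Person').targetLabel('Person').properties('start').nullableKeys('start').ifNotExist().create()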

@@ -153,12 +157,9 @@ def run(self, data: dict):
relationships = data["relationships"]
nodes_schemas = data["nodes_schemas"]
relationships_schemas = data["relationships_schemas"]
-schema = self.schema
# properties schema
schema_nodes_properties = generate_schema_properties(nodes_schemas)
-schema_relationships_properties = generate_schema_properties(
-    relationships_schemas
-)
+schema_relationships_properties = generate_schema_properties(relationships_schemas)
for schema_nodes_property in schema_nodes_properties:
exec(schema_nodes_property)

@@ -175,7 +176,6 @@ def run(self, data: dict):
for schema_relationship in schema_relationships:
exec(schema_relationship)

-g = self.client.graph()
# nodes
nodes = generate_nodes(nodes)
for node in nodes:
hugegraph-llm/src/hugegraph_llm/operators/llm_op/disambiguate_data.py
@@ -14,6 +14,8 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


import json
import re
from itertools import groupby
@@ -102,7 +104,7 @@ def generate_prompt(data) -> str:
"""


-internalRegex = r"\[(.*?)\]"
+INTERNAL_REGEX = r"\[(.*?)\]"


class DisambiguateData:
@@ -144,7 +146,7 @@ def run(self, data: dict) -> dict[str, list[any]]:
{"role": "user", "content": generate_prompt(dis_string)},
]
raw_nodes = self.llm.generate(messages)
-n = re.findall(internalRegex, raw_nodes)
+n = re.findall(INTERNAL_REGEX, raw_nodes)
new_nodes.extend(nodes_text_to_list_of_dict(n))

relationship_data = ""
@@ -172,7 +174,7 @@ def run(self, data: dict) -> dict[str, list[any]]:
{"role": "user", "content": generate_prompt(relationship_data)},
]
raw_relationships = self.llm.generate(messages)
-rels = re.findall(internalRegex, raw_relationships)
+rels = re.findall(INTERNAL_REGEX, raw_relationships)
new_relationships.extend(relationships_text_to_list_of_dict(rels))

if not self.is_user_schema:
@@ -193,7 +195,7 @@ def run(self, data: dict) -> dict[str, list[any]]:
{"role": "user", "content": generate_prompt(nodes_schemas_data)},
]
raw_nodes_schemas = self.llm.generate(messages)
-n = re.findall(internalRegex, raw_nodes_schemas)
+n = re.findall(INTERNAL_REGEX, raw_nodes_schemas)
new_nodes_schemas.extend(nodes_schemas_text_to_list_of_dict(n))

relationships_schemas_data = ""
@@ -210,12 +212,8 @@ def run(self, data: dict) -> dict[str, list[any]]:
+ "]\n"
)

-node_schemas_labels = [
-    nodes_schemas["label"] for nodes_schemas in new_nodes_schemas
-]
-relationships_schemas_data += "Valid Labels:\n" + "\n".join(
-    node_schemas_labels
-)
+node_schemas_labels = [nodes_schemas["label"] for nodes_schemas in new_nodes_schemas]
+relationships_schemas_data += "Valid Labels:\n" + "\n".join(node_schemas_labels)

messages = [
{
@@ -228,7 +226,7 @@ def run(self, data: dict) -> dict[str, list[any]]:
},
]
raw_relationships_schemas = self.llm.generate(messages)
-schemas_rels = re.findall(internalRegex, raw_relationships_schemas)
+schemas_rels = re.findall(INTERNAL_REGEX, raw_relationships_schemas)
new_relationships_schemas.extend(
relationships_schemas_text_to_list_of_dict(schemas_rels)
)
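
The module-level internalRegex constant is renamed to INTERNAL_REGEX to satisfy pylint's invalid-name convention for constants. The pattern pulls each bracketed group out of the LLM's raw reply; a small self-contained illustration (the sample reply text is invented):

import re

INTERNAL_REGEX = r"\[(.*?)\]"  # non-greedy, so each [...] block is one match
raw_nodes = 'nodes: ["Sarah", "Person", {"age": 30}] ["James", "Person", {}]'
print(re.findall(INTERNAL_REGEX, raw_nodes))
# ['"Sarah", "Person", {"age": 30}', '"James", "Person", {}']
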
hugegraph-llm/src/hugegraph_llm/operators/llm_op/parse_text_to_data.py
@@ -14,6 +14,8 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


import re
from typing import List

@@ -89,8 +91,7 @@ def split_string_to_fit_token_space(
current_chunk = ""
for chunk in chunked_data:
if (
-llm.num_tokens_from_string(current_chunk)
-+ llm.num_tokens_from_string(chunk)
+llm.num_tokens_from_string(current_chunk) + llm.num_tokens_from_string(chunk)
< allowed_tokens
):
current_chunk += chunk
@@ -123,10 +124,8 @@ def get_nodes_and_relationships_from_result(result):
nodes.extend(re.findall(internal_regex, raw_nodes))
relationships.extend(re.findall(internal_regex, raw_relationships))
nodes_schemas.extend(re.findall(internal_regex, raw_nodes_schemas))
-relationships_schemas.extend(
-    re.findall(internal_regex, raw_relationships_schemas)
-)
-result = dict()
+relationships_schemas.extend(re.findall(internal_regex, raw_relationships_schemas))
+result = {}
result["nodes"] = []
result["relationships"] = []
result["nodes_schemas"] = []
@@ -159,9 +158,7 @@ def process(self, chunk):
def run(self, data: dict) -> dict[str, list[any]]:
system_message = generate_system_message()
prompt_string = generate_prompt("")
-token_usage_per_prompt = self.llm.num_tokens_from_string(
-    system_message + prompt_string
-)
+token_usage_per_prompt = self.llm.num_tokens_from_string(system_message + prompt_string)
chunked_data = split_string_to_fit_token_space(
llm=self.llm, string=self.text, token_use_per_string=token_usage_per_prompt
)
@@ -178,9 +175,7 @@ def run(self, data: dict) -> dict[str, list[any]]:
class ParseTextToDataWithSchemas:
llm: BaseLLM

-def __init__(
-    self, llm: BaseLLM, text: str, nodes_schema, relationships_schemas
-) -> None:
+def __init__(self, llm: BaseLLM, text: str, nodes_schema, relationships_schemas) -> None:
self.llm = llm
self.text = text
self.data = {}
@@ -204,9 +199,7 @@ def process_with_schemas(self, chunk):
def run(self) -> dict[str, list[any]]:
system_message = generate_system_message_with_schemas()
prompt_string = generate_prompt_with_schemas("", "", "")
-token_usage_per_prompt = self.llm.num_tokens_from_string(
-    system_message + prompt_string
-)
+token_usage_per_prompt = self.llm.num_tokens_from_string(system_message + prompt_string)
chunked_data = split_string_to_fit_token_space(
llm=self.llm, string=self.text, token_use_per_string=token_usage_per_prompt
)
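
split_string_to_fit_token_space() packs pre-chunked text greedily until the next chunk would overflow the allowed token budget. A rough stand-alone sketch of that loop, with a whitespace word count standing in for the LLM's real num_tokens_from_string():

def num_tokens(s: str) -> int:
    return len(s.split())  # stand-in tokenizer, for illustration only

def split_to_fit(chunks, allowed_tokens):
    pieces, current = [], ""
    for chunk in chunks:
        if num_tokens(current) + num_tokens(chunk) < allowed_tokens:
            current += chunk  # still under budget: keep growing this piece
        else:
            pieces.append(current)  # flush the full piece, start a new one
            current = chunk
    if current:
        pieces.append(current)
    return pieces

print(split_to_fit(["a b ", "c d ", "e f g ", "h "], 5))
# ['a b c d ', 'e f g h ']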