feat: introduce pylint for code analysis (#22)
simon824 authored Oct 23, 2023
1 parent 22f02a2 commit 815dac5
Showing 35 changed files with 1,136 additions and 376 deletions.
7 changes: 6 additions & 1 deletion .github/workflows/pylint.yml
@@ -22,6 +22,11 @@ jobs:
      run: |
        python -m pip install --upgrade pip
        pip install pylint
+        pip install -r ./hugegraph-llm/requirements.txt
+        pip install -r ./hugegraph-python-client/requirements.txt
    - name: Analysing the code with pylint
      run: |
-        pylint $(git ls-files '*.py')
+        export PYTHONPATH=$(pwd)/hugegraph-llm/src:$(pwd)/hugegraph-python-client/src
+        echo ${PYTHONPATH}
+        pylint --rcfile=./pylint.conf hugegraph-llm
+        pylint --rcfile=./pylint.conf hugegraph-python-client
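The lint step changes from running pylint over every tracked *.py file to linting the two modules against the shared pylint.conf, with PYTHONPATH set so imports resolve across both source trees. A rough local reproduction of the CI step (a sketch, assuming it is run from the repository root after installing both requirements files):

import os
import subprocess
import sys

os.environ["PYTHONPATH"] = (
    f"{os.getcwd()}/hugegraph-llm/src:{os.getcwd()}/hugegraph-python-client/src"
)
for target in ("hugegraph-llm", "hugegraph-python-client"):
    # python -m pylint mirrors the workflow's pylint invocation;
    # check=True fails loudly if pylint reports problems
    subprocess.run(
        [sys.executable, "-m", "pylint", "--rcfile=./pylint.conf", target],
        check=True,
    )
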
42 changes: 27 additions & 15 deletions hugegraph-llm/examples/build_kg_test.py
@@ -14,6 +14,8 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


import os
from hugegraph_llm.operators.build_kg_operator import KgBuilder
from hugegraph_llm.llms.openai_llm import OpenAIChat
@@ -22,28 +22,35 @@
# If you need a proxy to access OpenAI's API, please set your HTTP proxy here
os.environ["http_proxy"] = ""
os.environ["https_proxy"] = ""
-api_key = ""
+API_KEY = ""

default_llm = OpenAIChat(
-api_key=api_key, model_name="gpt-3.5-turbo-16k", max_tokens=4000
+api_key=API_KEY,
+model_name="gpt-3.5-turbo-16k",
+max_tokens=4000,
)
-text = (
-    "Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with since 2010. James, "
-    "in his professional life, works as a journalist. Additionally, Sarah is the proud owner of the website "
-    "www.sarahsplace.com, while James manages his own webpage, though the specific URL is not mentioned here. "
-    "These two individuals, Sarah and James, have not only forged a strong personal bond as roommates but have "
-    "also carved out their distinctive digital presence through their respective webpages, showcasing their "
-    "varied interests and experiences."
+TEXT = (
+    "Meet Sarah, a 30-year-old attorney, and her roommate, James, whom she's shared a home with"
+    " since 2010. James, in his professional life, works as a journalist. Additionally, Sarah"
+    " is the proud owner of the website www.sarahsplace.com, while James manages his own"
+    " webpage, though the specific URL is not mentioned here. These two individuals, Sarah and"
+    " James, have not only forged a strong personal bond as roommates but have also carved out"
+    " their distinctive digital presence through their respective webpages, showcasing their"
+    " varied interests and experiences."
)
builder = KgBuilder(default_llm)
# build kg with only text
-builder.parse_text_to_data(text).disambiguate_data().commit_data_to_kg().run()
+builder.parse_text_to_data(TEXT).disambiguate_data().commit_data_to_kg().run()
# build kg with text and schemas
nodes_schemas = [
{
"label": "Person",
"primary_key": "name",
"properties": {"age": "int", "name": "text", "occupation": "text"},
"properties": {
"age": "int",
"name": "text",
"occupation": "text",
},
},
{
"label": "Webpage",
@@ -58,12 +67,15 @@
"type": "roommate",
"properties": {"start": "int"},
},
{"start": "Person", "end": "Webpage", "type": "owns", "properties": {}},
{
"start": "Person",
"end": "Webpage",
"type": "owns",
"properties": {},
},
]
(
-builder.parse_text_to_data_with_schemas(
-    text, nodes_schemas, relationships_schemas
-)
+builder.parse_text_to_data_with_schemas(TEXT, nodes_schemas, relationships_schemas)
.disambiguate_data_with_schemas()
.commit_data_to_kg()
.run()
3 changes: 3 additions & 0 deletions hugegraph-llm/requirements.txt
@@ -0,0 +1,3 @@
+openai==0.28.1
+retry==0.9.2
+tiktoken==0.5.1
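
Note: openai is pinned to the pre-1.0 series, which is why openai_llm.py below can still catch openai.error.AuthenticationError; openai>=1.0 removed the openai.error module.
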
2 changes: 2 additions & 0 deletions hugegraph-llm/src/hugegraph_llm/llms/base.py
@@ -14,6 +14,8 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


from abc import ABC, abstractmethod
from typing import Any, List, Optional, Callable

10 changes: 6 additions & 4 deletions hugegraph-llm/src/hugegraph_llm/llms/openai_llm.py
@@ -14,6 +14,8 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


from typing import Callable, List, Optional
import openai
import tiktoken
@@ -56,10 +58,10 @@ def generate(
return str(f"Error: {e}")
# catch authorization errors / do not retry
except openai.error.AuthenticationError as e:
return "Error: The provided OpenAI API key is invalid"
return f"Error: The provided OpenAI API key is invalid, {e}"
except Exception as e:
print(f"Retrying LLM call {e}")
-raise Exception()
+raise Exception() from e

async def generate_streaming(
self,
@@ -86,11 +88,11 @@ async def generate_streaming(
await on_token_callback(message)
return result

-def num_tokens_from_string(self, string: str) -> int:
+async def num_tokens_from_string(self, string: str) -> int:
encoding = tiktoken.encoding_for_model(self.model)
num_tokens = len(encoding.encode(string))
return num_tokens

-def max_allowed_token_length(self) -> int:
+async def max_allowed_token_length(self) -> int:
# TODO: list all models and their max tokens from api
return 2049
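
Two pylint-driven fixes appear in this file: the re-raise now chains the original exception (raise ... from e, satisfying raise-missing-from), and the two token helpers become coroutines, which means synchronous call sites such as split_string_to_fit_token_space() further down would now need to await them. For reference, a minimal sketch of the tiktoken pattern these helpers wrap, assuming the pinned tiktoken==0.5.1 and an illustrative prompt string:

import asyncio

import tiktoken

async def num_tokens_from_string(model: str, string: str) -> int:
    # encoding_for_model resolves the tokenizer for a model family;
    # gpt-3.5-turbo-* models map to the cl100k_base encoding
    encoding = tiktoken.encoding_for_model(model)
    return len(encoding.encode(string))

print(asyncio.run(num_tokens_from_string("gpt-3.5-turbo-16k", "Hello, HugeGraph!")))
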
14 changes: 5 additions & 9 deletions hugegraph-llm/src/hugegraph_llm/operators/build_kg_operator.py
@@ -14,6 +14,8 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


from hugegraph_llm.operators.hugegraph_op.commit_data_to_kg import CommitDataToKg
from hugegraph_llm.operators.llm_op.disambiguate_data import DisambiguateData
from hugegraph_llm.operators.llm_op.parse_text_to_data import (
@@ -33,9 +35,7 @@ def parse_text_to_data(self, text: str):
self.parse_text_to_kg.append(ParseTextToData(llm=self.llm, text=text))
return self

-def parse_text_to_data_with_schemas(
-    self, text: str, nodes_schemas, relationships_schemas
-):
+def parse_text_to_data_with_schemas(self, text: str, nodes_schemas, relationships_schemas):
self.parse_text_to_kg.append(
ParseTextToDataWithSchemas(
llm=self.llm,
@@ -47,15 +47,11 @@ def parse_text_to_data_with_schemas(
return self

def disambiguate_data(self):
-self.parse_text_to_kg.append(
-    DisambiguateData(llm=self.llm, is_user_schema=False)
-)
+self.parse_text_to_kg.append(DisambiguateData(llm=self.llm, is_user_schema=False))
return self

def disambiguate_data_with_schemas(self):
-self.parse_text_to_kg.append(
-    DisambiguateData(llm=self.llm, is_user_schema=True)
-)
+self.parse_text_to_kg.append(DisambiguateData(llm=self.llm, is_user_schema=True))
return self

def commit_data_to_kg(self):
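
KgBuilder is a small pipeline builder: each method appends an operator to self.parse_text_to_kg and returns self, so calls chain fluently and run() executes the queued steps in order. A minimal sketch of that pattern (illustrative names, not the real operator classes):

class Pipeline:
    def __init__(self):
        self.steps = []

    def add(self, step):
        self.steps.append(step)
        return self  # returning self is what makes a().b().c() chaining work

    def run(self, data=None):
        for step in self.steps:
            data = step(data)  # each operator consumes the previous result
        return data

print(Pipeline().add(lambda _: "parsed").add(lambda d: d + " -> committed").run())
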
hugegraph-llm/src/hugegraph_llm/operators/hugegraph_op/commit_data_to_kg.py
@@ -14,22 +14,23 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


import os
from pyhugegraph.client import PyHugeClient


def generate_new_relationships(nodes_schemas_data, relationships_data):
-label_id = dict()
+label_id = {}
i = 1
old_label = []
for item in nodes_schemas_data:
label = item["label"]
if label in old_label:
continue
-else:
-    label_id[label] = i
-    i += 1
-    old_label.append(label)
+label_id[label] = i
+i += 1
+old_label.append(label)
new_relationships_data = []
for relationship in relationships_data:
start = relationship["start"]
@@ -45,7 +46,7 @@ def generate_new_relationships(nodes_schemas_data, relationships_data):
for key1, value1 in end.items():
if key1 == key:
new_end = f"{value}" + ":" + f"{value1}"
-relationships_data = dict()
+relationships_data = {}
relationships_data["start"] = new_start
relationships_data["end"] = new_end
relationships_data["type"] = relationships_type
@@ -91,7 +92,7 @@ def generate_schema_nodes(data):
properties = item["properties"]
schema_statement = f"schema.vertexLabel('{label}').properties("
schema_statement += ", ".join(f"'{prop}'" for prop in properties.keys())
schema_statement += f").nullableKeys("
schema_statement += ").nullableKeys("
schema_statement += ", ".join(
f"'{prop}'" for prop in properties.keys() if prop != primary_key
)
@@ -109,11 +110,14 @@ def generate_schema_relationships(data):
end = item["end"]
schema_relationships_type = item["type"]
properties = item["properties"]
schema_statement = f"schema.edgeLabel('{schema_relationships_type}').sourceLabel('{start}').targetLabel('{end}').properties("
schema_statement = (
f"schema.edgeLabel('{schema_relationships_type}')"
f".sourceLabel('{start}').targetLabel('{end}').properties("
)
schema_statement += ", ".join(f"'{prop}'" for prop in properties.keys())
schema_statement += f").nullableKeys("
schema_statement += ").nullableKeys("
schema_statement += ", ".join(f"'{prop}'" for prop in properties.keys())
schema_statement += f").ifNotExist().create()"
schema_statement += ").ifNotExist().create()"
schema_relationships_statements.append(schema_statement)
return schema_relationships_statements
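
The dropped f-prefixes above fix pylint's f-string-without-interpolation warning, and the long edgeLabel line is split to satisfy line-too-long. For a concrete sense of the output, this is the statement generate_schema_relationships() would build for the roommate schema in build_kg_test.py (hand-traced from the f-strings in this diff):

item = {"start": "Person", "end": "Person", "type": "roommate", "properties": {"start": "int"}}
schema_statement = (
    f"schema.edgeLabel('{item['type']}')"
    f".sourceLabel('{item['start']}').targetLabel('{item['end']}').properties("
)
schema_statement += ", ".join(f"'{prop}'" for prop in item["properties"])
schema_statement += ").nullableKeys("
schema_statement += ", ".join(f"'{prop}'" for prop in item["properties"])
schema_statement += ").ifNotExist().create()"
print(schema_statement)
# schema.edgeLabel('roommate').sourceLabel('Person').targetLabel('Person').properties('start').nullableKeys('start').ifNotExist().create()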

@@ -153,12 +157,9 @@ def run(self, data: dict):
relationships = data["relationships"]
nodes_schemas = data["nodes_schemas"]
relationships_schemas = data["relationships_schemas"]
-schema = self.schema
# properties schema
schema_nodes_properties = generate_schema_properties(nodes_schemas)
-schema_relationships_properties = generate_schema_properties(
-    relationships_schemas
-)
+schema_relationships_properties = generate_schema_properties(relationships_schemas)
for schema_nodes_property in schema_nodes_properties:
exec(schema_nodes_property)

@@ -175,7 +176,6 @@ def run(self, data: dict):
for schema_relationship in schema_relationships:
exec(schema_relationship)

-g = self.client.graph()
# nodes
nodes = generate_nodes(nodes)
for node in nodes:
hugegraph-llm/src/hugegraph_llm/operators/llm_op/disambiguate_data.py
@@ -14,6 +14,8 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


import json
import re
from itertools import groupby
@@ -102,7 +104,7 @@ def generate_prompt(data) -> str:
"""


-internalRegex = r"\[(.*?)\]"
+INTERNAL_REGEX = r"\[(.*?)\]"


class DisambiguateData:
@@ -144,7 +146,7 @@ def run(self, data: dict) -> dict[str, list[any]]:
{"role": "user", "content": generate_prompt(dis_string)},
]
raw_nodes = self.llm.generate(messages)
-n = re.findall(internalRegex, raw_nodes)
+n = re.findall(INTERNAL_REGEX, raw_nodes)
new_nodes.extend(nodes_text_to_list_of_dict(n))

relationship_data = ""
@@ -172,7 +174,7 @@ def run(self, data: dict) -> dict[str, list[any]]:
{"role": "user", "content": generate_prompt(relationship_data)},
]
raw_relationships = self.llm.generate(messages)
-rels = re.findall(internalRegex, raw_relationships)
+rels = re.findall(INTERNAL_REGEX, raw_relationships)
new_relationships.extend(relationships_text_to_list_of_dict(rels))

if not self.is_user_schema:
@@ -193,7 +195,7 @@ def run(self, data: dict) -> dict[str, list[any]]:
{"role": "user", "content": generate_prompt(nodes_schemas_data)},
]
raw_nodes_schemas = self.llm.generate(messages)
-n = re.findall(internalRegex, raw_nodes_schemas)
+n = re.findall(INTERNAL_REGEX, raw_nodes_schemas)
new_nodes_schemas.extend(nodes_schemas_text_to_list_of_dict(n))

relationships_schemas_data = ""
@@ -210,12 +212,8 @@ def run(self, data: dict) -> dict[str, list[any]]:
+ "]\n"
)

-node_schemas_labels = [
-    nodes_schemas["label"] for nodes_schemas in new_nodes_schemas
-]
-relationships_schemas_data += "Valid Labels:\n" + "\n".join(
-    node_schemas_labels
-)
+node_schemas_labels = [nodes_schemas["label"] for nodes_schemas in new_nodes_schemas]
+relationships_schemas_data += "Valid Labels:\n" + "\n".join(node_schemas_labels)

messages = [
{
@@ -228,7 +226,7 @@ def run(self, data: dict) -> dict[str, list[any]]:
},
]
raw_relationships_schemas = self.llm.generate(messages)
-schemas_rels = re.findall(internalRegex, raw_relationships_schemas)
+schemas_rels = re.findall(INTERNAL_REGEX, raw_relationships_schemas)
new_relationships_schemas.extend(
relationships_schemas_text_to_list_of_dict(schemas_rels)
)
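
The module-level internalRegex constant is renamed to INTERNAL_REGEX to satisfy pylint's invalid-name convention for constants. The pattern pulls each bracketed group out of the LLM's raw reply; a small self-contained illustration (the sample reply text is invented):

import re

INTERNAL_REGEX = r"\[(.*?)\]"  # non-greedy, so each [...] block is one match
raw_nodes = 'nodes: ["Sarah", "Person", {"age": 30}] ["James", "Person", {}]'
print(re.findall(INTERNAL_REGEX, raw_nodes))
# ['"Sarah", "Person", {"age": 30}', '"James", "Person", {}']
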
hugegraph-llm/src/hugegraph_llm/operators/llm_op/parse_text_to_data.py
@@ -14,6 +14,8 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.


import re
from typing import List

@@ -89,8 +91,7 @@ def split_string_to_fit_token_space(
current_chunk = ""
for chunk in chunked_data:
if (
-llm.num_tokens_from_string(current_chunk)
-+ llm.num_tokens_from_string(chunk)
+llm.num_tokens_from_string(current_chunk) + llm.num_tokens_from_string(chunk)
< allowed_tokens
):
current_chunk += chunk
@@ -123,10 +124,8 @@ def get_nodes_and_relationships_from_result(result):
nodes.extend(re.findall(internal_regex, raw_nodes))
relationships.extend(re.findall(internal_regex, raw_relationships))
nodes_schemas.extend(re.findall(internal_regex, raw_nodes_schemas))
-relationships_schemas.extend(
-    re.findall(internal_regex, raw_relationships_schemas)
-)
-result = dict()
+relationships_schemas.extend(re.findall(internal_regex, raw_relationships_schemas))
+result = {}
result["nodes"] = []
result["relationships"] = []
result["nodes_schemas"] = []
@@ -159,9 +158,7 @@ def process(self, chunk):
def run(self, data: dict) -> dict[str, list[any]]:
system_message = generate_system_message()
prompt_string = generate_prompt("")
-token_usage_per_prompt = self.llm.num_tokens_from_string(
-    system_message + prompt_string
-)
+token_usage_per_prompt = self.llm.num_tokens_from_string(system_message + prompt_string)
chunked_data = split_string_to_fit_token_space(
llm=self.llm, string=self.text, token_use_per_string=token_usage_per_prompt
)
@@ -178,9 +175,7 @@ def run(self, data: dict) -> dict[str, list[any]]:
class ParseTextToDataWithSchemas:
llm: BaseLLM

-def __init__(
-    self, llm: BaseLLM, text: str, nodes_schema, relationships_schemas
-) -> None:
+def __init__(self, llm: BaseLLM, text: str, nodes_schema, relationships_schemas) -> None:
self.llm = llm
self.text = text
self.data = {}
@@ -204,9 +199,7 @@ def process_with_schemas(self, chunk):
def run(self) -> dict[str, list[any]]:
system_message = generate_system_message_with_schemas()
prompt_string = generate_prompt_with_schemas("", "", "")
-token_usage_per_prompt = self.llm.num_tokens_from_string(
-    system_message + prompt_string
-)
+token_usage_per_prompt = self.llm.num_tokens_from_string(system_message + prompt_string)
chunked_data = split_string_to_fit_token_space(
llm=self.llm, string=self.text, token_use_per_string=token_usage_per_prompt
)
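
split_string_to_fit_token_space() packs pre-chunked text greedily until the next chunk would overflow the allowed token budget. A rough stand-alone sketch of that loop, with a whitespace word count standing in for the LLM's real num_tokens_from_string():

def num_tokens(s: str) -> int:
    return len(s.split())  # stand-in tokenizer, for illustration only

def split_to_fit(chunks, allowed_tokens):
    pieces, current = [], ""
    for chunk in chunks:
        if num_tokens(current) + num_tokens(chunk) < allowed_tokens:
            current += chunk  # still under budget: keep growing this piece
        else:
            pieces.append(current)  # flush the full piece, start a new one
            current = chunk
    if current:
        pieces.append(current)
    return pieces

print(split_to_fit(["a b ", "c d ", "e f g ", "h "], 5))
# ['a b c d ', 'e f g h ']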