Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Transform and load dependencies from setup.cfg #718

Merged
merged 8 commits into from
Nov 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
165 changes: 119 additions & 46 deletions cartography/intel/github/repos.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import configparser
import logging
from string import Template
from typing import Any
Expand All @@ -14,7 +15,6 @@
from cartography.util import run_cleanup_job
from cartography.util import timeit


logger = logging.getLogger(__name__)

GITHUB_ORG_REPOS_PAGINATED_GRAPHQL = """
Expand Down Expand Up @@ -76,6 +76,11 @@
text
}
}
setupCfg:object(expression: "HEAD:setup.cfg") {
... on Blob {
text
}
}
}
}
}
Expand Down Expand Up @@ -121,7 +126,8 @@ def transform(repos_json: List[Dict]) -> Dict:
_transform_repo_objects(repo_object, transformed_repo_list)
_transform_repo_owners(repo_object['owner']['url'], repo_object, transformed_repo_owners)
_transform_collaborators(repo_object['collaborators'], repo_object['url'], transformed_collaborators)
_transform_python_requirements(repo_object['requirements'], repo_object['url'], transformed_requirements_files)
_transform_requirements_txt(repo_object['requirements'], repo_object['url'], transformed_requirements_files)
_transform_setup_cfg_requirements(repo_object['setupCfg'], repo_object['url'], transformed_requirements_files)
Comment on lines +129 to +130
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens if a dep is listed in both places (e.g. no bounds in setup.cfg but pinned in requirements.txt)?
What do we want to have happen (e.g. what would be most useful for our query patterns/the https://github.com/lyft/cartography/blob/master/docs/usage/samplequeries.md sample queries)?
This is likely worth a test case.

Copy link
Contributor Author

@olivia-hong olivia-hong Nov 10, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If it's listed in both places and has different specifiers for each usage, cartography will create two separate nodes, which I think makes sense rather than any sort of "merging" logic. This allows users to query what version(s) are being used or perhaps find out that they are specifying a dependency in multiple files when it's not needed. Added a test case for this.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That SGTM, thanks! Just wanted to check that we didn't have one "overwrite" the other

results = {
'repos': transformed_repo_list,
'repo_languages': transformed_repo_languages,
Expand Down Expand Up @@ -235,58 +241,125 @@ def _transform_collaborators(collaborators: Dict, repo_url: str, transformed_col
transformed_collaborators[user_permission].append(user)


def _transform_python_requirements(req_file_contents: Dict, repo_url: str, out_requirements_files: List[Dict]) -> None:
def _transform_requirements_txt(
req_file_contents: Optional[Dict],
repo_url: str,
out_requirements_files: List[Dict],
) -> None:
"""
Performs data transformations for the requirements.txt files in a GitHub repo, if available.
:param req_file_contents: str: The text contents of the requirements file.
Performs data transformations for the requirements.txt file in a GitHub repo, if available.
:param req_file_contents: Dict: The contents of the requirements.txt file.
Comment on lines +250 to +251
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

gotta love the random cleanup 🙂 , very "leave it better than you found it"

:param repo_url: str: The URL of the GitHub repo.
:param out_requirements_files: Output array to append transformed results to.
:return: Nothing.
"""
if req_file_contents and req_file_contents.get('text'):
text_contents = req_file_contents['text']
requirements_list = text_contents.split("\n")
_transform_python_requirements(requirements_list, repo_url, out_requirements_files)

parsed_list = []
for line in text_contents.split("\n"):
# Remove trailing comments and extra whitespace
stripped_line = line.partition('#')[0].strip()
if stripped_line == '':
continue
try:
req = Requirement(stripped_line)
except InvalidRequirement:
# INFO and not WARN/ERROR as we intentionally don't support all ways to specify Python requirements
logger.info(
f"Failed to parse line \"{line}\" in repo {repo_url}'s requirements.txt; skipping line.",
exc_info=True,
)
continue
parsed_list.append(req)

for req in parsed_list:
pinned_version = None
if len(req.specifier) == 1:
specifier = next(iter(req.specifier))
if specifier.operator == '==':
pinned_version = specifier.version

# Set `spec` to a default value. Example values for str(req.specifier): "<4.0,>=3.0" or "==1.0.0".
spec: Optional[str] = str(req.specifier)
# Set spec to `None` instead of empty string so that the Neo4j driver will leave the library.specifier field
# undefined. As convention, we prefer undefined values over empty strings in the graph.
if spec == '':
spec = None

canon_name = canonicalize_name(req.name)
requirement_id = f"{canon_name}|{pinned_version}" if pinned_version else canon_name

out_requirements_files.append({
"id": requirement_id,
"name": canon_name,
"specifier": spec,
"version": pinned_version,
"repo_url": repo_url,
})

def _transform_setup_cfg_requirements(
setup_cfg_contents: Optional[Dict],
repo_url: str,
out_requirements_files: List[Dict],
) -> None:
"""
Performs data transformations for the setup.cfg file in a GitHub repo, if available.
:param setup_cfg_contents: Dict: Contains contents of a repo's setup.cfg file.
:param repo_url: str: The URL of the GitHub repo.
:param out_requirements_files: Output array to append transformed results to.
:return: Nothing.
"""
if not setup_cfg_contents or not setup_cfg_contents.get('text'):
return
text_contents = setup_cfg_contents['text']
setup_cfg = configparser.ConfigParser()
try:
setup_cfg.read_string(text_contents)
except configparser.Error:
logger.info(
f"Failed to parse {repo_url}'s setup.cfg; skipping.",
exc_info=True,
)
return
requirements_list = parse_setup_cfg(setup_cfg)
_transform_python_requirements(requirements_list, repo_url, out_requirements_files)


def _transform_python_requirements(
requirements_list: List[str],
repo_url: str,
out_requirements_files: List[Dict],
) -> None:
"""
Helper function to perform data transformations on an arbitrary list of requirements.
:param requirements_list: List[str]: List of requirements
:param repo_url: str: The URL of the GitHub repo.
:param out_requirements_files: Output array to append transformed results to.
:return: Nothing.
"""
parsed_list = []
for line in requirements_list:
stripped_line = line.partition('#')[0].strip()
if stripped_line == '':
continue
try:
req = Requirement(stripped_line)
except InvalidRequirement:
# INFO and not WARN/ERROR as we intentionally don't support all ways to specify Python requirements
logger.info(
f"Failed to parse line \"{line}\" in repo {repo_url}'s requirements.txt; skipping line.",
exc_info=True,
)
continue
parsed_list.append(req)

for req in parsed_list:
pinned_version = None
if len(req.specifier) == 1:
specifier = next(iter(req.specifier))
if specifier.operator == '==':
pinned_version = specifier.version

# Set `spec` to a default value. Example values for str(req.specifier): "<4.0,>=3.0" or "==1.0.0".
spec: Optional[str] = str(req.specifier)
# Set spec to `None` instead of empty string so that the Neo4j driver will leave the library.specifier field
# undefined. As convention, we prefer undefined values over empty strings in the graph.
if spec == '':
spec = None

canon_name = canonicalize_name(req.name)
requirement_id = f"{canon_name}|{pinned_version}" if pinned_version else canon_name

out_requirements_files.append({
"id": requirement_id,
"name": canon_name,
"specifier": spec,
"version": pinned_version,
"repo_url": repo_url,
})


def parse_setup_cfg(config: configparser.ConfigParser) -> List[str]:
    """
    Collects the requirement strings declared in a parsed setup.cfg file.
    Reads `install_requires` and `setup_requires` from the `[options]` section, followed by every value in the
    `[options.extras_require]` section when that section exists.
    :param config: configparser.ConfigParser: The parsed setup.cfg file.
    :return: List[str]: The requirement strings, in the order described above.
    """
    collected: List[str] = []
    for option_name in ("install_requires", "setup_requires"):
        raw_value = config.get("options", option_name, fallback="")
        collected += _parse_setup_cfg_requirements(raw_value)

    extras_section = "options.extras_require"
    if not config.has_section(extras_section):
        return collected

    for _extra_name, raw_value in config.items(extras_section):
        collected += _parse_setup_cfg_requirements(raw_value)
    return collected


# logic taken from setuptools:
# https://github.com/pypa/setuptools/blob/f359b8a7608c7f118710af02cb5edab4e6abb942/setuptools/config.py#L241-L258
def _parse_setup_cfg_requirements(reqs: str, separator: str = ";") -> List[str]:
if "\n" in reqs:
reqs_list = reqs.splitlines()
else:
reqs_list = reqs.split(separator)

return [req.strip() for req in reqs_list if req.strip()]


@timeit
Expand Down
4 changes: 3 additions & 1 deletion docs/schema/github.md
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,9 @@ Representation of a single Programming Language [language object](https://develo

## Dependency::PythonLibrary

Representation of a Python library as listed in a [requirements.txt](https://pip.pypa.io/en/stable/user_guide/#requirements-files) file.
Representation of a Python library as listed in a [requirements.txt](https://pip.pypa.io/en/stable/user_guide/#requirements-files)
or [setup.cfg](https://setuptools.pypa.io/en/latest/userguide/declarative_config.html) file.
Within a setup.cfg file, cartography will load everything from `install_requires`, `setup_requires`, and `extras_require`.

| Field | Description |
|-------|-------------|
Expand Down
22 changes: 21 additions & 1 deletion tests/data/github/repos.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import textwrap

GET_REPOS = [
{
'name': 'sample_repo',
Expand Down Expand Up @@ -32,6 +34,14 @@
},
'collaborators': {'edges': [], 'nodes': []},
'requirements': {'text': 'cartography\nhttplib2<0.7.0\njinja2\nlxml\n-e git+https://example.com#egg=foobar\nhttps://example.com/foobar.tar.gz\npip @ https://github.com/pypa/pip/archive/1.3.1.zip#sha1=da9234ee9982d4bbb3c72346a6de940a148ea686\n'}, # noqa
'setupCfg': {
'text': textwrap.dedent('''
[options]
install_requires =
neo4j
scipy!=1.20.0 # comment
'''),
},
}, {
'name': 'SampleRepo2',
'nameWithOwner': 'example_org/SampleRepo2',
Expand Down Expand Up @@ -64,6 +74,7 @@
},
'collaborators': None,
'requirements': None,
'setupCfg': None,
},
{
'name': 'cartography',
Expand Down Expand Up @@ -139,7 +150,16 @@
],
},
'requirements': {
'text': 'cartography==0.1.0\nhttplib2>=0.7.0\njinja2\nlxml\n# This is a comment line to be ignored\n',
'text': 'cartography==0.1.0\nhttplib2>=0.7.0\njinja2\nlxml\n# This is a comment line to be ignored\nokta==0.9.0', # noqa
},
'setupCfg': {
'text': textwrap.dedent('''
[options]
install_requires =
neo4j>=1.0.0
numpy!=1.20.0 # comment
okta
'''),
},
},
]
40 changes: 38 additions & 2 deletions tests/integration/cartography/intel/github/test_repos.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def test_repository_to_collaborators(neo4j_session):

def test_pinned_python_library_to_repo(neo4j_session):
"""
Ensure that repositories are connected to pinned Python libraries.
Ensure that repositories are connected to pinned Python libraries stated as dependencies in requirements.txt.
Create the path (:RepoA)-[:REQUIRES{specifier:"0.1.0"}]->(:PythonLibrary{'Cartography'})<-[:REQUIRES]-(:RepoB),
and verify that exactly 1 repo is connected to the PythonLibrary with a specifier (RepoA).
"""
Expand All @@ -210,7 +210,7 @@ def test_pinned_python_library_to_repo(neo4j_session):

def test_upinned_python_library_to_repo(neo4j_session):
"""
Ensure that repositories are connected to un-pinned Python libraries.
Ensure that repositories are connected to un-pinned Python libraries stated as dependencies in requirements.txt.
That is, create the path
(:RepoA)-[r:REQUIRES{specifier:"0.1.0"}]->(:PythonLibrary{'Cartography'})<-[:REQUIRES]-(:RepoB),
and verify that exactly 1 repo is connected to the PythonLibrary without using a pinned specifier (RepoB).
Expand All @@ -227,3 +227,39 @@ def test_upinned_python_library_to_repo(neo4j_session):
actual_nodes = {n['repo_count'] for n in nodes}
expected_nodes = {1}
assert actual_nodes == expected_nodes


def test_setup_cfg_library_to_repo(neo4j_session):
    """
    Ensure that repositories are connected to Python libraries stated as dependencies in setup.cfg,
    and verify that exactly 2 repos are connected to the PythonLibrary with id 'neo4j'.
    """
    _ensure_local_neo4j_has_test_data(neo4j_session)

    # Note: don't query for relationship attributes in code that needs to be fast.
    query = (
        "\n"
        "MATCH (repo:GitHubRepository)-[r:REQUIRES]->(lib:PythonLibrary{id:'neo4j'})\n"
        "RETURN count(repo) as repo_count\n"
    )
    repo_counts = set()
    for record in neo4j_session.run(query):
        repo_counts.add(record['repo_count'])
    assert repo_counts == {2}


def test_python_library_in_multiple_requirements_files(neo4j_session):
    """
    Ensure that a Python library stated as a dependency in both setup.cfg and requirements.txt
    is connected to its repository, and that a separate node is created for each file when the
    dependency carries a different specifier in each one.
    """
    _ensure_local_neo4j_has_test_data(neo4j_session)

    query = (
        "\n"
        "MATCH (repo:GitHubRepository)-[r:REQUIRES]->(lib:PythonLibrary{name:'okta'})\n"
        "RETURN lib.id as lib_ids\n"
    )
    found_ids = set()
    for record in neo4j_session.run(query):
        found_ids.add(record['lib_ids'])
    assert len(found_ids) == 2
    assert found_ids == {'okta', 'okta|0.9.0'}