Dump importer_yielder in favor of IMPORTER_REGISTRY

IMPORTER_REGISTRY is neater and does not do any magical string -> object conversion. The registry is also more in sync with improvers.

Fixes: aboutcode-org#501
Signed-off-by: Hritik Vijay <[email protected]>
Hritik14 · Jan 26, 2022 · 90fd963 · 90fd963
1 parent 67fc00e
commit 90fd963
Show file tree

Hide file tree

Showing 12 changed files with 76 additions and 513 deletions.
diff --git a/vulnerabilities/admin.py b/vulnerabilities/admin.py
@@ -25,7 +25,6 @@
 
 from vulnerabilities.models import (
     PackageRelatedVulnerability,
-    Importer,
     Package,
     Vulnerability,
     VulnerabilityReference,
@@ -55,11 +54,6 @@ class PackageRelatedVulnerabilityAdmin(admin.ModelAdmin):
     search_fields = ["vulnerability__vulnerability_id", "package__name"]
 
 
-@admin.register(Importer)
-class ImporterAdmin(admin.ModelAdmin):
-    pass
-
-
 @admin.register(VulnerabilitySeverity)
 class VulnerabilitySeverityAdmin(admin.ModelAdmin):
     pass
diff --git a/vulnerabilities/data_inference.py b/vulnerabilities/data_inference.py
@@ -90,6 +90,7 @@ def get_inferences(self, advisory_data: AdvisoryData) -> List[Inference]:
         raise NotImplementedError
 
     @classmethod
+    @property
     def qualified_name(cls):
         """
         Fully qualified name prefixed with the module name of the improver

diff --git a/vulnerabilities/data_source.py b/vulnerabilities/data_source.py
@@ -28,15 +28,13 @@
 import xml.etree.ElementTree as ET
 import datetime
 from pathlib import Path
-from typing import Any
-from typing import ContextManager
-from typing import Iterable
 from typing import List
 from typing import Mapping
 from typing import Optional
 from typing import Set
 from typing import Iterable
 from typing import Tuple
+import warnings
 
 from binaryornot.helpers import is_binary_string
 from git import DiffIndex
@@ -214,104 +212,42 @@ def __post_init__(self):
             logger.warn(f"AdvisoryData with no tzinfo: {self!r}")
 
 
-class InvalidConfigurationError(Exception):
-    pass
-
-
-@dataclasses.dataclass
-class DataSourceConfiguration:
+class NoLicenseWarning(Warning):
     pass
 
 
-class DataSource(ContextManager):
+class DataSource:
     """
-    This class defines how importers consume advisories from a data source.
-
-    It makes a distinction between newly added records since the last run and modified records. This
-    allows the import logic to pick appropriate database operations.
+    A DataSource collects data from various upstreams and returns corresponding
+    AdvisoryData objects in its advisory_data method.
+    Subclass this class to implement an importer
     """
 
-    CONFIG_CLASS = DataSourceConfiguration
+    license = ""
 
-    def __init__(
-        self,
-        last_run_date: Optional[datetime.datetime] = None,
-        cutoff_date: Optional[datetime.datetime] = None,
-        config: Optional[Mapping[str, Any]] = None,
-    ):
-        """
-        Create a DataSource instance.
-
-        :param last_run_date: Optional timestamp when this data source was last inspected
-        :param cutoff_date: Optional timestamp, records older than this will be ignored
-        :param config: Optional dictionary with subclass-specific configuration
-        """
-        config = config or {}
-        try:
-            self.config = self.__class__.CONFIG_CLASS(**config)
-            # These really should be declared in DataSourceConfiguration above but that would
-            # prevent DataSource subclasses from declaring mandatory parameters (i.e. positional
-            # arguments)
-            setattr(self.config, "last_run_date", last_run_date)
-            setattr(self.config, "cutoff_date", cutoff_date)
-        except Exception as e:
-            raise InvalidConfigurationError(str(e))
-
-        self.validate_configuration()
-
-    def __enter__(self):
-        pass
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        pass
+    def __init__(self):
+        if not self.license:
+            warnings.warn(f"Running importer {self!r} without a license", NoLicenseWarning)
 
     @classmethod
+    @property
     def qualified_name(cls):
         """
         Fully qualified name prefixed with the module name of the data source
         used in logging.
         """
         return f"{cls.__module__}.{cls.__qualname__}"
 
-    @property
-    def cutoff_timestamp(self) -> int:
-        """
-        :return: An integer Unix timestamp of the last time this data source was queried or the
-        cutoff date passed in the constructor, whichever is more recent.
-        """
-        if not hasattr(self, "_cutoff_timestamp"):
-            last_run = 0
-            if self.config.last_run_date is not None:
-                last_run = int(self.config.last_run_date.timestamp())
-
-            cutoff = 0
-            if self.config.cutoff_date is not None:
-                cutoff = int(self.config.cutoff_date.timestamp())
-
-            setattr(self, "_cutoff_timestamp", max(last_run, cutoff))
-
-        return self._cutoff_timestamp
-
-    def validate_configuration(self) -> None:
-        """
-        Subclasses can perform more complex validation than what is handled by data classes and
-        their type annotations.
-
-        This method is called in the constructor. It should raise InvalidConfigurationError with a
-        human-readable message.
-        """
-
     def advisory_data(self) -> Iterable[AdvisoryData]:
         """
-        Subclasses return AdvisoryData objects
+        Return AdvisoryData objects corresponding to the data being imported
         """
         raise NotImplementedError
 
-    def error(self, msg: str) -> None:
-        """
-        Helper method for raising InvalidConfigurationError with the class name in the message.
-        """
-        raise InvalidConfigurationError(f"{type(self).__name__}: {msg}")
+
+# TODO: Adopt the same design as that for DataSource
+class DataSourceConfiguration:
+    pass
 
 
 @dataclasses.dataclass

diff --git a/vulnerabilities/import_runner.py b/vulnerabilities/import_runner.py
@@ -32,65 +32,34 @@
 from vulnerabilities import models
 from vulnerabilities.models import Advisory
 from vulnerabilities.data_source import AdvisoryData
+from vulnerabilities.data_source import DataSource
 
 logger = logging.getLogger(__name__)
 
 
 class ImportRunner:
     """
     The ImportRunner is responsible for inserting and updating data about vulnerabilities and
-    affected/unaffected/fixed packages in the database. The two main goals for the implementation
-    are correctness and efficiency.
+    affected/unaffected/fixed packages in the database. The main goal for the implementation
+    is correctness
 
     Correctness:
         - There must be no duplicates in the database (should be enforced by the schema).
         - No valid data from the data source must be skipped or truncated.
-
-    Efficiency:
-        - Bulk inserts should be used whenever possible.
-        - Checking whether a record already exists should be kept to a minimum
-        (the data source should know this instead).
-        - All update and select operations must use indexed columns.
     """
 
-    def __init__(self, importer: models.Importer):
+    def __init__(self, importer: DataSource):
         self.importer = importer
 
-    def run(self, cutoff_date: datetime.datetime = None) -> None:
+    def run(self) -> None:
         """
         Create a data source for the given importer and store the data retrieved in the database.
-
-        cutoff_date - optional timestamp of the oldest data to include in the import
-
-        NB: Data sources provide two kinds of records; vulnerabilities and packages. Vulnerabilities
-        are potentially shared across many packages, from the same data source and from different
-        data sources. For example, a vulnerability in the Linux kernel is mentioned by advisories
-        from all Linux distributions that package this kernel version.
         """
-        logger.info(f"Starting import for {self.importer.name}.")
-        data_source = self.importer.make_data_source(cutoff_date=cutoff_date)
-        with data_source:
-            advisory_data = data_source.advisory_data()
-            importer_name = data_source.qualified_name()
-            process_advisories(advisory_datas=advisory_data, importer_name=importer_name)
-        self.importer.last_run = datetime.datetime.now(tz=datetime.timezone.utc)
-        self.importer.data_source_cfg = dataclasses.asdict(data_source.config)
-        self.importer.save()
-
-        logger.info(f"Finished import for {self.importer.name}.")
-
-
-def vuln_ref_exists(vulnerability, url, reference_id):
-    return models.VulnerabilityReference.objects.filter(
-        vulnerability=vulnerability, reference_id=reference_id, url=url
-    ).exists()
-
-
-def get_vuln_pkg_refs(vulnerability, package):
-    return models.PackageRelatedVulnerability.objects.filter(
-        vulnerability=vulnerability,
-        package=package,
-    )
+        logger.info(f"Starting import for {self.importer.qualified_name}")
+        advisory_datas = self.importer().advisory_data()
+        importer_name = self.importer.qualified_name
+        process_advisories(advisory_datas=advisory_datas, importer_name=importer_name)
+        logger.info(f"Finished import for {self.importer.qualified_name}.")
 
 
 def process_advisories(advisory_datas: Iterable[AdvisoryData], importer_name: str) -> None: