Historical data draft #476

Draft · wants to merge 9 commits into main
4 changes: 4 additions & 0 deletions requirements.txt
@@ -11,6 +11,8 @@ cssselect2==0.7.0
defusedxml==0.7.1
distro==1.7.0
et-xmlfile==1.1.0
gitdb==4.0.10
GitPython==3.1.34
html5lib==1.1
ics==0.7.2
idna==3.3
@@ -29,12 +31,14 @@ PyYAML==6.0
reportlab==3.6.11
requests==2.28.1
six==1.16.0
smmap==5.0.0
soupsieve==2.3.2.post1
svglib==1.4.1
tabula-py==2.4.0
TatSu==5.8.3
tinycss2==1.2.1
tomli==2.0.1
typing_extensions==4.7.1
urllib3==1.26.9
webencodings==0.5.1
yamllint==1.27.1
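The new pins support the Git history traversal added in src/fetch_historical_data.py below (gitdb and smmap are GitPython's own dependencies, and typing_extensions appears to come along with them). As a rough sketch of the GitPython surface the script relies on, using the repository URL, branch, and file name defined in that script:

from git import Repo

# Clone to a throwaway directory, then walk the commits on main that
# touched manually_specified.yaml, newest first
repo = Repo.clone_from("https://github.com/beyarkay/eskom-calendar.git",
                       "eskom-calendar-temp", branch="main")
for commit in repo.iter_commits("main", paths="manually_specified.yaml"):
    blob = commit.tree / "manually_specified.yaml"  # the file as of this commit
    print(commit.hexsha[:8], len(blob.data_stream.read()), "bytes")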
147 changes: 147 additions & 0 deletions src/aggregate_historical_data.py
@@ -0,0 +1,147 @@
import datetime
import os
import yaml
from dateutil.parser import parse
import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    filename='aggregate_historical_data.log',
                    filemode='a')
logger = logging.getLogger()
NEW_DIRECTORY = "historical_versions"
AGGREGATED_FILE = "historical_data.yml"


def find_overlaps(sorted_data):
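    """Return pairs of entries whose time ranges overlap.

    Assumes the input is sorted ascending by 'start'. ('finsh', not
    'finish', is the key used by manually_specified.yaml.)
    """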
    overlaps = []
    for i in range(len(sorted_data)):
        for j in range(i + 1, len(sorted_data)):
            if sorted_data[i]['finsh'] > sorted_data[j]['start']:
                overlaps.append((sorted_data[i], sorted_data[j]))
            if sorted_data[j]['start'] > sorted_data[i]['finsh']:
                break
    return overlaps


def find_erroneous_line(file):
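    """Debugging helper: run yaml.safe_load over a file one line at a time
    and report lines that fail to parse on their own. Not called by the
    aggregation flow.
    """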
    with open(file, 'r') as f:
        for i, line in enumerate(f, start=1):
            try:
                yaml.safe_load(line)
            except Exception as inner_e:
                print(f"Error on line {i}: {inner_e}")
                print(f"Line content: {line.strip()}")


def load_yaml_files(directory):
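    """Load every .yml/.yaml snapshot in `directory`, tagging each change
    entry with the commit time embedded in its filename
    (<name>_<hexsha>_<ISO timestamp>.yml, as written by fetch_historical_data.py).
    """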
    num_file_import_errors = 0
    files = sorted(
        [os.path.join(directory, f) for f in os.listdir(directory)
         if f.endswith('.yml') or f.endswith(".yaml")])
    loadshedding_data = []
    for file in files:
        try:
            with open(file, 'r') as f:
                date_time_str = file.split("_")[-1]
                date_time_str = date_time_str.split(".")[0]
                data = yaml.safe_load(f)
                for change in data["changes"]:
                    change['commit_time'] = datetime.datetime.fromisoformat(date_time_str)
                loadshedding_data.append(data)
        except Exception:
            num_file_import_errors += 1
            logger.exception(f"Error in file {file}")
            continue
    print(f"There were {num_file_import_errors} files that could not be imported. Check aggregate_historical_data.log"
          f" for details")
    return loadshedding_data


def resolve_conflicts(sorted_data):
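    """Resolve overlapping entries, preferring data from newer commits.

    When two overlapping entries target the same areas (matching 'include'
    or 'exclude' tags), the entry from the older commit is either truncated
    to end where the newer one starts, or dropped entirely.
    """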
    resolved_data = []

    i = 0
    while i < len(sorted_data):
        current_entry = sorted_data[i].copy()

        j = i + 1
        while j < len(sorted_data) and current_entry['finsh'] > sorted_data[j]['start']:
            next_entry = sorted_data[j]

            # Only consider overlaps where 'include' or 'exclude' tags match
            if ('include' in current_entry and 'include' in next_entry and
                    current_entry['include'] == next_entry['include']) or \
                    ('exclude' in current_entry and 'exclude' in next_entry and
                     current_entry['exclude'] == next_entry['exclude']):

                # If the next entry comes from a newer commit, truncate the
                # current entry so it ends where the newer entry starts
                if next_entry['commit_time'] > current_entry['commit_time']:
                    current_entry['finsh'] = next_entry['start']
                    j += 1
                else:
                    # The next entry comes from an older commit, so it is
                    # superseded by the current entry and can be dropped.
                    # Don't advance j: pop() shifts the next element into slot j.
                    sorted_data.pop(j)
            else:
                j += 1

        # Remove the bookkeeping commit time before emitting the entry
        current_entry.pop('commit_time', None)
        resolved_data.append(current_entry)

        i += 1

    return resolved_data


def write_yaml(filtered_aggregated_data, file_path):
    with open(file_path, 'w') as f:
        f.write("# This data automatically generated by aggregate_historical_data.py\n")
        yaml.safe_dump({'historical_data': filtered_aggregated_data}, f)


def aggregate_data(data_as_dict, key):
    aggregated_data = []
    for entry in data_as_dict:
        for item in entry[key]:
            aggregated_data.append(item)
    return aggregated_data


def create_historical_data(directory):
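    """Build the aggregated history for all snapshots in `directory`:
    load and flatten the 'changes' lists, drop malformed entries, parse
    dates, de-duplicate (keeping the newest commit's version), resolve
    overlaps, and return the entries sorted newest-first.
    """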
    raw_data = load_yaml_files(directory)
    aggregated_data = aggregate_data(raw_data, 'changes')
    num_entries_unfiltered = len(aggregated_data)
    # Drop entries missing either time bound; both are required below
    aggregated_data = [entry for entry in aggregated_data
                       if 'start' in entry and 'finsh' in entry]

    # Ensure all 'start' and 'finsh' values are datetime objects
    for entry in aggregated_data:
        if isinstance(entry.get('start'), str):
            entry['start'] = parse(entry['start'])
        if isinstance(entry.get('finsh'), str):
            entry['finsh'] = parse(entry['finsh'])

    unique_entries_dict = {}

    for entry in aggregated_data:
        # Create a unique key for each entry from a subset of the fields
        entry_key = (entry['start'], entry['finsh'], entry['stage'],
                     entry.get('include', ''), entry.get('exclude', ''))

        # If this is a newer version of an already-seen entry, replace the older version
        if entry_key not in unique_entries_dict or \
                entry['commit_time'] > unique_entries_dict[entry_key]['commit_time']:
            unique_entries_dict[entry_key] = entry

    unique_entries = list(unique_entries_dict.values())

    # Sort ascending by start time, breaking ties with the newest commit first
    sorted_data = sorted(unique_entries, key=lambda x: (x['start'], -x['commit_time'].timestamp()))
    num_entries_filtered = len(sorted_data)
    diff = num_entries_unfiltered - num_entries_filtered
    logger.info(f"{diff} entries were incorrectly formatted or duplicates and were dropped")
    resolved_data = resolve_conflicts(sorted_data)

    # Check for leftover overlaps while the data is still sorted ascending by start,
    # then sort newest-first for output
    overlaps = find_overlaps(resolved_data)
    if overlaps:
        logger.warning(f"{len(overlaps)} overlapping entries remain after conflict resolution")
    resolved_data.sort(key=lambda x: x['start'], reverse=True)
    return resolved_data


if __name__ == "__main__":
    data = create_historical_data("./historical_versions")
    write_yaml(data, "test_historical.yaml")
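For reference, a minimal sketch of the input this script expects: each snapshot in historical_versions/ is shaped like manually_specified.yaml. The entry values below are borrowed from the test file later in this diff; the filename in the comment, which encodes the commit hash and time, is illustrative.

import yaml

# e.g. historical_versions/manually_specified_<hexsha>_2023-09-05T08:00:00.yml
snapshot = """
changes:
- stage: 6
  start: 2023-09-04T22:00:00
  finsh: 2023-09-05T05:00:00
  source: https://twitter.com/CityofCT/status/1698744757000831345
  include: coct
"""
data = yaml.safe_load(snapshot)
print(data["changes"][0]["finsh"])  # 2023-09-05 05:00:00 (PyYAML parses ISO timestamps)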
77 changes: 77 additions & 0 deletions src/fetch_historical_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import os
import shutil
import yaml
from git import Repo
from datetime import datetime
import logging

# Configure logging
logging.basicConfig(filename='file_versions_log.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
REPO_URL = "https://github.com/beyarkay/eskom-calendar.git"
BRANCH_NAME = "main"
FILE_NAME = "manually_specified.yaml"
NEW_DIRECTORY = "historical_versions"
TEMP_PATH = "eskom-calendar-temp"


def get_file_history(repo_name, branch_name, file_name):
"""Get commit hashes where the file changed."""
git_commits = list(repo_name.iter_commits(branch_name, paths=file_name))
return git_commits


def copy_file_versions(commit_hashes, file_name, new_directory):
"""Copy versions of the file to a new directory."""
if not os.path.exists(new_directory):
os.makedirs(new_directory)

for i, commit in enumerate(commit_hashes):
try:
# Get the file content at this commit
file_content = (commit.tree / file_name).data_stream.read()
except Exception as e:
logging.error(f"Failed to read file content at commit {commit.hexsha} due to: {str(e)}")
continue

try:
# Create a new file with the content at this commit with commit hash and time
commit_time = datetime.fromtimestamp(commit.committed_date).isoformat()
new_file_name = os.path.join(new_directory,
f"{os.path.splitext(file_name)[0]}_{commit.hexsha}_{commit_time}.yml")
with open(new_file_name, "wb") as f:
f.write(file_content)
except Exception as e:
logging.error(f"Failed to write file content due to: {str(e)}")
continue

logging.info(f"Successfully copied version {i + 1}/{len(commit_hashes)} to {new_file_name}")


if __name__ == "__main__":
    try:
        repo = Repo.clone_from(REPO_URL, TEMP_PATH, branch=BRANCH_NAME)
    except Exception as e:
        if os.path.exists(TEMP_PATH):
            shutil.rmtree(TEMP_PATH)
        print(f"Cannot clone {REPO_URL} due to {e}")
        exit(1)

    try:
        # Get the commits where the file changed and copy each version out
        commits = get_file_history(repo, BRANCH_NAME, FILE_NAME)
        copy_file_versions(commits, FILE_NAME, NEW_DIRECTORY)
        logging.info(f"Copied all versions of {FILE_NAME} to {NEW_DIRECTORY}")
        print(f"Copied all versions of {FILE_NAME} to {NEW_DIRECTORY}")
    except Exception as e:
        logging.critical(f"Script failed due to: {str(e)}")

    # Remove the temporary cloned repository
    if os.path.exists(TEMP_PATH):
        try:
            shutil.rmtree(TEMP_PATH)
            print(f"Directory {TEMP_PATH} has been removed successfully")
        except OSError as error:
            print(error)
            print(f"Directory {TEMP_PATH} cannot be removed")
37 changes: 37 additions & 0 deletions src/historical_data_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import unittest
from datetime import datetime
from aggregate_historical_data import create_historical_data


class TestOverlapResolution(unittest.TestCase):

    def test_find_overlaps(self):
        # Expected output, newest entry first
        expected_output = [
            {
                'stage': 6, 'start': datetime.fromisoformat('2023-09-04T22:00:00'),
                'finsh': datetime.fromisoformat('2023-09-05T05:00:00'),
                'source': 'https://twitter.com/CityofCT/status/1698744757000831345', 'include': 'coct'
            },
            {
                'stage': 5, 'start': datetime.fromisoformat('2023-09-04T18:00:00'),
                'finsh': datetime.fromisoformat('2023-09-04T22:00:00'),
                'source': 'https://twitter.com/CityofCT/status/1698744757000831345', 'include': 'coct'
            },
            {
                'stage': 3, 'start': datetime.fromisoformat('2023-09-04T10:00:00'),
                'finsh': datetime.fromisoformat('2023-09-04T18:00:00'),
                'source': 'https://twitter.com/CityofCT/status/1698744757000831345', 'include': 'coct'
            },
        ]

        # Aggregate the sample version files and compare against the expected output
        output = create_historical_data("../test-files")

        self.assertEqual(output, expected_output)


# Run the tests
if __name__ == '__main__':
    unittest.main()
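Note: the test reads its sample snapshots from ../test-files, a directory not included in this diff. With that fixture in place, the suite runs under the standard runner, e.g. python -m unittest historical_data_test.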