Historical data draft #476

Draft · wants to merge 9 commits into main
4 changes: 4 additions & 0 deletions requirements.txt
@@ -11,6 +11,8 @@ cssselect2==0.7.0
defusedxml==0.7.1
distro==1.7.0
et-xmlfile==1.1.0
gitdb==4.0.10
GitPython==3.1.34
html5lib==1.1
ics==0.7.2
idna==3.3
@@ -29,12 +31,14 @@ PyYAML==6.0
reportlab==3.6.11
requests==2.28.1
six==1.16.0
smmap==5.0.0
soupsieve==2.3.2.post1
svglib==1.4.1
tabula-py==2.4.0
TatSu==5.8.3
tinycss2==1.2.1
tomli==2.0.1
typing_extensions==4.7.1
urllib3==1.26.9
webencodings==0.5.1
yamllint==1.27.1
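The new pins support the Git history traversal added in src/fetch_historical_data.py below (gitdb and smmap are GitPython's own dependencies, and typing_extensions appears to come along with them). As a rough sketch of the GitPython surface the script relies on, using the repository URL, branch, and file name defined in that script:

from git import Repo

# Clone to a throwaway directory, then walk the commits on main that
# touched manually_specified.yaml, newest first
repo = Repo.clone_from("https://github.com/beyarkay/eskom-calendar.git",
                       "eskom-calendar-temp", branch="main")
for commit in repo.iter_commits("main", paths="manually_specified.yaml"):
    blob = commit.tree / "manually_specified.yaml"  # the file as of this commit
    print(commit.hexsha[:8], len(blob.data_stream.read()), "bytes")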
147 changes: 147 additions & 0 deletions src/aggregate_historical_data.py
@@ -0,0 +1,147 @@
import datetime
import os
import yaml
from dateutil.parser import parse
import logging

logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    filename='aggregate_historical_data.log',
                    filemode='a')
logger = logging.getLogger()
NEW_DIRECTORY = "historical_versions"
AGGREGATED_FILE = "historical_data.yml"


def find_overlaps(sorted_data):
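    """Return pairs of entries whose time ranges overlap.

    Assumes the input is sorted ascending by 'start'. ('finsh', not
    'finish', is the key used by manually_specified.yaml.)
    """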
    overlaps = []
    for i in range(len(sorted_data)):
        for j in range(i + 1, len(sorted_data)):
            if sorted_data[i]['finsh'] > sorted_data[j]['start']:
                overlaps.append((sorted_data[i], sorted_data[j]))
            if sorted_data[j]['start'] > sorted_data[i]['finsh']:
                break
    return overlaps


def find_erroneous_line(file):
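    """Debugging helper: run yaml.safe_load over a file one line at a time
    and report lines that fail to parse on their own. Not called by the
    aggregation flow.
    """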
    with open(file, 'r') as f:
        for i, line in enumerate(f, start=1):
            try:
                yaml.safe_load(line)
            except Exception as inner_e:
                print(f"Error on line {i}: {inner_e}")
                print(f"Line content: {line.strip()}")


def load_yaml_files(directory):
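    """Load every .yml/.yaml snapshot in `directory`, tagging each change
    entry with the commit time embedded in its filename
    (<name>_<hexsha>_<ISO timestamp>.yml, as written by fetch_historical_data.py).
    """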
    num_file_import_errors = 0
    files = sorted(
        [os.path.join(directory, f) for f in os.listdir(directory)
         if f.endswith('.yml') or f.endswith(".yaml")])
    loadshedding_data = []
    for file in files:
        try:
            with open(file, 'r') as f:
                date_time_str = file.split("_")[-1]
                date_time_str = date_time_str.split(".")[0]
                data = yaml.safe_load(f)
                for change in data["changes"]:
                    change['commit_time'] = datetime.datetime.fromisoformat(date_time_str)
                loadshedding_data.append(data)
        except Exception:
            num_file_import_errors += 1
            logger.exception(f"Error in file {file}")
            continue
    print(f"There were {num_file_import_errors} files that could not be imported. Check aggregate_historical_data.log"
          f" for details")
    return loadshedding_data


def resolve_conflicts(sorted_data):
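    """Resolve overlapping entries, preferring data from newer commits.

    When two overlapping entries target the same areas (matching 'include'
    or 'exclude' tags), the entry from the older commit is either truncated
    to end where the newer one starts, or dropped entirely.
    """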
    resolved_data = []

    i = 0
    while i < len(sorted_data):
        current_entry = sorted_data[i].copy()

        j = i + 1
        while j < len(sorted_data) and current_entry['finsh'] > sorted_data[j]['start']:
            next_entry = sorted_data[j]

            # Only consider overlaps where 'include' or 'exclude' tags match
            if ('include' in current_entry and 'include' in next_entry and
                    current_entry['include'] == next_entry['include']) or \
                    ('exclude' in current_entry and 'exclude' in next_entry and
                     current_entry['exclude'] == next_entry['exclude']):

                # If the next entry comes from a newer commit, truncate the
                # current entry so it ends where the newer entry starts
                if next_entry['commit_time'] > current_entry['commit_time']:
                    current_entry['finsh'] = next_entry['start']
                    j += 1
                else:
                    # The next entry comes from an older commit, so it is
                    # superseded by the current entry and can be dropped.
                    # Don't advance j: pop() shifts the next element into slot j.
                    sorted_data.pop(j)
            else:
                j += 1

        # Remove the bookkeeping commit time before emitting the entry
        current_entry.pop('commit_time', None)
        resolved_data.append(current_entry)

        i += 1

    return resolved_data


def write_yaml(filtered_aggregated_data, file_path):
    with open(file_path, 'w') as f:
        f.write("# This data automatically generated by aggregate_historical_data.py\n")
        yaml.safe_dump({'historical_data': filtered_aggregated_data}, f)


def aggregate_data(data_as_dict, key):
    aggregated_data = []
    for entry in data_as_dict:
        for item in entry[key]:
            aggregated_data.append(item)
    return aggregated_data


def create_historical_data(directory):
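    """Build the aggregated history for all snapshots in `directory`:
    load and flatten the 'changes' lists, drop malformed entries, parse
    dates, de-duplicate (keeping the newest commit's version), resolve
    overlaps, and return the entries sorted newest-first.
    """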
    raw_data = load_yaml_files(directory)
    aggregated_data = aggregate_data(raw_data, 'changes')
    num_entries_unfiltered = len(aggregated_data)
    # Drop entries missing either time bound; both are required below
    aggregated_data = [entry for entry in aggregated_data
                       if 'start' in entry and 'finsh' in entry]

    # Ensure all 'start' and 'finsh' values are datetime objects
    for entry in aggregated_data:
        if isinstance(entry.get('start'), str):
            entry['start'] = parse(entry['start'])
        if isinstance(entry.get('finsh'), str):
            entry['finsh'] = parse(entry['finsh'])

    unique_entries_dict = {}

    for entry in aggregated_data:
        # Create a unique key for each entry from a subset of the fields
        entry_key = (entry['start'], entry['finsh'], entry['stage'],
                     entry.get('include', ''), entry.get('exclude', ''))

        # If this is a newer version of an already-seen entry, replace the older version
        if entry_key not in unique_entries_dict or \
                entry['commit_time'] > unique_entries_dict[entry_key]['commit_time']:
            unique_entries_dict[entry_key] = entry

    unique_entries = list(unique_entries_dict.values())

    # Sort ascending by start time, breaking ties with the newest commit first
    sorted_data = sorted(unique_entries, key=lambda x: (x['start'], -x['commit_time'].timestamp()))
    num_entries_filtered = len(sorted_data)
    diff = num_entries_unfiltered - num_entries_filtered
    logger.info(f"{diff} entries were incorrectly formatted or duplicates and were dropped")
    resolved_data = resolve_conflicts(sorted_data)

    # Check for leftover overlaps while the data is still sorted ascending by start,
    # then sort newest-first for output
    overlaps = find_overlaps(resolved_data)
    if overlaps:
        logger.warning(f"{len(overlaps)} overlapping entries remain after conflict resolution")
    resolved_data.sort(key=lambda x: x['start'], reverse=True)
    return resolved_data


if __name__ == "__main__":
    data = create_historical_data("./historical_versions")
    write_yaml(data, "test_historical.yaml")
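For reference, a minimal sketch of the input this script expects: each snapshot in historical_versions/ is shaped like manually_specified.yaml. The entry values below are borrowed from the test file later in this diff; the filename in the comment, which encodes the commit hash and time, is illustrative.

import yaml

# e.g. historical_versions/manually_specified_<hexsha>_2023-09-05T08:00:00.yml
snapshot = """
changes:
- stage: 6
  start: 2023-09-04T22:00:00
  finsh: 2023-09-05T05:00:00
  source: https://twitter.com/CityofCT/status/1698744757000831345
  include: coct
"""
data = yaml.safe_load(snapshot)
print(data["changes"][0]["finsh"])  # 2023-09-05 05:00:00 (PyYAML parses ISO timestamps)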
77 changes: 77 additions & 0 deletions src/fetch_historical_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import os
import shutil
import yaml
from git import Repo
from datetime import datetime
import logging

# Configure logging
logging.basicConfig(filename='file_versions_log.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')
REPO_URL = "https://github.com/beyarkay/eskom-calendar.git"
BRANCH_NAME = "main"
FILE_NAME = "manually_specified.yaml"
NEW_DIRECTORY = "historical_versions"
TEMP_PATH = "eskom-calendar-temp"


def get_file_history(repo_name, branch_name, file_name):
"""Get commit hashes where the file changed."""
git_commits = list(repo_name.iter_commits(branch_name, paths=file_name))
return git_commits


def copy_file_versions(commit_hashes, file_name, new_directory):
"""Copy versions of the file to a new directory."""
if not os.path.exists(new_directory):
os.makedirs(new_directory)

for i, commit in enumerate(commit_hashes):
try:
# Get the file content at this commit
file_content = (commit.tree / file_name).data_stream.read()
except Exception as e:
logging.error(f"Failed to read file content at commit {commit.hexsha} due to: {str(e)}")
continue

try:
# Create a new file with the content at this commit with commit hash and time
commit_time = datetime.fromtimestamp(commit.committed_date).isoformat()
new_file_name = os.path.join(new_directory,
f"{os.path.splitext(file_name)[0]}_{commit.hexsha}_{commit_time}.yml")
with open(new_file_name, "wb") as f:
f.write(file_content)
except Exception as e:
logging.error(f"Failed to write file content due to: {str(e)}")
continue

logging.info(f"Successfully copied version {i + 1}/{len(commit_hashes)} to {new_file_name}")


if __name__ == "__main__":
    try:
        repo = Repo.clone_from(REPO_URL, TEMP_PATH, branch=BRANCH_NAME)
    except Exception as e:
        if os.path.exists(TEMP_PATH):
            shutil.rmtree(TEMP_PATH)
        print(f"Cannot clone {REPO_URL} due to {e}")
        exit(1)

    try:
        # Get the commits where the file changed and copy each version out
        commits = get_file_history(repo, BRANCH_NAME, FILE_NAME)
        copy_file_versions(commits, FILE_NAME, NEW_DIRECTORY)
        logging.info(f"Copied all versions of {FILE_NAME} to {NEW_DIRECTORY}")
        print(f"Copied all versions of {FILE_NAME} to {NEW_DIRECTORY}")
    except Exception as e:
        logging.critical(f"Script failed due to: {str(e)}")

    # Remove the temporary cloned repository
    if os.path.exists(TEMP_PATH):
        try:
            shutil.rmtree(TEMP_PATH)
            print(f"Directory {TEMP_PATH} has been removed successfully")
        except OSError as error:
            print(error)
            print(f"Directory {TEMP_PATH} cannot be removed")
37 changes: 37 additions & 0 deletions src/historical_data_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import unittest
from datetime import datetime
from aggregate_historical_data import create_historical_data


class TestOverlapResolution(unittest.TestCase):

    def test_find_overlaps(self):
        # Expected output, newest entry first
        expected_output = [
            {
                'stage': 6, 'start': datetime.fromisoformat('2023-09-04T22:00:00'),
                'finsh': datetime.fromisoformat('2023-09-05T05:00:00'),
                'source': 'https://twitter.com/CityofCT/status/1698744757000831345', 'include': 'coct'
            },
            {
                'stage': 5, 'start': datetime.fromisoformat('2023-09-04T18:00:00'),
                'finsh': datetime.fromisoformat('2023-09-04T22:00:00'),
                'source': 'https://twitter.com/CityofCT/status/1698744757000831345', 'include': 'coct'
            },
            {
                'stage': 3, 'start': datetime.fromisoformat('2023-09-04T10:00:00'),
                'finsh': datetime.fromisoformat('2023-09-04T18:00:00'),
                'source': 'https://twitter.com/CityofCT/status/1698744757000831345', 'include': 'coct'
            },
        ]

        # Aggregate the sample version files and compare against the expected output
        output = create_historical_data("../test-files")

        self.assertEqual(output, expected_output)


# Run the tests
if __name__ == '__main__':
    unittest.main()
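Note: the test reads its sample snapshots from ../test-files, a directory not included in this diff. With that fixture in place, the suite runs under the standard runner, e.g. python -m unittest historical_data_test.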