Skip to content

Commit

Permalink
fix: downgrade boto version - explicit errors (#324)
Browse files Browse the repository at this point in the history
  • Loading branch information
polomarcus authored Jan 24, 2025
1 parent 86de705 commit 7748a44
Show file tree
Hide file tree
Showing 7 changed files with 59 additions and 47 deletions.
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ services:
# SENTRY_DSN: prod_only
#END_DATE: "2024-02-29" # optional - otherwise end of the month
# START_DATE: 1727610071 # to test batch import
CHANNEL : fr3-idf # to reimport only one channel
#CHANNEL : fr3-idf # to reimport only one channel
MEDIATREE_USER : /run/secrets/username_api
MEDIATREE_PASSWORD: /run/secrets/pwd_api
BUCKET: /run/secrets/bucket
Expand Down
50 changes: 25 additions & 25 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ s3fs = {extras = ["boto3"], version = ">=2023.12.0"}
boto3 = "*"
botocore = "*"
python = ">=3.11,<=3.13"
s3transfer = "0.10.4"
pandas = "^2.2.3"
advertools = "^0.14.1"
xmltodict = "^0.13.0"
Expand Down
40 changes: 21 additions & 19 deletions quotaclimat/data_processing/mediatree/s3/api_to_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,20 +36,22 @@
USER = get_user()
KEYWORDS_URL = get_keywords_url()
# Configuration for Scaleway Object Storage
ACCESS_KEY = os.environ.get("BUCKET")
ACCESS_KEY = os.environ.get('BUCKET')
SECRET_KEY = os.environ.get("BUCKET_SECRET")
BUCKET_NAME = os.environ.get("BUCKET_NAME")
REGION = 'fr-par'

ENDPOINT_URL = f'https://s3.{REGION}.scw.cloud'

s3_client = boto3.client(
service_name='s3',
region_name=REGION,
aws_access_key_id=ACCESS_KEY,
aws_secret_access_key=SECRET_KEY,
endpoint_url=ENDPOINT_URL,
)
def get_s3_client():
    """Create and return a boto3 S3 client for Scaleway Object Storage.

    Credentials and region come from the module-level constants
    (ACCESS_KEY / SECRET_KEY read from the environment, REGION,
    ENDPOINT_URL). Building the client lazily — instead of at import
    time — lets callers control when the connection configuration is
    resolved.
    """
    # Return directly; no need for an intermediate variable.
    return boto3.client(
        service_name='s3',
        region_name=REGION,
        aws_access_key_id=ACCESS_KEY,
        aws_secret_access_key=SECRET_KEY,
        endpoint_url=ENDPOINT_URL,
    )

def get_bucket_key(date, channel, filename:str="*", suffix:str="parquet"):
(year, month, day) = (date.year, date.month, date.day)
Expand All @@ -60,7 +62,7 @@ def get_bucket_key_folder(date, channel):
return f'year={year}/month={month:1}/day={day:1}/channel={channel}/'

# Function to upload folder to S3
def upload_folder_to_s3(local_folder, bucket_name, base_s3_path):
def upload_folder_to_s3(local_folder, bucket_name, base_s3_path, s3_client):
logging.info(f"Reading local folder {local_folder} and uploading to S3")
for root, _, files in os.walk(local_folder):
logging.info(f"Reading files {len(files)}")
Expand All @@ -77,7 +79,7 @@ def upload_folder_to_s3(local_folder, bucket_name, base_s3_path):
shutil.rmtree(local_folder)
logging.info(f"Deleted local folder: {local_folder}")

def save_to_s3(df: pd.DataFrame, channel: str, date: pd.Timestamp):
def save_to_s3(df: pd.DataFrame, channel: str, date: pd.Timestamp, s3_client):
logging.info(f"Saving DF with {len(df)} elements to S3 for {date} and channel {channel}")

# to create partitions
Expand All @@ -100,13 +102,13 @@ def save_to_s3(df: pd.DataFrame, channel: str, date: pd.Timestamp):
#saving full_path folder parquet to s3
s3_path = f"{get_bucket_key_folder(date, channel)}"
local_folder = f"{based_path}/{s3_path}"
upload_folder_to_s3(local_folder, BUCKET_NAME, s3_path)
upload_folder_to_s3(local_folder, BUCKET_NAME, s3_path,s3_client=s3_client)

except Exception as e:
logging.error(Exception)
exit()
except Exception as err:
logging.fatal("get_and_save_api_data (%s) %s" % (type(err).__name__, err))
sys.exit(1)

def check_if_object_exists_in_s3(day, channel):
def check_if_object_exists_in_s3(day, channel, s3_client):
folder_prefix = get_bucket_key_folder(day, channel) # Adjust this to return the folder path

logging.debug(f"Checking if folder exists: {folder_prefix}")
Expand All @@ -116,7 +118,7 @@ def check_if_object_exists_in_s3(day, channel):
logging.info(f"Folder exists in S3: {folder_prefix}")
return True
else:
logging.debug(f"Folder does not exist in S3: {folder_prefix}")
logging.info(f"Folder does not exist in S3: {folder_prefix}")
return False
except Exception as e:
logging.error(f"Error while checking folder in S3: {folder_prefix}\n{e}")
Expand All @@ -126,7 +128,7 @@ async def get_and_save_api_data(exit_event):
with sentry_sdk.start_transaction(op="task", name="get_and_save_api_data"):
try:
logging.warning(f"Available CPUS {os.cpu_count()} - MODIN_CPUS config : {os.environ.get('MODIN_CPUS', 3)}")

s3_client = get_s3_client()
token=get_auth_token(password=password, user_name=USER)
type_sub = 's2t'
start_date = int(os.environ.get("START_DATE", 0))
Expand All @@ -144,7 +146,7 @@ async def get_and_save_api_data(exit_event):
df_res = pd.DataFrame()

# if object already exists, skip
if not check_if_object_exists_in_s3(day, channel):
if not check_if_object_exists_in_s3(day, channel,s3_client=s3_client):
try:
programs_for_this_day = get_programs_for_this_day(day.tz_localize("Europe/Paris"), channel, df_programs)

Expand All @@ -162,7 +164,7 @@ async def get_and_save_api_data(exit_event):
logging.info("Nothing to extract")

# save to S3
save_to_s3(df_res, channel, day)
save_to_s3(df_res, channel, day, s3_client=s3_client)

except Exception as err:
logging.error(f"continuing loop but met error : {err}")
Expand Down
2 changes: 1 addition & 1 deletion quotaclimat/data_processing/mediatree/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def get_date_range(start_date_to_query, end_epoch, minus_days:int=1):
return range
else:
logging.info(f"Default date range from yesterday to {minus_days} day(s) - (env var NUMBER_OF_PREVIOUS_DAYS)")
range = pd.date_range(start=get_datetime_yesterday(), periods=minus_days, freq="D")
range = pd.date_range(end=get_datetime_yesterday(), periods=minus_days, freq="D")
return range

def is_it_tuesday(date):
Expand Down
2 changes: 1 addition & 1 deletion test/s3/test_s3.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import pytest
import pandas as pd
from quotaclimat.data_processing.mediatree.s3.api_to_s3 import get_bucket_key, save_to_s3, get_bucket_key_folder
from quotaclimat.data_processing.mediatree.s3.api_to_s3 import get_bucket_key, get_bucket_key_folder


def test_get_bucket_key():
Expand Down
9 changes: 9 additions & 0 deletions test/sitemap/test_mediatree_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,3 +66,12 @@ def test_get_start_end_date_with_get_date_range():
output = get_date_range(start,end)
assert len(output) == number_of_previous_days + 1
pd.testing.assert_index_equal(output, expected)

def test_get_start_end_date_with_get_date_range_default():
start_date = 0
number_of_previous_days = 7
(start,end) = get_start_end_date_env_variable_with_default(start_date, minus_days=number_of_previous_days)


output = get_date_range(start,end, minus_days=number_of_previous_days)
assert len(output) == number_of_previous_days

1 comment on commit 7748a44

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
postgres
   insert_data.py43784%36–38, 57–59, 64
   insert_existing_data_example.py19384%25–27
postgres/schemas
   models.py1972189%152–153, 156–163, 172, 179, 181–182, 247–248, 251–256, 267, 274–275
quotaclimat/data_ingestion
   scrap_sitemap.py1341787%27–28, 33–34, 66–71, 95–97, 138–140, 202, 223–228
quotaclimat/data_ingestion/ingest_db
   ingest_sitemap_in_db.py553733%21–42, 45–58, 62–73
quotaclimat/data_ingestion/scrap_html
   scrap_description_article.py36392%19–20, 32
quotaclimat/data_processing/mediatree
   api_import.py24915339%44–48, 54–89, 93–96, 102, 111, 117, 120–122, 125–172, 178–193, 198, 211–223, 227–233, 247–259, 262–266, 272, 317–318, 321–352, 355–357
   channel_program.py1625765%21–23, 34–36, 53–54, 57–59, 98–99, 108, 124, 175–216
   config.py15287%7, 16
   detect_keywords.py2571196%126–127, 283, 351–358, 400
   update_pg_keywords.py896725%16–19, 22–31, 36–157, 181, 184, 188–189, 199–220, 254–291, 298
   utils.py962970%29–53, 56, 65, 135–138, 142–149
quotaclimat/data_processing/mediatree/s3
   api_to_s3.py15410333%47–54, 66–80, 83–109, 112–125, 128–178, 181–207, 210–212
quotaclimat/data_processing/mediatree/stop_word
   main.py16010634%40–44, 70, 87–107, 117–183, 187–251, 255–284, 288–327, 330–332
quotaclimat/utils
   healthcheck_config.py291452%22–24, 27–38
   logger.py241154%22–24, 28–37
   sentry.py11282%22–23
TOTAL175764363% 

Tests Skipped Failures Errors Time
112 0 💤 0 ❌ 0 🔥 2m 37s ⏱️

Please sign in to comment.