-
Notifications
You must be signed in to change notification settings - Fork 55
/
Copy pathapi_calls.py
147 lines (123 loc) · 5.28 KB
/
api_calls.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
"""Contains helper functions to extract data from spreadsheet API"""
from typing import Any, List, Tuple
from tenacity import retry, stop_after_attempt, wait_exponential, retry_if_exception
from dlt.common.exceptions import MissingDependencyException
from dlt.common.typing import DictStrAny
from dlt.sources.credentials import GcpCredentials, GcpOAuthCredentials
from dlt.sources.helpers.requests.retry import DEFAULT_RETRY_STATUS
from .data_processing import ParsedRange, trim_range_top_left
try:
from apiclient.discovery import build, Resource
except ImportError:
raise MissingDependencyException("Google API Client", ["google-api-python-client"])
def is_retry_status_code(exception: BaseException) -> bool:
"""Retry condition on HttpError"""
from googleapiclient.errors import HttpError # type: ignore
# print(f"RETRY ON {str(HttpError)} = {isinstance(exception, HttpError) and exception.resp.status in DEFAULT_RETRY_STATUS}")
# if isinstance(exception, HttpError):
# print(exception.resp.status)
# print(DEFAULT_RETRY_STATUS)
return (
isinstance(exception, HttpError)
and exception.resp.status in DEFAULT_RETRY_STATUS
)
retry_deco = retry(
# Retry if it's a rate limit error (HTTP 429)
retry=retry_if_exception(is_retry_status_code),
# Use exponential backoff for the waiting time between retries, starting with 5 seconds
wait=wait_exponential(multiplier=1.5, min=5, max=120),
# Stop retrying after 10 attempts
stop=stop_after_attempt(10),
# Print out the retrying details
reraise=True,
)
def api_auth(credentials: GcpCredentials, max_api_retries: int) -> Resource:
"""
Uses GCP credentials to authenticate with Google Sheets API.
Args:
credentials (GcpCredentials): Credentials needed to log in to GCP.
max_api_retries (int): Max number of retires to google sheets API. Actual behavior is internal to google client.
Returns:
Resource: Object needed to make API calls to Google Sheets API.
"""
if isinstance(credentials, GcpOAuthCredentials):
credentials.auth("https://www.googleapis.com/auth/spreadsheets.readonly")
# Build the service object for Google sheets api.
service = build(
"sheets",
"v4",
credentials=credentials.to_native_credentials(),
num_retries=max_api_retries,
)
return service
@retry_deco
def get_meta_for_ranges(
service: Resource, spreadsheet_id: str, range_names: List[str]
) -> Any:
"""Retrieves `spreadsheet_id` cell metadata for `range_names`"""
return (
service.spreadsheets()
.get(
spreadsheetId=spreadsheet_id,
ranges=range_names,
includeGridData=True,
)
.execute()
)
@retry_deco
def get_known_range_names(
spreadsheet_id: str, service: Resource
) -> Tuple[List[str], List[str], str]:
"""
Retrieves spreadsheet metadata and extracts a list of sheet names and named ranges
Args:
spreadsheet_id (str): The ID of the spreadsheet.
service (Resource): Resource object used to make API calls to Google Sheets API.
Returns:
Tuple[List[str], List[str], str] sheet names, named ranges, spreadheet title
"""
metadata = service.spreadsheets().get(spreadsheetId=spreadsheet_id).execute()
sheet_names: List[str] = [s["properties"]["title"] for s in metadata["sheets"]]
named_ranges: List[str] = [r["name"] for r in metadata.get("namedRanges", {})]
title: str = metadata["properties"]["title"]
return sheet_names, named_ranges, title
@retry_deco
def get_data_for_ranges(
service: Resource, spreadsheet_id: str, range_names: List[str]
) -> List[Tuple[str, ParsedRange, ParsedRange, List[List[Any]]]]:
"""
Calls Google Sheets API to get data in a batch. This is the most efficient way to get data for multiple ranges inside a spreadsheet.
Args:
service (Resource): Object to make API calls to Google Sheets.
spreadsheet_id (str): The ID of the spreadsheet.
range_names (List[str]): List of range names.
Returns:
List[DictStrAny]: A list of ranges with data in the same order as `range_names`
"""
range_batch_resp = (
service.spreadsheets()
.values()
.batchGet(
spreadsheetId=spreadsheet_id,
ranges=range_names,
# un formatted returns typed values
valueRenderOption="UNFORMATTED_VALUE",
# will return formatted dates as a serial number
dateTimeRenderOption="SERIAL_NUMBER",
)
.execute()
)
# if there are not ranges to be loaded, there's no "valueRanges"
range_batch: List[DictStrAny] = range_batch_resp.get("valueRanges", [])
# trim the empty top rows and columns from the left
rv = []
for name, range_ in zip(range_names, range_batch):
parsed_range = ParsedRange.parse_range(range_["range"])
values: List[List[Any]] = range_.get("values", None)
if values:
parsed_range, values = trim_range_top_left(parsed_range, values)
# create a new range to get first two rows
meta_range = parsed_range._replace(end_row=parsed_range.start_row + 1)
# print(f"{name}:{parsed_range}:{meta_range}")
rv.append((name, parsed_range, meta_range, values))
return rv