Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fastapi based API #45

Merged
merged 52 commits into from
Sep 2, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
52 commits
Select commit Hold shift + click to select a range
a7d70df
Add fastapi dependency
hellais Nov 21, 2023
ddf5906
Add vunicorn dep
hellais Nov 21, 2023
f4098d6
Create fastapi boilerplate
hellais Nov 21, 2023
18006a3
Add deps
hellais Nov 21, 2023
3a3e1d1
Implement basic list_measurements lookup function
hellais Nov 21, 2023
a87250e
Add support for filtering measurements and data format returned value
hellais Nov 21, 2023
1fcb310
fix poetry lock
hellais Nov 21, 2023
cc97179
Implement aggregation and measurement list API
hellais Nov 22, 2023
cacc5cf
Add conversion to test_names to groups
hellais Nov 22, 2023
332372f
Fix missing import
hellais Nov 22, 2023
4748d83
Fix column name mapping
hellais Nov 22, 2023
8ba7f83
Add order by
hellais Nov 22, 2023
5dd30c4
Add optional extra keys
hellais Nov 22, 2023
672519a
Fix bug in blocked counts
hellais Nov 22, 2023
eb57c67
Fix bug in agg loni value counts
hellais Nov 22, 2023
75c2081
Implement ghetto aggregate loni likelyhood space
hellais Nov 22, 2023
ba4b360
Fix division by zero case
hellais Nov 22, 2023
28adaff
Fix sorting of columns
hellais Nov 22, 2023
c73b506
Add basic tests for fastapi
hellais Nov 23, 2023
32c242f
Move fastapi tests to top level tests/ folder
hellais Nov 23, 2023
740197b
Add logging for make analysis
hellais Nov 24, 2023
d64463f
Implement list_observations API endpoint
hellais Dec 8, 2023
22f7662
Add fastapi dependency
hellais Nov 21, 2023
2c35916
Add vunicorn dep
hellais Nov 21, 2023
19fba49
Create fastapi boilerplate
hellais Nov 21, 2023
96b4de8
Add deps
hellais Nov 21, 2023
3cc6a29
Implement basic list_measurements lookup function
hellais Nov 21, 2023
7bab458
Add support for filtering measurements and data format returned value
hellais Nov 21, 2023
5144579
fix poetry lock
hellais Nov 21, 2023
209ebe9
Implement aggregation and measurement list API
hellais Nov 22, 2023
9bf365c
Add conversion to test_names to groups
hellais Nov 22, 2023
f32e3c1
Fix missing import
hellais Nov 22, 2023
f9eff96
Fix column name mapping
hellais Nov 22, 2023
9bf0977
Add order by
hellais Nov 22, 2023
9ff1bff
Add optional extra keys
hellais Nov 22, 2023
dae10c7
Fix bug in blocked counts
hellais Nov 22, 2023
9817fac
Fix bug in agg loni value counts
hellais Nov 22, 2023
1ebbec2
Implement ghetto aggregate loni likelyhood space
hellais Nov 22, 2023
0664d16
Fix division by zero case
hellais Nov 22, 2023
ae83b82
Fix sorting of columns
hellais Nov 22, 2023
1c9415c
Add basic tests for fastapi
hellais Nov 23, 2023
aa90acf
Move fastapi tests to top level tests/ folder
hellais Nov 23, 2023
be5a1c5
Implement list_observations API endpoint
hellais Dec 8, 2023
91eddd0
Merge branch 'fastapi' of github.com:ooni/data into fastapi
hellais Aug 30, 2024
112839a
delete poetry and pyproject
hellais Aug 30, 2024
2df795b
Merge branch 'v5.0.0-rc.0' into fastapi
hellais Aug 30, 2024
1afe7ec
Move fastapi to API directory
hellais Aug 30, 2024
9132397
Implement observation aggregation endpoints
hellais Sep 2, 2024
3fd7413
Implement observable framework dataviz for observation by failure
hellais Sep 2, 2024
677ab00
Implement a simplified view
hellais Sep 2, 2024
1f356e0
Update clickhouse config
hellais Sep 2, 2024
3fb37b1
Update endpoints to point to data.ooni.org
hellais Sep 2, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,6 @@
"python.defaultInterpreterPath": "${workspaceFolder}/oonipipeline/.venv",
"[python]": {
"editor.defaultFormatter": "ms-python.black-formatter"
}
},
"editor.formatOnSaveMode": "modifications"
}
1 change: 1 addition & 0 deletions oonipipeline/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ path = "src/oonipipeline/__about__.py"
[tool.hatch.envs.default.scripts]
oonipipeline = "python -m oonipipeline.main {args}"
dataviz = "uvicorn oonipipeline.dataviz.main:app {args}"
api = "uvicorn oonipipeline.api.main:app {args}"
test = "pytest {args:tests}"
# --full-trace --log-level=INFO --log-cli-level=INFO -v --setup-show -s
test-cov = "pytest --cov=./ --cov-report=xml --cov-report=html --cov-report=term {args:tests}"
Expand Down
Empty file.
11 changes: 11 additions & 0 deletions oonipipeline/src/oonipipeline/api/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from pydantic_settings import BaseSettings


class Settings(BaseSettings):
app_name: str = "OONI Data API"
base_url: str = "https://api.ooni.io"
clickhouse_url: str = "clickhouse://localhost"
log_level: str = "info"


settings = Settings()
7 changes: 7 additions & 0 deletions oonipipeline/src/oonipipeline/api/dependencies.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from clickhouse_driver import Client as ClickhouseClient

from .config import settings


def get_clickhouse_client() -> ClickhouseClient:
return ClickhouseClient.from_url(settings.clickhouse_url)
28 changes: 28 additions & 0 deletions oonipipeline/src/oonipipeline/api/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware

from .routers import aggregate_analysis, list_analysis, aggregate_observations
from .config import settings

import logging

logging.basicConfig(level=getattr(logging, settings.log_level.upper()))

app = FastAPI()
origins = ["*"]
app.add_middleware(
CORSMiddleware,
allow_origins=origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)

app.include_router(aggregate_analysis.router, prefix="/api/v1")
app.include_router(list_analysis.router, prefix="/api/v1")
app.include_router(aggregate_observations.router, prefix="/api/v2")


@app.get("/")
async def root():
return {"message": "Hello OONItarian!"}
Empty file.
302 changes: 302 additions & 0 deletions oonipipeline/src/oonipipeline/api/routers/aggregate_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,302 @@
from datetime import date, datetime, timedelta, timezone
from typing import List, Literal, Optional, Union, Dict
from typing_extensions import Annotated
from fastapi import APIRouter, Depends, Query
from pydantic import BaseModel

from oonidata.datautils import PerfTimer

from .utils import get_measurement_start_day_agg, TimeGrains
from ..dependencies import ClickhouseClient, get_clickhouse_client
from .list_analysis import (
OONI_DATA_COLS_REMAP,
OONI_DATA_COLS_REMAP_INV,
SinceUntil,
test_name_to_group,
utc_30_days_ago,
utc_today,
)

import logging

log = logging.getLogger(__name__)

router = APIRouter()

AggregationKeys = Literal[
"measurement_start_day",
"domain",
"probe_cc",
"probe_asn",
"test_name",
]


class DBStats(BaseModel):
bytes: int
elapsed_seconds: float
row_count: int
total_row_count: int


class AggregationEntry(BaseModel):
anomaly_count: int
confirmed_count: int
failure_count: int
ok_count: int
measurement_count: int

observation_count: int
vantage_point_count: int
measurement_start_day: date
loni_down_map: Dict[str, float]
loni_down_value: float
loni_blocked_map: Dict[str, float]
loni_blocked_value: float
# loni_ok_map: Dict[str, float]
loni_ok_value: float

domain: Optional[str] = None
probe_cc: Optional[str] = None
probe_asn: Optional[int] = None
test_name: Optional[str] = None


class AggregationResponse(BaseModel):
# TODO(arturo): these keys are inconsistent with the other APIs
db_stats: DBStats
dimension_count: int
result: List[AggregationEntry]


@router.get("/aggregation", tags=["aggregation"])
async def get_aggregation(
db: Annotated[ClickhouseClient, Depends(get_clickhouse_client)],
axis_x: Annotated[AggregationKeys, Query()] = "measurement_start_day",
axis_y: Annotated[Optional[AggregationKeys], Query()] = None,
category_code: Annotated[Optional[str], Query()] = None,
test_name: Annotated[Optional[str], Query()] = None,
domain: Annotated[Optional[str], Query()] = None,
input: Annotated[Optional[str], Query()] = None,
probe_asn: Annotated[Union[int, str, None], Query()] = None,
probe_cc: Annotated[Optional[str], Query(min_length=2, max_length=2)] = None,
ooni_run_link_id: Annotated[Optional[str], Query()] = None,
since: SinceUntil = utc_30_days_ago(),
until: SinceUntil = utc_today(),
time_grain: Annotated[TimeGrains, Query()] = "day",
anomaly_sensitivity: Annotated[float, Query()] = 0.7,
format: Annotated[Literal["JSON", "CSV"], Query()] = "JSON",
download: Annotated[bool, Query()] = False,
) -> AggregationResponse:
q_args = {}
and_clauses = []
extra_cols = {}
dimension_count = 1
if axis_x == "measurement_start_day":
# TODO(arturo): wouldn't it be nicer if we dropped the time_grain
# argument and instead used axis_x IN (measurement_start_day,
# measurement_start_hour, ..)?
extra_cols["measurement_start_day"] = (
f"{get_measurement_start_day_agg(time_grain)} as measurement_start_day"
)
elif axis_x:
col = OONI_DATA_COLS_REMAP.get(axis_x)
extra_cols[axis_x] = f"{col} as {axis_x}"

if probe_asn is not None:
if isinstance(probe_asn, str) and probe_asn.startswith("AS"):
probe_asn = int(probe_asn[2:])
q_args["probe_asn"] = probe_asn
and_clauses.append("location_network_asn = %(probe_asn)d")
extra_cols["probe_asn"] = "location_network_asn as probe_asn"
if probe_cc is not None:
q_args["probe_cc"] = probe_cc
and_clauses.append("location_network_cc = %(probe_cc)s")
extra_cols["probe_cc"] = "location_network_cc as probe_cc"
if test_name is not None:
q_args["test_name"] = test_name_to_group(test_name)
and_clauses.append("target_nettest_group = %(test_name)s")
extra_cols["test_name"] = "target_nettest_group as test_name"
if category_code is not None:
q_args["category_code"] = category_code
and_clauses.append("target_category_code = %(category_code)s")
extra_cols["category_code"] = "target_category_code as category_code"
if domain is not None:
q_args["domain"] = domain
and_clauses.append("target_domain_name = %(domain)s")
extra_cols["domain"] = "target_domain_name as domain"
if input is not None:
# XXX
pass

if axis_y:
dimension_count += 1
if axis_y == "measurement_start_day":
# TODO(arturo): wouldn't it be nicer if we dropped the time_grain
# argument and instead used axis_x IN (measurement_start_day,
# measurement_start_hour, ..)?
extra_cols["measurement_start_day"] = (
f"{get_measurement_start_day_agg(time_grain)} as measurement_start_day"
)
else:
col = OONI_DATA_COLS_REMAP_INV.get(axis_y)
extra_cols[axis_y] = f"{col} as {axis_y}"

if since is not None:
q_args["since"] = since
and_clauses.append("timeofday >= %(since)s")
if until is not None:
and_clauses.append("timeofday <= %(until)s")
q_args["until"] = until

q_args["anomaly_sensitivity"] = anomaly_sensitivity

"""
if anomaly is True:
and_clauses.append("arraySum(loni_blocked_values) > 0.5")
elif anomaly is False:
and_clauses.append("arraySum(loni_blocked_values) <= 0.5")

if confirmed is True:
and_clauses.append("arraySum(loni_blocked_values) == 1.0")

if failure is False:
# TODO(arturo): how do we map this onto failure?
pass
"""

where = ""
if len(and_clauses) > 0:
where += " WHERE "
where += " AND ".join(and_clauses)

# TODO(arturo): the sort of this matters. We should be smarter.
base_cols = [
"loni_down_map",
"loni_blocked_map",
"loni_ok_value",
"loni_down_value",
"loni_blocked_value",
"measurement_count",
"observation_count",
"vantage_point_count",
"confirmed_count",
"anomaly_count",
]

q = f"""
WITH
loni_blocked_weight_avg_map as loni_blocked_map,
loni_down_weight_avg_map as loni_down_map,
arraySum(mapValues(loni_blocked_map)) as loni_blocked_value_avg,
arraySum(mapValues(loni_down_map)) as loni_down_value_avg,
loni_ok_weight_avg_value as loni_ok_value_avg,

loni_ok_value_avg + loni_down_value_avg + loni_blocked_value_avg as loni_total

SELECT

loni_down_map,
loni_blocked_map,

-- TODO(arturo): this is a bit ghetto
loni_ok_value_avg / loni_total as loni_ok_value,
loni_down_value_avg / loni_total as loni_down_value,
loni_blocked_value_avg / loni_total as loni_blocked_value,

measurement_count_agg as measurement_count,
observation_count_agg as observation_count,
vantage_point_count,

confirmed_count,
anomaly_count,

-- Extra columns
{", ".join(extra_cols.keys())}

FROM (
WITH
CAST((loni_down_keys, loni_down_values), 'Map(String, Float64)') as loni_down_map,
CAST((loni_blocked_keys, loni_blocked_values), 'Map(String, Float64)') as loni_blocked_map
SELECT

sumMap(loni_down_map) as loni_down_sum,
countMap(loni_down_map) as loni_down_cnt,
arraySum(mapValues(loni_down_cnt)) as loni_down_cnt_total,
arraySum(mapValues(loni_down_sum)) as loni_down_value_total,
mapApply(
(k, v) -> (
k,
if(
loni_down_cnt_total == 0 or loni_down_value_total == 0, 0,
toFloat64(v) / toFloat64(loni_down_value_total) * toFloat64(loni_down_cnt[k])/toFloat64(loni_down_cnt_total)
)
),
loni_down_sum
) as loni_down_weight_avg_map,

sumMap(loni_blocked_map) as loni_blocked_sum,
countMap(loni_blocked_map) as loni_blocked_cnt,
arraySum(mapValues(loni_blocked_cnt)) as loni_blocked_cnt_total,
arraySum(mapValues(loni_blocked_sum)) as loni_blocked_value_total,
mapApply(
(k, v) -> (
k,
if(
loni_blocked_cnt_total == 0 or loni_blocked_value_total == 0, 0,
toFloat64(v) / toFloat64(loni_blocked_value_total) * toFloat64(loni_blocked_cnt[k]) / toFloat64(loni_blocked_cnt_total)
)
),
loni_blocked_sum
) as loni_blocked_weight_avg_map,

sum(loni_ok_value) as loni_ok_total,
COUNT() as loni_ok_cnt,
loni_ok_total/loni_ok_cnt as loni_ok_weight_avg_value,

SUM(measurement_count) as measurement_count_agg,
SUM(observation_count) as observation_count_agg,
COUNT(DISTINCT
location_network_type,
location_network_asn,
location_network_cc,
location_resolver_asn
) as vantage_point_count,

sumIf(measurement_count, arraySum(loni_blocked_values) == 1) as confirmed_count,
sumIf(measurement_count, arraySum(loni_blocked_values) >= %(anomaly_sensitivity)f) as anomaly_count,

-- Extra columns
{", ".join(extra_cols.values())}

FROM measurement_experiment_result
{where}
GROUP BY {", ".join(extra_cols.keys())}
ORDER BY {", ".join(extra_cols.keys())}
)
"""

cols = base_cols + list(extra_cols.keys())
t = PerfTimer()
log.info(f"running query {q} with {q_args}")
rows = db.execute(q, q_args)

results: List[AggregationEntry] = []
if rows and isinstance(rows, list):
for row in rows:
print(row)
d = dict(zip(cols, row))
d["failure_count"] = 0
d["ok_count"] = d["measurement_count"] - d["anomaly_count"]
log.debug(f"adding {d}")
results.append(AggregationEntry(**d))
return AggregationResponse(
db_stats=DBStats(
bytes=-1,
elapsed_seconds=t.s,
row_count=len(results),
total_row_count=len(results),
),
dimension_count=dimension_count,
result=results,
)
Loading
Loading