Skip to content

Commit

Permalink
feat: madenearme speedup (#9570)
Browse files Browse the repository at this point in the history
* feat: optimize generate_madenearme for speed
* chore: use JSON XS to speed up generate madenearme
* feat: regenerate madenearme every sunday
* test: more sensible madenearme test
  • Loading branch information
alexgarel authored Dec 21, 2023
1 parent ab4ed7d commit 998ddb4
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 14 deletions.
1 change: 1 addition & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ RUN --mount=type=cache,id=apt-cache,target=/var/cache/apt set -x && \
libdata-validate-ip-perl \
libio-compress-perl \
libjson-maybexs-perl \
libcpanel-json-xs-perl \
liblist-allutils-perl \
liblist-someutils-perl \
# GraphViz2
Expand Down
2 changes: 2 additions & 0 deletions cpanfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ requires 'MIME::Base32';
requires 'Cache::Memcached::Fast'; #libcache-memcached-fast-perl
requires 'JSON'; # libjson-perl
requires 'JSON::PP'; # libjson-pp-perl
requires 'Cpanel::JSON::XS'; # libcpanel-json-xs-perl - fast parsing
requires 'JSON::MaybeXS'; # libjson-maybexs-perl
requires 'Clone'; # libclone-perl
requires 'Crypt::PasswdMD5'; # libcrypt-passwdmd5-perl
requires 'Encode::Detect'; # libencode-detect-perl
Expand Down
5 changes: 5 additions & 0 deletions scripts/gen_feeds_daily_off.sh
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,8 @@ cd /srv/off

./scripts/export_csv_file.pl --fields code,nutrition_grades_tags --separator ';' > $OFF_PUBLIC_DATA_DIR/exports/nutriscore.csv

# Once a week, on Sunday, run the madenearme generation script.
# `date +%u` prints the ISO day of the week (1 = Monday ... 7 = Sunday).
if [ "$(date +%u)" = "7" ]
then
./scripts/gen_madenearme_pages.sh
fi
47 changes: 35 additions & 12 deletions scripts/generate_madenearme_page.pl
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,8 @@
use URI::Escape::XS;
use Storable qw/dclone/;
use Encode;
use JSON::PP;
use Getopt::Long;
use JSON::MaybeXS;
use Log::Any qw($log);

use ProductOpener::Lang qw/:all/;
Expand All @@ -62,7 +63,7 @@ ($cc)

# parse the JSONL to find all products for country with emb_codes_tags
# return an iterator
sub iter_products_from_jsonl ($jsonl_path, $country) {
sub iter_products_from_jsonl ($jsonl_path, $country, $verbose = undef) {
my $jsonl;
if ($jsonl_path =~ /\.gz$/) {
open($jsonl, "-|", "gunzip -c $jsonl_path") or die("can’t open pipe to $jsonl_path");
Expand All @@ -72,19 +73,36 @@ ($jsonl_path, $country)
or die("$jsonl_path not found\n");
}
my $is_world = $country eq "en:world";
my $line_count = 0;
my $product_count = 0;
my $start = time();
# iterator
return sub {
while (my $line = <$jsonl>) {
# quickly verify we have emb_codes_tags without parsing json
next unless $line =~ /emb_codes_tags/;
if ($verbose && !($line_count % 100000)) {
my $t = time() - $start;
print("$line_count lines processed ($product_count products) in $t seconds\n");
}
$line_count++;
# quickly verify we have emb_codes_tags and countries_tags
# without parsing json as it is slow
my @emb_code_tags = ();
my @countries_tags = ();
if ($line =~ /emb_codes_tags["'] *: *(\[[^\]]+\])/) {
@emb_code_tags = @{decode_json($1)};
if ($line =~ /countries_tags["'] *: *(\[[^\]]+\])/) {
@countries_tags = @{decode_json($1)};
}
}
my $product_ref;
eval {
$product_ref = decode_json($line);
1;
} or next;
if ( (defined $product_ref->{emb_codes_tags})
&& ($is_world || (grep {$_ eq $country} @{$product_ref->{countries_tags}})))
if ( (scalar @emb_code_tags)
&& ($is_world || (grep {$_ eq $country} @countries_tags)))
{
eval {
$product_ref = decode_json($line);
1;
} or next;
$product_count++;
return $product_ref;
}
}
Expand All @@ -93,6 +111,11 @@ ($jsonl_path, $country)
};
}

my $usage = "Usage: $0 <country code (or world)> <language code> [--verbose]\n";
# --verbose option
my $verbose = undef;
GetOptions("verbose" => \$verbose) or die($usage);

$cc = $ARGV[0];
$lc = $ARGV[1];
$subdomain = $cc;
Expand All @@ -103,7 +126,7 @@ ($jsonl_path, $country)
$lang = $lc;

if ((not defined $cc) or (not defined $lc)) {
die("Pass country code (or world) and language code as arguments.\n");
die("$usage\nError: Pass country code (or world) and language code as arguments.\n");
}
else {
if (defined $country_codes{$cc}) {
Expand All @@ -128,7 +151,7 @@ ($jsonl_path, $country)
$log->info("finding products", {lc => $lc, cc => $cc, country => $country}) if $log->is_info();

my $jsonl_path = "$BASE_DIRS{PUBLIC_DATA}/openfoodfacts-products.jsonl.gz";
my $products_iter = iter_products_from_jsonl($jsonl_path, $country);
my $products_iter = iter_products_from_jsonl($jsonl_path, $country, $verbose);

$request_ref->{map_options} = $map_options{$cc} || "";
my $map_html = map_of_products($products_iter, $request_ref, $graph_ref);
Expand Down
Binary file not shown.
7 changes: 5 additions & 2 deletions tests/integration/madenearme.t
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,11 @@ my @tests = (
"testid" => "world-en",
"args" => ["world", "en"],
"matched_products" =>
qr/3 products match the search criteria, of which 2 products have a known production place/,
"geopoints" => ['"geo":[43.9753575,2.9912097]', '"geo":[50.3792391,3.0399349]'],
qr/3 products match the search criteria, of which 3 products have a known production place/,
"geopoints" => [
'"geo":[43.9753575,2.9912097]', '"geo":[50.3792391,3.0399349]',
'"geo":[48.71119,10.62904]', '"geo":[48.71119,10.62904]'
],
},
{
"testid" => "fr-fr",
Expand Down

0 comments on commit 998ddb4

Please sign in to comment.