Skip to content

Commit

Permalink
Hash-based caching (#11)
Browse files Browse the repository at this point in the history
* Hash-based caching
simplepie#401
FreshRSS/FreshRSS@9aab83a
FreshRSS/FreshRSS@00127f0

* Backport fix
FreshRSS/FreshRSS#6723

* A few fixes

* Reduce changes

* Reduce changes

* Relax some tests

* Fix comment

* Fix a few tests

* PHP 7.2 compatibility

* Behaviour fixes

* Simplification

* Comment

* Fix tests

* Remove debug logs

* Minor comment

* Fix type mess with $this->data['headers']
  • Loading branch information
Alkarex authored Sep 8, 2024
1 parent a2f4d55 commit a169376
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 19 deletions.
9 changes: 5 additions & 4 deletions src/Cache/BaseDataCache.php
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,11 @@ public function get_data(string $key, $default = null)
return $default;
}

// ignore data if internal cache expiration time is expired
if ($data['__cache_expiration_time'] < time()) {
return $default;
}
// FreshRSS commented out, to allow HTTP 304
// // ignore data if internal cache expiration time is expired
// if ($data['__cache_expiration_time'] < time()) {
// return $default;
// }

// remove internal cache expiration time
unset($data['__cache_expiration_time']);
Expand Down
59 changes: 55 additions & 4 deletions src/SimplePie.php
Original file line number Diff line number Diff line change
Expand Up @@ -1634,14 +1634,48 @@ public function enable_exceptions(bool $enable = true)
$this->enable_exceptions = $enable;
}

/**
* Computes a hash of the raw feed content,
* after having cleaned it from noisy elements such as statistics or comments.
* FreshRSS
* @return string $rss A hash of the cleaned content, or empty string in case of error.
*/
function clean_hash(string $rss): string
{
if ($rss === '') {
return '';
}
//Process by chunks not to use too much memory
if (($stream = fopen('php://temp', 'r+'))
&& fwrite($stream, $rss)
&& rewind($stream)
) {
$ctx = hash_init('sha1');
while ($stream_data = fread($stream, 1048576)) {
hash_update(
$ctx, preg_replace(
[
'#<(lastBuildDate|pubDate|updated|feedDate|dc:date|slash:comments)>[^<]+</\\1>#',
'#<(media:starRating|media:statistics) [^/<>]+/>#',
'#<!--.+?-->#s',
], '', $stream_data
)
);
}
fclose($stream);
return hash_final($ctx);
}
return '';
}

/**
* Initialize the feed object
*
* This is what makes everything happen. Period. This is where all of the
* configuration options get processed, feeds are fetched, cached, and
* parsed, and all of that other good stuff.
*
* @return bool True if successful, false otherwise
* @return bool|int positive integer with modification time if using cache, boolean true if otherwise successful, false otherwise // FreshRSS
*/
public function init()
{
Expand Down Expand Up @@ -1729,7 +1763,7 @@ public function init()

// Fetch the data into $this->raw_data
if (($fetched = $this->fetch_data($cache)) === true) {
return true;
return $this->data['mtime'] ?? true; // FreshRSS
} elseif ($fetched === false) {
return false;
}
Expand Down Expand Up @@ -1803,6 +1837,8 @@ public function init()
$this->data['headers'] = $headers;
}
$this->data['build'] = Misc::get_build();
$this->data['hash'] = $this->data['hash'] ?? $this->clean_hash($this->raw_data); // FreshRSS
$this->data['mtime'] = time(); // FreshRSS

// Cache the file if caching is enabled
$this->data['cache_expiration_time'] = $this->cache_duration + time();
Expand Down Expand Up @@ -1850,6 +1886,7 @@ public function init()
*
* @param Base|DataCache|false $cache Cache handler, or false to not load from the cache
* @return array{array<string, string>, string}|array{}|bool Returns true if the data was loaded from the cache, or an array of HTTP headers and sniffed type
* @phpstan-impure
*/
protected function fetch_data(&$cache)
{
Expand Down Expand Up @@ -1901,7 +1938,8 @@ protected function fetch_data(&$cache)
$this->data = [];
}
// Check if the cache has been updated
elseif (isset($this->data['cache_expiration_time']) && $this->data['cache_expiration_time'] > time()) {
// elseif (isset($this->data['cache_expiration_time']) && $this->data['cache_expiration_time'] > time()) { // FreshRSS
elseif (empty($this->data['mtime']) || $this->data['mtime'] + $this->cache_duration <= time()) {
// Want to know if we tried to send last-modified and/or etag headers
// when requesting this file. (Note that it's up to the file to
// support this, but we don't always send the headers either.)
Expand Down Expand Up @@ -1942,6 +1980,18 @@ protected function fetch_data(&$cache)
return true;
}
}
if (isset($file)) { // FreshRSS
$hash = $this->clean_hash($file->get_body_content());
if (($this->data['hash'] ?? null) === $hash) {
$this->data['headers'] = array_map(function (array $values): string {
return implode(',', $values);
}, $file->get_headers());
$cache->set_data($cacheKey, $this->data, $this->cache_duration);
return true; // Content unchanged even though server did not send a 304
} else {
$this->data['hash'] = $hash;
}
}
}
// If the cache is still valid, just return true
else {
Expand Down Expand Up @@ -2069,6 +2119,8 @@ protected function fetch_data(&$cache)
'feed_url' => $file->get_final_requested_uri(),
'build' => Misc::get_build(),
'cache_expiration_time' => $this->cache_duration + time(),
'hash' => empty($hash) ? $this->clean_hash($file->get_body_content()) : $hash, // FreshRSS
'mtime' => time(), // FreshRSS
];

if (!$cache->set_data($cacheKey, $this->data, $this->cache_duration)) {
Expand All @@ -2081,7 +2133,6 @@ protected function fetch_data(&$cache)
}

$this->raw_data = $file->get_body_content();
$this->raw_data = trim($this->raw_data);
$this->permanent_url = $file->get_permanent_uri();

$headers = [];
Expand Down
3 changes: 2 additions & 1 deletion tests/Integration/CachingTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ public function testInitWithDifferentCacheStateCallsCacheCorrectly(
$expectedDataWritten['cache_expiration_time'] = $writtenData['cache_expiration_time'];
}

$this->assertSame($expectedDataWritten, $writtenData);
// Test that $expectedDataWritten is a subset of $writtenData
$this->assertEmpty(array_udiff_assoc($expectedDataWritten, $writtenData, function ($a, $b) { return json_encode($a) <=> json_encode($b); })); // FreshRSS
}

/**
Expand Down
21 changes: 11 additions & 10 deletions tests/Unit/Cache/BaseDataCacheTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -69,19 +69,20 @@ public function testGetDataWithCacheMissReturnsDefault(): void
$this->assertSame($default, $cache->get_data($key, $default));
}

public function testGetDataWithCacheExpiredReturnsDefault(): void
{
$key = 'name';
$cachedValue = ['__cache_expiration_time' => 0];
$default = new stdClass();
// FreshRSS commented out, to allow HTTP 304
// public function testGetDataWithCacheExpiredReturnsDefault(): void
// {
// $key = 'name';
// $cachedValue = ['__cache_expiration_time' => 0];
// $default = new stdClass();

$baseCache = $this->createMock(Base::class);
$baseCache->expects($this->once())->method('load')->willReturn($cachedValue);
// $baseCache = $this->createMock(Base::class);
// $baseCache->expects($this->once())->method('load')->willReturn($cachedValue);

$cache = new BaseDataCache($baseCache);
// $cache = new BaseDataCache($baseCache);

$this->assertSame($default, $cache->get_data($key, $default));
}
// $this->assertSame($default, $cache->get_data($key, $default));
// }

public function testGetDataWithCacheCorruptionReturnsDefault(): void
{
Expand Down

0 comments on commit a169376

Please sign in to comment.