Skip to content

Commit

Permalink
Merge pull request #7377 from piwik/7181_isolated_archive_purging
Browse files Browse the repository at this point in the history
refactor archive purging for clarity and resilience.
  • Loading branch information
Matthieu Aubry committed Mar 13, 2015
2 parents f95d109 + f19f7fe commit ac87933
Show file tree
Hide file tree
Showing 31 changed files with 1,708 additions and 386 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ This is a changelog for Piwik platform developers. All changes for our HTTP API'
* `isIpInRange()`
* `getHostByAddr()`

### New commands
* There is now a command `core:purge-old-archive-data` that can be used to manually purge temporary, error-ed and invalidated archives from one or more archive tables.

## Piwik 2.11.0

### Breaking Changes
Expand Down
2 changes: 1 addition & 1 deletion core/Archive.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

use Piwik\Archive\Parameters;
use Piwik\ArchiveProcessor\Rules;
use Piwik\DataAccess\ArchiveInvalidator;
use Piwik\Archive\ArchiveInvalidator;
use Piwik\DataAccess\ArchiveSelector;
use Piwik\Period\Factory as PeriodFactory;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,28 +7,46 @@
*
*/

namespace Piwik\DataAccess;
namespace Piwik\Archive;

use Piwik\CronArchive\SitesToReprocessDistributedList;
use Piwik\DataAccess\ArchiveTableCreator;
use Piwik\DataAccess\Model;
use Piwik\Date;
use Piwik\Db;
use Piwik\Option;
use Piwik\Plugins\CoreAdminHome\Tasks\ArchivesToPurgeDistributedList;
use Piwik\Plugins\PrivacyManager\PrivacyManager;
use Piwik\Period;
use Piwik\Period\Week;
use Piwik\Plugins\SitesManager\Model as SitesManagerModel;
use Piwik\Site;

/**
* Marks archives as Invalidated by setting the done flag to a special value (see Model->updateArchiveAsInvalidated)
* Service that can be used to invalidate archives or add archive references to a list so they will
* be invalidated later.
*
* Invalidated archives can still be selected and displayed in UI and API (until they are reprocessed by core:archive)
* Archives are put in an "invalidated" state by setting the done flag to `ArchiveWriter::DONE_INVALIDATED`.
* This class also adds the archive's associated site to a distributed list and adds the archive's year month to another
* distributed list.
*
* The invalidated archives will be deleted by ArchivePurger
* CronArchive will reprocess the archive data for all sites in the first list, and a scheduled task
* will purge the old, invalidated data in archive tables identified by the second list.
*
* @package Piwik\DataAccess
* Until CronArchive, or browser triggered archiving, re-processes data for an invalidated archive, the invalidated
* archive data will still be displayed in the UI and API.
*
* ### Deferred Invalidation
*
* Invalidating archives means running queries on one or more archive tables. In some situations, like during
* tracking, this is not desired. In such cases, archive references can be added to a list via the
* rememberToInvalidateArchivedReportsLater method, which will add the reference to a distributed list.
*
* Later, during Piwik's normal execution, the list will be read and every archive it references will
* be invalidated.
*/
class ArchiveInvalidator {

class ArchiveInvalidator
{
private $warningDates = array();
private $processedDates = array();
private $minimumDateWithLogs = false;
Expand Down Expand Up @@ -317,9 +335,11 @@ private function persistInvalidatedArchives(array $idSites, $datesByMonth)
$yearMonths = array_keys($datesByMonth);
$yearMonths = array_unique($yearMonths);

$store = new InvalidatedReports();
$store->addInvalidatedSitesToReprocess($idSites);
$store->addSitesToPurgeForYearMonths($idSites, $yearMonths);
$store = new SitesToReprocessDistributedList();
$store->add($idSites);

$archivesToPurge = new ArchivesToPurgeDistributedList();
$archivesToPurge->add($yearMonths);
}

private static function getModel()
Expand Down
227 changes: 227 additions & 0 deletions core/Archive/ArchivePurger.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,227 @@
<?php
/**
* Piwik - free/libre analytics platform
*
* @link http://piwik.org
* @license http://www.gnu.org/licenses/gpl-3.0.html GPL v3 or later
*
*/
namespace Piwik\Archive;

use Piwik\ArchiveProcessor\Rules;
use Piwik\Config;
use Piwik\DataAccess\ArchiveTableCreator;
use Piwik\DataAccess\Model;
use Piwik\Date;
use Piwik\Db;
use Piwik\Log;
use Piwik\Piwik;

/**
 * Service responsible for removing archive data that is no longer needed from archive tables.
 *
 * Four kinds of archives are handled:
 *
 * - Temporary archives: removed once they were archived before a cut-off time. The cut-off
 *   depends on whether browser triggered archiving is enabled.
 * - Error-ed archives: removed without any further constraint.
 * - Invalidated archives: removed only when a newer, valid archive exists for the same site,
 *   date and period. Archives are invalidated through Piwik\Archive\ArchiveInvalidator.
 * - Custom range archives: removed once they are older than a configurable threshold.
 */
class ArchivePurger
{
    /**
     * Data access object used to query and delete archive rows.
     *
     * @var Model
     */
    private $model;

    /**
     * Threshold used when purging custom range archives: range archives older than this value
     * are removed unconditionally from the requested archive table.
     *
     * Holds either the Date handed to the constructor or, when none was given, the value
     * computed by getDefaultCustomRangeToPurgeAgeThreshold() (the result of Date::getDateTime(),
     * presumably a datetime string -- confirm against Date).
     *
     * @var Date|string
     */
    private $purgeCustomRangesOlderThan;

    /**
     * The date treated as 'yesterday'. Overridable so tests can control it.
     *
     * @var Date
     */
    private $yesterday;

    /**
     * The date treated as 'today'. Overridable so tests can control it.
     *
     * @var Date
     */
    private $today;

    /**
     * The UNIX timestamp treated as 'now'. Overridable so tests can control it.
     *
     * @var int
     */
    private $now;

    /**
     * @param Model|null $model Data access object; a new Model is created when omitted.
     * @param Date|null $purgeCustomRangesOlderThan Threshold for purging custom range archives;
     *                                              derived from the configuration when omitted.
     */
    public function __construct(Model $model = null, Date $purgeCustomRangesOlderThan = null)
    {
        if (null === $model) {
            $model = new Model();
        }
        $this->model = $model;

        if (null === $purgeCustomRangesOlderThan) {
            $purgeCustomRangesOlderThan = self::getDefaultCustomRangeToPurgeAgeThreshold();
        }
        $this->purgeCustomRangesOlderThan = $purgeCustomRangesOlderThan;

        $this->yesterday = Date::factory('yesterday');
        $this->today     = Date::factory('today');
        $this->now       = time();
    }

    /**
     * Purges every invalidated archive for which a newer, valid archive exists, from the archive
     * table that stores data for `$date`.
     *
     * @param Date $date The date identifying the archive table.
     */
    public function purgeInvalidatedArchivesFrom(Date $date)
    {
        $table = ArchiveTableCreator::getNumericTable($date);

        // Archive tables may hold a very large number of rows, so running the INNER JOIN over all
        // of them is avoided: the sites that actually have invalidated archives are looked up
        // first and then passed as a constraint to getInvalidatedArchiveIdsSafeToDelete(), which
        // lets that query hit an INDEX.
        $sitesWithInvalidatedArchives = $this->model->getSitesWithInvalidatedArchive($table);

        if (!empty($sitesWithInvalidatedArchives)) {
            $deletableIds = $this->model->getInvalidatedArchiveIdsSafeToDelete($table, $sitesWithInvalidatedArchives);

            if (!empty($deletableIds)) {
                $this->deleteArchiveIds($date, $deletableIds);
            }
        }
    }

    /**
     * Removes outdated archives for the given month, i.e. archives whose done flag is
     * ArchiveWriter::DONE_OK_TEMPORARY or ArchiveWriter::DONE_ERROR.
     *
     * @param Date $dateStart Only the month will be used
     */
    public function purgeOutdatedArchives(Date $dateStart)
    {
        $threshold = $this->getOldestTemporaryArchiveToKeepThreshold();
        $outdatedIds = $this->getOutdatedArchiveIds($dateStart, $threshold);

        $hasOutdatedArchives = !empty($outdatedIds);
        if ($hasOutdatedArchives) {
            $this->deleteArchiveIds($dateStart, $outdatedIds);
        }

        // the summary is logged whether or not anything was deleted
        Log::debug(
            "Purging temporary archives: done [ purged archives older than %s in %s ] [Deleted IDs: %s]",
            $threshold,
            $dateStart->toString("Y-m"),
            implode(',', $outdatedIds)
        );
    }

    /**
     * Collects the IDs of the temporary archives that are older than the given threshold in the
     * numeric archive table for `$date`.
     *
     * @param Date $date The date identifying the archive table.
     * @param mixed $purgeArchivesOlderThan Threshold forwarded to Model::getTemporaryArchivesOlderThan().
     * @return array List of archive IDs, empty when nothing was found.
     */
    protected function getOutdatedArchiveIds(Date $date, $purgeArchivesOlderThan)
    {
        $table = ArchiveTableCreator::getNumericTable($date);
        $rows = $this->model->getTemporaryArchivesOlderThan($table, $purgeArchivesOlderThan);

        $ids = array();
        if (empty($rows)) {
            return $ids;
        }

        foreach ($rows as $archiveRow) {
            $ids[] = $archiveRow['idarchive'];
        }

        return $ids;
    }

    /**
     * Deletes "Custom Date Range" archives that are older than the configured threshold: they can
     * always be re-processed and would otherwise take up space needlessly.
     *
     * @param Date $date The date identifying the archive tables.
     */
    public function purgeArchivesWithPeriodRange(Date $date)
    {
        $numericArchiveTable = ArchiveTableCreator::getNumericTable($date);
        $blobArchiveTable = ArchiveTableCreator::getBlobTable($date);
        $rangePeriodId = Piwik::$idPeriods['range'];

        $this->model->deleteArchivesWithPeriod(
            $numericArchiveTable,
            $blobArchiveTable,
            $rangePeriodId,
            $this->purgeCustomRangesOlderThan
        );

        Log::debug(
            "Purging Custom Range archives: done [ purged archives older than %s from %s / blob ]",
            $this->purgeCustomRangesOlderThan,
            $numericArchiveTable
        );
    }

    /**
     * Deletes the given archive IDs from the numeric and blob tables of the specified month,
     * in batches of 1000 IDs.
     *
     * @param Date $date The date identifying the archive tables.
     * @param array $idArchivesToDelete The archive IDs to delete.
     */
    protected function deleteArchiveIds(Date $date, $idArchivesToDelete)
    {
        $chunks = array_chunk($idArchivesToDelete, 1000);

        $numericArchiveTable = ArchiveTableCreator::getNumericTable($date);
        $blobArchiveTable = ArchiveTableCreator::getBlobTable($date);

        foreach ($chunks as $chunk) {
            $this->model->deleteArchiveIds($numericArchiveTable, $blobArchiveTable, $chunk);
        }
    }

    /**
     * Returns the cut-off before which temporary archives are considered outdated and may be purged.
     *
     * @return string|bool The result of Date::getDateTime() for the cut-off.
     *                     NOTE(review): previously documented as int; getDateTime() appears to
     *                     yield a datetime string -- confirm against Date.
     */
    protected function getOldestTemporaryArchiveToKeepThreshold()
    {
        $todayArchiveTtl = Rules::getTodayArchiveTimeToLive();

        if (!Rules::isBrowserTriggerEnabled()) {
            // Reports are built by the core:archive cron command: every temporary report
            // from today has to be kept.
            return $this->yesterday->getDateTime();
        }

        // With browser archiving there are likely many more temporary archives. Purging more
        // aggressively is safe because reports are re-processed on demand.
        $cutOffTimestamp = $this->now - 2 * $todayArchiveTtl;

        return Date::factory($cutOffTimestamp)->getDateTime();
    }

    /**
     * Computes the default threshold for purging custom range archives: today minus the number
     * of days configured in [General] purge_date_range_archives_after_X_days.
     *
     * @return string The result of Date::getDateTime() (presumably a datetime string -- confirm).
     */
    private static function getDefaultCustomRangeToPurgeAgeThreshold()
    {
        $maxAgeInDays = Config::getInstance()->General['purge_date_range_archives_after_X_days'];
        $cutOff = Date::factory('today')->subDay($maxAgeInDays);

        return $cutOff->getDateTime();
    }

    /**
     * Overrides the date used as 'yesterday'. For tests.
     *
     * @param Date $yesterday
     */
    public function setYesterdayDate(Date $yesterday)
    {
        $this->yesterday = $yesterday;
    }

    /**
     * Overrides the date used as 'today'. For tests.
     *
     * @param Date $today
     */
    public function setTodayDate(Date $today)
    {
        $this->today = $today;
    }

    /**
     * Overrides the timestamp used as 'now'. For tests.
     *
     * @param int $now
     */
    public function setNow($now)
    {
        $this->now = $now;
    }
}
35 changes: 1 addition & 34 deletions core/ArchiveProcessor/Rules.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

use Exception;
use Piwik\Config;
use Piwik\Container\StaticContainer;
use Piwik\DataAccess\ArchiveWriter;
use Piwik\Date;
use Piwik\Log;
Expand Down Expand Up @@ -113,37 +112,6 @@ public static function getDoneFlags(array $plugins, Segment $segment)
return $doneFlags;
}

/**
 * Decides whether outdated archives may be purged and, if so, from which cut-off.
 *
 * Returns false when the current request is not authorized to archive; otherwise returns the
 * cut-off before which outdated (previously processed) archives can be purged.
 *
 * Note: the caller of this function is assumed to purge the outdated archives afterwards.
 *
 * @param \Piwik\Date $date
 * @return string|bool The result of Date::getDateTime() for the cut-off, or false to skip purging.
 *                     NOTE(review): previously documented as int; getDateTime() appears to
 *                     yield a datetime string -- confirm against Date.
 */
public static function shouldPurgeOutdatedArchives(Date $date)
{
    // Archives are only deleted by requests that are also able to process them. Otherwise a
    // browser request carrying &segment= (or a custom date range) could delete temporary
    // archives it cannot rebuild until the next cron run, which may be more than an hour away.
    $mayArchive = self::isRequestAuthorizedToArchive();
    if (!$mayArchive) {
        Log::info("Purging temporary archives: skipped (no authorization)");
        return false;
    }

    $todayArchiveTtl = self::getTodayArchiveTimeToLive();

    if (!self::isBrowserTriggerEnabled()) {
        // Reports are built by the core:archive cron command: keep every temporary report from today.
        return Date::factory('yesterday')->getDateTime();
    }

    // With browser archiving there are likely many more temporary archives. Purging more often
    // is safe because reports are re-processed on demand.
    $cutOffTimestamp = time() - 2 * $todayArchiveTtl;

    return Date::factory($cutOffTimestamp)->getDateTime();
}

public static function getMinTimeProcessedForTemporaryArchive(
Date $dateStart, \Piwik\Period $period, Segment $segment, Site $site)
{
Expand Down Expand Up @@ -309,5 +277,4 @@ public static function getSelectableDoneFlagValues()

return $possibleValues;
}

}
}
Loading

0 comments on commit ac87933

Please sign in to comment.