Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: Reindex crawler for files (#28) #36

Merged
merged 2 commits into from
Jun 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,27 @@ SilverStripe\CMS\Model\SiteTree:
- Ichaber\SSSwiftype\Extensions\SwiftypeMetaTagContentExtension
```

### Indexing Files
If you are using the Swiftype Crawler, and would like to add "re-crawl" actions after your Files un/publish, you can
apply `SwiftypeFileCrawlerExtension` to `File` (or another model of your choice).

```yml
SilverStripe\CMS\Model\File:
extensions:
- Ichaber\SSSwiftype\Extensions\SwiftypeFileCrawlerExtension
```

There is also a config to allow certain file types from being indexed/reindexed.
```yml
Ichaber\SSSwiftype\Extensions\SwiftypeFileCrawlerExtension:
reindex_allowed_extensions:
- pdf
```
> NB: by default this config does not set any extensions to be indexed.

## Indexing custom DataObjects
If you want to index your own custom DataObjects, simply create your own crawler extension by extending

## Adding your own Meta Tags

You can easily add your own classes to your objects (see [Installation](#Installation)).
Expand Down
292 changes: 292 additions & 0 deletions src/Extensions/AbstractSwiftypeCrawlerExtension.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,292 @@
<?php

namespace Ichaber\SSSwiftype\Extensions;

use Ichaber\SSSwiftype\Service\SwiftypeCrawler;
use SilverStripe\ORM\DataExtension;
use SilverStripe\ORM\DataObject;
use SilverStripe\Control\Director;
use SilverStripe\Core\Config\Config;
use SilverStripe\Versioned\Versioned;

/**
* @method DataObject|$this getOwner()
*/
abstract class AbstractSwiftypeCrawlerExtension extends DataExtension
{
/**
* Urls to crawl
*
* array keyed by getOwnerKey
*/
private array $urlsToCrawl = [];

public function setUrlsToCrawl(array $urls)
{
$this->urlsToCrawl = $urls;
}

public function getUrlsToCrawl(): array
{
return $this->urlsToCrawl;
}

/**
* We need to collate Urls before we write, just in case an author has changed the File's name (URL). If they
* have, then we need to request Swiftype to reindex both the old Url (which should then be marked by Swiftype
* as a 404), and the new Url
*/
public function onBeforeWrite(): void
{
$this->collateUrls();
}

/**
* After a publish has occurred, we can collate and process immediately (no need to split things out like during
* an unpublish)
*/
public function onAfterPublish(&$original): void
{
$this->collateUrls();
$this->processCollatedUrls();

// Check to see if the clearing of cache has been disabled (useful for unit testing, or any other reason you
// might have to disable it)
$clearCacheDisabled = Config::inst()->get(static::class, 'clear_cache_disabled');

if ($clearCacheDisabled) {
return;
}

// It's important that we clear the cache after we have finished requesting reindex from Swiftype
$this->clearCacheSingle();
}

/**
* We need to collate the Urls to be purged *before* we complete the unpublish action (otherwise, the LIVE Urls
* will no longer be available, since the page is now unpublished)
*/
public function onBeforeUnpublish(): void
{
$this->collateUrls();
}

/**
* After the unpublish has completed, we can now request Swiftype to reindex the Urls that we collated
*/
public function onAfterUnpublish(): void
{
$this->processCollatedUrls();

// Check to see if the clearing of cache has been disabled (useful for unit testing, or any other reason you
// might have to disable it)
$clearCacheDisabled = Config::inst()->get(static::class, 'clear_cache_disabled');

if ($clearCacheDisabled) {
return;
}

// It's important that we clear the cache after we have finished requesting reindex from Swiftype
$this->clearCacheSingle();
}

/**
* You may need to clear the cache at some point during your particular process
*
* Reset all Urls for any/all objects that might be in the cache (keeping in mind that Extensions are singleton,
* so the UrlsToCache could be accessed via singleton and it could contain Urls for many owner objects)
*
* We don't use flushCache (which is called from DataObject) because this is called between write and un/publish,
* and we need our cache to persist through these states
*/
public function clearCacheAll(): void
{
$this->setUrlsToCrawl([]);
}

/**
* You may need to clear the cache at some point during your particular process
*
* Reset only the Urls related to this particular owner object (keeping in mind that Extensions are singleton,
* so the UrlsToCache could be accessed via singleton and it could contain Urls for many owner objects)
*
* We don't use flushCache (which is called from DataObject) because this is called between write and un/publish,
* and we need our cache to persist through these states
*/
public function clearCacheSingle(): void
{
$urls = $this->getUrlsToCrawl();
$key = $this->getOwnerKey();

// Nothing for us to do here
if ($key === null) {
return;
}

// Nothing for us to do here
if (!array_key_exists($key, $urls)) {
return;
}

// Remove this key and it's Urls
unset($urls[$key]);

$this->setUrlsToCrawl($urls);
}

/**
* Collate Urls to crawl
*
* Extensions are singleton, so we use the owner key to make sure that we're only processing Urls directly related
* to the desired record
*
* You might need to collate more than one URL per Page (maybe you're using Fluent or another translation module).
* This is the method you will want to override in order to add that additional logic
*/
public function collateUrls(): void
{
if (!$this->recordCanBeIndexed()) {
return;
}

// Grab any existing Urls so that we can add to it
$urls = $this->getUrlsToCrawl();

// Set us to a LIVE stage/reading_mode
$this->withVersionContext(function () use (&$urls) {
$key = $this->getOwnerKey();

// We can't do anything if we don't have a key to use
if ($key === null) {
return;
}

// Create a new container for this key
if (!array_key_exists($key, $urls)) {
$urls[$key] = [];
}

// Grab the absolute live link without ?stage=Live appended
$link = $this->getOwnerLink();

// If this record is not published, or we're unable to get a "Live Link" (for whatever reason), then there
// is nothing more we can do here
if (!$link) {
return;
}

// Nothing for us to do here, the Link is already being tracked
if (in_array($link, $urls[$key])) {
return;
}

// Add our base URL to this key
$urls[$key][] = $link;
});

// Update the Urls we have stored for indexing
$this->setUrlsToCrawl($urls);
}

/**
* Send requests to Swiftype to reindex each of the Urls that we have previously collated
*/
protected function processCollatedUrls(): void
{
// Fetch the Urls that we need to reindex
$key = $this->getOwnerKey();

// We can't do anything if we don't have a key to process
if ($key === null) {
return;
}

$urls = $this->getUrlsToCrawl();

// There is nothing for us to do here if there are no Urls
if (count(array_keys($urls)) === 0) {
return;
}

// There are no Urls for this particular key
if (!array_key_exists($key, $urls)) {
return;
}

// Force the reindexing of each URL we collated
foreach ($urls[$key] as $url) {
$this->forceSwiftypeIndex($url);
}
}

protected function forceSwiftypeIndex(string $updateUrl): bool
{
// We don't reindex dev environments
if (Director::isDev()) {
return true;
}

$crawler = SwiftypeCrawler::create();

return $crawler->send($updateUrl);
}

protected function getOwnerKey(): ?string
{
$owner = $this->getOwner();

// Can't generate a key if the owner has not yet been written to the DB
if (!$owner->isInDB()) {
return null;
}

$key = str_replace('\\', '', $owner->ClassName . $owner->ID);

return $key;
}

/**
* Return the absolute link to the record, if the record is versioned return the live link
*/
abstract protected function getOwnerLink(): ?string;

/**
* Return true if a record can be indexed
*/
abstract protected function recordCanBeIndexed(): bool;

/**
* Sets the version context to Live as that's what crawlers will (normally) see
*
* The main function is to suppress the ?stage=Live querystring. LeftAndMain will set the default
* reading mode to 'DRAFT' when initialising so to counter this we need to re-set the default
* reading mode back to LIVE
*/
private function withVersionContext(callable $callback): void
{
Versioned::withVersionedMode(static function () use ($callback) {
// Grab our current stage and reading mode
$originalDefaultReadingMode = Versioned::get_default_reading_mode();
$originalReadingMode = Versioned::get_reading_mode();
$originalStage = Versioned::get_stage();

// Set our stage and reading mode to LIVE
Versioned::set_default_reading_mode('Stage.' . Versioned::LIVE);
Versioned::set_reading_mode('Stage.' . Versioned::LIVE);
Versioned::set_stage(Versioned::LIVE);

// Process whatever callback was provided
$callback();

// Set us back to the original stage and reading mode
if ($originalReadingMode) {
Versioned::set_default_reading_mode($originalDefaultReadingMode);
Versioned::set_reading_mode($originalReadingMode);
}

if ($originalStage) {
Versioned::set_stage($originalStage);
}
});
}
}
41 changes: 41 additions & 0 deletions src/Extensions/SwiftypeFileCrawlerExtension.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
<?php

namespace Ichaber\SSSwiftype\Extensions;

use SilverStripe\Assets\File;
use SilverStripe\Versioned\Versioned;

/**
* @method File|$this getOwner()
*/
class SwiftypeFileCrawlerExtension extends AbstractSwiftypeCrawlerExtension
{
/**
* @var array List of allowed file extensions to be reindexed.
*/
private static array $reindex_allowed_extensions = [];

protected function getOwnerLink(): ?string
{
/** @var File $live */
$live = Versioned::get_by_stage(File::class, Versioned::LIVE)->byID($this->getOwner()->ID);

if ($live) {
return $live->AbsoluteLink();
}

return null;
}

/**
* Check our file types allowlist since we don't want to index files that aren't required in the index
* e.g. image files.
*/
protected function recordCanBeIndexed(): bool
{
// only reindex file types we need.
$fileType = File::get_file_extension($this->getOwner()->Filename);

return in_array($fileType, $this->getOwner()->config()->get('reindex_allowed_extensions'), true);
}
}
Loading