Skip to content

Commit

Permalink
Merge pull request #15 from benniekrijger/feature-keyword-extraction
Browse files Browse the repository at this point in the history
Added keyword extraction feature
  • Loading branch information
fieg committed Sep 17, 2014
2 parents 28a2ca8 + 8c618a7 commit d672d38
Show file tree
Hide file tree
Showing 2 changed files with 163 additions and 0 deletions.
8 changes: 8 additions & 0 deletions src/FM/ClassificationBundle/Extractor/ExtractorInterfac.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
<?php

namespace FM\ClassificationBundle\Extractor;

/**
* Contract for components that extract information from a single input.
*/
interface ExtractorInterface
{
/**
* Runs the extraction on the given input.
*
* The interface itself does not constrain the argument or return types.
* NOTE(review): the TFIDFExtractor implementation expects a string and
* returns an array of scores keyed by token - confirm whether every
* implementation is meant to follow that shape.
*
* @param mixed $input
*
* @return mixed
*/
public function extract($input);
}
155 changes: 155 additions & 0 deletions src/FM/ClassificationBundle/Extractor/TFIDFExtractor.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
<?php

namespace FM\ClassificationBundle\Extractor;

use FM\ClassificationBundle\DataSource\DataSourceInterface;
use FM\ClassificationBundle\Tokenizer\TokenizerInterface;

/**
 * Scores the tokens of a text with a TF-IDF style weight, based on token
 * statistics gathered from a training data source.
 *
 * Usage: call train() with a data source of strings, then extract() with the
 * text to score.
 */
class TFIDFExtractor implements ExtractorInterface
{
    /**
     * Number of documents processed during training.
     *
     * @var int
     */
    protected $docCount = 0;

    /**
     * Number of training documents each token occurred in, keyed by token.
     *
     * @var array
     */
    protected $tokenDocCount = [];

    /**
     * Total number of occurrences of each token over all training documents,
     * keyed by token.
     *
     * @var array
     */
    protected $tokens = [];

    /**
     * Highest frequency each token reached within a single training document,
     * keyed by token.
     *
     * @var array
     */
    protected $maxTokenFrequency = [];

    /**
     * @var TokenizerInterface
     */
    protected $tokenizer;

    /**
     * @param TokenizerInterface $tokenizer Splits documents into tokens.
     */
    public function __construct(TokenizerInterface $tokenizer)
    {
        $this->tokenizer = $tokenizer;
    }

    /**
     * Rebuilds the token statistics from scratch using the given data source.
     *
     * Any statistics from a previous call are discarded first.
     *
     * @param DataSourceInterface $dataSource Iterable source of string documents.
     *
     * @throws \RuntimeException When the data source yields a non-string item.
     *
     * @return bool True when at least one document was processed.
     */
    public function train(DataSourceInterface $dataSource)
    {
        $this->docCount = 0;
        $this->tokenDocCount = [];
        $this->tokens = [];
        $this->maxTokenFrequency = [];

        foreach ($dataSource as $data) {
            if (!is_string($data)) {
                throw new \RuntimeException('Data source should only contain strings');
            }

            $this->trainOne($data);
        }

        return $this->docCount > 0;
    }

    /**
     * Adds the token statistics of a single document to the model.
     *
     * @param string $data
     */
    protected function trainOne($data)
    {
        $tokenFrequency = $this->countTokens($this->tokenizer->tokenize($data));

        foreach ($tokenFrequency as $token => $frequency) {
            // Total occurrences over all documents.
            if (!isset($this->tokens[$token])) {
                $this->tokens[$token] = 0;
            }
            $this->tokens[$token] += $frequency;

            // Each distinct token counts once per document.
            if (!isset($this->tokenDocCount[$token])) {
                $this->tokenDocCount[$token] = 0;
            }
            $this->tokenDocCount[$token]++;

            // Record the first frequency seen as well as any later, higher one.
            // Requiring the entry to already exist would leave the map empty forever.
            if (!isset($this->maxTokenFrequency[$token]) || $frequency > $this->maxTokenFrequency[$token]) {
                $this->maxTokenFrequency[$token] = $frequency;
            }
        }

        $this->docCount++;
    }

    /**
     * Scores every distinct token of the input.
     *
     * Tokens written entirely in upper case get a bonus of 2, tokens that only
     * start with an upper case letter get a bonus of 1.
     *
     * @param string $input Text to tokenize and score.
     *
     * @throws \RuntimeException When the extractor has not been trained.
     *
     * @return array Scores keyed by token, in order of first occurrence.
     */
    public function extract($input)
    {
        if ($this->docCount < 1) {
            throw new \RuntimeException('Unable to extract, extractor has no data!');
        }

        $tokenFrequency = $this->countTokens($this->tokenizer->tokenize($input));

        $classifiedTokens = [];
        foreach ($tokenFrequency as $token => $frequency) {
            $tokenDocCount = isset($this->tokenDocCount[$token]) ? $this->tokenDocCount[$token] : 0;
            $maxSeenFrequency = isset($this->maxTokenFrequency[$token]) ? $this->maxTokenFrequency[$token] : 0;

            // The +1 terms keep both divisions defined for tokens that were
            // never seen during training.
            // NOTE(review): as written, only the frequency term is multiplied
            // by the log factor; the leading 0.5 is added afterwards. Confirm
            // whether (0.5 + tf) * idf was intended instead.
            $tfidf = 0.5 + ((0.5 * $frequency) / (1 + $maxSeenFrequency)) * log($this->docCount / (1 + $tokenDocCount));

            // Array keys that look like integers are converted to ints by PHP,
            // so cast back to string before inspecting characters.
            $tfidf += $this->capitalizationBonus((string) $token);

            $classifiedTokens[$token] = $tfidf;
        }

        return $classifiedTokens;
    }

    /**
     * Counts how often each token occurs in the given token list.
     *
     * @param array|\Traversable $tokens
     *
     * @return array Frequencies keyed by token, in order of first occurrence.
     */
    private function countTokens($tokens)
    {
        $frequencies = [];
        foreach ($tokens as $token) {
            if (!isset($frequencies[$token])) {
                $frequencies[$token] = 0;
            }
            $frequencies[$token]++;
        }

        return $frequencies;
    }

    /**
     * Returns the score bonus awarded for upper case usage in a token.
     *
     * @param string $token
     *
     * @return int 2 for an all upper case token, 1 for a leading upper case
     *             letter, 0 otherwise (including the empty string).
     */
    private function capitalizationBonus($token)
    {
        if ($token === '') {
            return 0;
        }

        if (ctype_upper($token)) {
            return 2;
        }

        if (ctype_upper($token[0])) {
            return 1;
        }

        return 0;
    }
}

0 comments on commit d672d38

Please sign in to comment.