-
-
Notifications
You must be signed in to change notification settings - Fork 40
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #184 from OP-Engineering/oscar/custom-c-files
Custom Tokenizers
- Loading branch information
Showing 28 changed files with 501 additions and 67 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -81,4 +81,6 @@ android/gradle/ | |
!.yarn/plugins | ||
!.yarn/releases | ||
!.yarn/sdks | ||
!.yarn/versions | ||
!.yarn/versions | ||
|
||
android/c_sources |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
#include "tokenizers.h"
#include <cctype>
#include <memory>
#include <new>
#include <string>
|
||
namespace opsqlite { | ||
|
||
/// Looks up the FTS5 API table for a database connection.
///
/// Prepares `SELECT fts5(?1)`, binds the address of a local `fts5_api *` as a
/// pointer value tagged "fts5_api_ptr", and steps the statement once so the
/// `fts5()` SQL function can write the API pointer through that address.
///
/// @param db  Connection to query.
/// @return The pointer written by the statement, or nullptr when the statement
///         could not be prepared or nothing was written.
fts5_api *fts5_api_from_db(sqlite3 *db) {
  fts5_api *api = nullptr;
  sqlite3_stmt *query = nullptr;

  const int prepareRc =
      sqlite3_prepare_v2(db, "SELECT fts5(?1)", -1, &query, nullptr);
  if (prepareRc == SQLITE_OK) {
    sqlite3_bind_pointer(query, 1, static_cast<void *>(&api), "fts5_api_ptr",
                         nullptr);
    sqlite3_step(query);
  }

  // Finalized unconditionally, including when the prepare step failed.
  sqlite3_finalize(query);
  return api;
}
|
||
/// Stateless tokenizer instance.
///
/// Holds no data; it exists so that the create/delete callbacks have a concrete
/// object to allocate and free. The compiler-generated special members are
/// sufficient, so none are declared.
class WordTokenizer {};
|
||
// Define `xCreate`, which initializes the tokenizer | ||
int wordTokenizerCreate(void *pUnused, const char **azArg, int nArg, | ||
Fts5Tokenizer **ppOut) { | ||
auto tokenizer = std::make_unique<WordTokenizer>(); | ||
*ppOut = reinterpret_cast<Fts5Tokenizer *>( | ||
tokenizer.release()); // Cast to Fts5Tokenizer* | ||
return SQLITE_OK; | ||
} | ||
|
||
// `xDelete` callback: destroys a tokenizer instance.
//
// pTokenizer is cast back to WordTokenizer and adopted by a unique_ptr, whose
// destructor deletes it when this function returns. A null pointer is a no-op.
void wordTokenizerDelete(Fts5Tokenizer *pTokenizer) {
  const std::unique_ptr<WordTokenizer> reclaimed(
      reinterpret_cast<WordTokenizer *>(pTokenizer));
}
|
||
// `xTokenize` callback: splits the input into runs of alphanumeric bytes.
//
// Every maximal run of bytes for which std::isalnum is true is reported as one
// token, in order of appearance; all other bytes act as separators. Token text
// is passed through unchanged (no case folding or other normalization).
//
// Parameters:
//   pTokenizer - tokenizer instance; not used.
//   pCtx       - opaque context forwarded as the first argument of xToken.
//   flags      - tokenization flags; not used.
//   pText      - input buffer (not required to be NUL-terminated; only the
//                first nText bytes are read).
//   nText      - number of bytes in pText.
//   xToken     - callback invoked once per token with
//                (pCtx, 0, token pointer, token length, start offset, end offset).
//
// Returns SQLITE_OK after the whole input is processed, or the first
// non-SQLITE_OK value returned by xToken, at which point scanning stops.
//
// NOTE(review): std::isalnum is locale-dependent; bytes of multi-byte UTF-8
// sequences are presumably treated as separators in the default locale —
// confirm this is the intended behavior for non-ASCII text.
int wordTokenizerTokenize(Fts5Tokenizer *pTokenizer, void *pCtx, int flags,
                          const char *pText, int nText,
                          int (*xToken)(void *, int, const char *, int, int,
                                        int)) {
  const auto isWordByte = [](char c) {
    return std::isalnum(static_cast<unsigned char>(c)) != 0;
  };

  int pos = 0;
  while (pos < nText) {
    // Skip over any separator bytes preceding the next token.
    while (pos < nText && !isWordByte(pText[pos])) {
      ++pos;
    }

    // Consume the run of word bytes, if any.
    const int tokenBegin = pos;
    while (pos < nText && isWordByte(pText[pos])) {
      ++pos;
    }

    if (pos > tokenBegin) {
      const int rc = xToken(pCtx, 0, pText + tokenBegin, pos - tokenBegin,
                            tokenBegin, pos);
      if (rc != SQLITE_OK) {
        return rc;
      }
    }
  }
  return SQLITE_OK;
}
|
||
int opsqlite_wordtokenizer_init(sqlite3 *db, char **error, | ||
sqlite3_api_routines const *api) { | ||
fts5_tokenizer wordtokenizer = {wordTokenizerCreate, wordTokenizerDelete, | ||
wordTokenizerTokenize}; | ||
|
||
fts5_api *ftsApi = (fts5_api *)fts5_api_from_db(db); | ||
if (ftsApi == NULL) | ||
return SQLITE_ERROR; | ||
|
||
return ftsApi->xCreateTokenizer(ftsApi, "wordtokenizer", NULL, &wordtokenizer, | ||
NULL); | ||
} | ||
|
||
int opsqlite_porter_init(sqlite3 *db, char **error, | ||
sqlite3_api_routines const *api) { | ||
fts5_tokenizer porter_tokenizer = {wordTokenizerCreate, wordTokenizerDelete, | ||
wordTokenizerTokenize}; | ||
|
||
fts5_api *ftsApi = (fts5_api *)fts5_api_from_db(db); | ||
if (ftsApi == nullptr) | ||
return SQLITE_ERROR; | ||
|
||
return ftsApi->xCreateTokenizer(ftsApi, "portertokenizer", NULL, | ||
&porter_tokenizer, NULL); | ||
} | ||
|
||
} // namespace opsqlite |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
#ifndef TOKENIZERS_H | ||
#define TOKENIZERS_H | ||
|
||
// Expands to two statements that register the word tokenizer and the porter
// tokenizer. The expansion site must have `db` and `errMsg` in scope and must
// be able to see the two init functions by their unqualified names. The return
// codes of both calls are discarded by the expansion.
#define TOKENIZER_LIST                                                         \
  opsqlite_wordtokenizer_init(db, &errMsg, nullptr);                           \
  opsqlite_porter_init(db, &errMsg, nullptr);
|
||
#include "sqlite3.h" | ||
|
||
namespace opsqlite { | ||
|
||
int opsqlite_wordtokenizer_init(sqlite3 *db, char **error, sqlite3_api_routines const *api); | ||
int opsqlite_porter_init(sqlite3 *db, char **error, sqlite3_api_routines const *api); | ||
|
||
} // namespace opsqlite | ||
|
||
#endif // TOKENIZERS_H |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.