Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
55da92e
New unittest Suggestion.autoCompletionAndSpellingCorrection
veloman-yunkan May 15, 2025
5d52188
Limit on suggestion count in EXPECT_SUGGESTION_RESULTS()
veloman-yunkan May 16, 2025
391ca59
Added result count limit to SuggestionSearcher::suggest()
veloman-yunkan May 16, 2025
4b1728e
Made getAutocompletionResults() stub work as intended
veloman-yunkan May 16, 2025
8472ffb
SuggestionSearch::getSmartSuggestions()
veloman-yunkan May 26, 2025
1d683e6
"Implemented" SuggestionSearch::getAutocompletionSuggestions()
veloman-yunkan May 26, 2025
bb70480
Richer Suggestion.autoCompletionAndSpellingCorrection unit-test
veloman-yunkan May 26, 2025
f9196d5
Autocompletion handles multiword queries
veloman-yunkan May 26, 2025
3598328
Implemented getTermCompletions()
veloman-yunkan May 26, 2025
30a463b
Renamed a unit-test
veloman-yunkan May 29, 2025
1f1c2e0
New unit-test Suggestion.autocompletionSuggestions
veloman-yunkan May 29, 2025
a2d21e5
No more fake entries in autocompletion suggestions
veloman-yunkan May 29, 2025
347e925
Made the autocompletion query case-insensitive
veloman-yunkan May 29, 2025
f35b542
Completions require a prefix of at least two letters
veloman-yunkan May 31, 2025
6cebde6
Enter SuggestionSearch::getSpellingSuggestions()
veloman-yunkan May 30, 2025
b455a6e
Proof-of-concept implementation of SpellingsDB
veloman-yunkan May 30, 2025
a94111a
Spelling corrections are included in smart suggestions
veloman-yunkan May 30, 2025
966b2e5
SuggestionDataBase::getAllSuggestionTerms()
veloman-yunkan May 31, 2025
2da49eb
SuggestionDataBase::getSpellingCorrections()
veloman-yunkan Jun 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 53 additions & 4 deletions include/zim/suggestion.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ class LIBZIM_API SuggestionSearcher
* The search is made on the archive under the SuggestionSearcher.
*
* @param query The SuggestionQuery to search.
*
* If the estimated count of title suggestions would exceed the specified
* limit, autocompletion suggestions are returned instead.
*/
SuggestionSearch suggest(const std::string& query);

Expand All @@ -97,12 +100,18 @@ class LIBZIM_API SuggestionSearcher
*/
class LIBZIM_API SuggestionSearch
{
public:
public: // types
typedef std::vector<SuggestionItem> Results;

public: // functions
SuggestionSearch(SuggestionSearch&& s);
SuggestionSearch& operator=(SuggestionSearch&& s);
~SuggestionSearch();

/** Get a set of results for this search.
*
* Returns a subset of title suggestions for the requested range from
* the full set of results.
*
* @param start The beginning of the range to get
* (offset of the first result).
Expand All @@ -111,14 +120,54 @@ class LIBZIM_API SuggestionSearch
*/
const SuggestionResultSet getResults(int start, int maxResults) const;

/** Get the number of estimated results for this suggestion search.
/** Get auto-completion suggestions for this search.
*
* Returns auto-completion suggestions for the word preceding the text
* edit location. In the current implementation, the text edit location
* is assumed to be at the end of the query string provided to the
* SuggestionSearch::suggest() method. In the future the text edit
* location will be indicated by a special code-point (e.g.
* carriage-return, form-feed or soft-hyphen) in the query string.
*
* @param maxCount The maximum number of results to return.
*/
Results getAutocompletionSuggestions(uint32_t maxCount) const;

/** Get spelling correction suggestions for this search.
*
* Returns spelling correction suggestions for the word containing the
* text edit location. In the current implementation, the text edit
* location is assumed to be at the end of the query string provided to
* the SuggestionSearch::suggest() method. In the future the text edit
* location will be indicated by a special code-point (e.g.
* carriage-return, form-feed or soft-hyphen) in the query string.
*
* @param maxCount The maximum number of results to return.
*/
Results getSpellingSuggestions(uint32_t maxCount) const;

/** Get the top suggestions, up to the specified maximum count.
*
* This method returns the best list of suggestions (title suggestions,
* auto-completion or spelling correction suggestions of the word at
* the edit location or a sorted mix thereof) fitting within the
* specified limit.
*
* @param maxCount The maximum number of results to return.
*/
Results getSmartSuggestions(uint32_t maxCount) const;

/** Get the estimated count of title matches for this suggestion search.
*
* As the name suggest, it is a estimation of the number of results.
* As the name suggests, it is an estimation of the number of results.
* As a member of the initial API, the name of this method conceals
* the fact that only title suggestions are covered by it.
*/
int getEstimatedMatches() const;

private: // methods
SuggestionSearch(std::shared_ptr<SuggestionDataBase> p_internalDb, const std::string& query);
SuggestionSearch(std::shared_ptr<SuggestionDataBase> p_internalDb,
const std::string& query);

private: // data
std::shared_ptr<SuggestionDataBase> mp_internalDb;
Expand Down
255 changes: 252 additions & 3 deletions src/suggestion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
#include "suggestion_internal.h"
#include "fileimpl.h"
#include "tools.h"
#include "fs_unix.h"
#include "constants.h"

#if defined(ENABLE_XAPIAN)
Expand Down Expand Up @@ -198,9 +199,10 @@ void SuggestionSearcher::initDatabase()
mp_internalDb = std::make_shared<SuggestionDataBase>(m_archive, m_verbose);
}

SuggestionSearch::SuggestionSearch(std::shared_ptr<SuggestionDataBase> p_internalDb, const std::string& query)
: mp_internalDb(p_internalDb),
m_query(query)
SuggestionSearch::SuggestionSearch(std::shared_ptr<SuggestionDataBase> p_internalDb,
const std::string& query)
: mp_internalDb(p_internalDb)
, m_query(query)
#if defined(ENABLE_XAPIAN)
, mp_enquire(nullptr)
#endif // ENABLE_XAPIAN
Expand Down Expand Up @@ -252,6 +254,253 @@ const SuggestionResultSet SuggestionSearch::getResults(int start, int maxResults
return SuggestionResultSet(entryRange);
}

namespace
{

// Breaks a suggestion query into the leading text that is kept verbatim and
// the last (space-delimited) word, which is the one subject to completion
// or spelling correction.
class QueryInfo
{
public:
  // XXX: assuming that the query edit location (caret position) is at the end
  explicit QueryInfo(const std::string& query)
    : m_queryPrefix(query.substr(0, lastWordOffset(query)))
    , m_wordToComplete(query.substr(lastWordOffset(query)))
    , m_wordBeingEdited(m_wordToComplete)
    , m_querySuffix()
  {
  }

  // The word that auto-completion should extend.
  const std::string& wordToComplete() const { return m_wordToComplete; }

  // The word that spelling correction should replace.
  const std::string& wordBeingEdited() const { return m_wordBeingEdited; }

  // Builds the full suggestion text with the completed word marked in bold.
  std::string autocompletionSuggestion(const std::string& completedWord) const {
    return withHighlightedWord(completedWord);
  }

  // Builds the full suggestion text with the corrected word marked in bold.
  std::string spellingSuggestion(const std::string& correctedWord) const {
    return withHighlightedWord(correctedWord);
  }

private:
  // Offset of the first character following the last space in the text,
  // or 0 if the text contains no space at all.
  static std::string::size_type lastWordOffset(const std::string& text) {
    const std::string::size_type spacePos = text.rfind(' ');
    return spacePos == std::string::npos ? 0 : spacePos + 1;
  }

  // Re-assembles the query around the replacement word wrapped in <b> tags.
  std::string withHighlightedWord(const std::string& replacement) const {
    std::string suggestion(m_queryPrefix);
    suggestion += "<b>";
    suggestion += replacement;
    suggestion += "</b>";
    suggestion += m_querySuffix;
    return suggestion;
  }

private:
  std::string m_queryPrefix;
  std::string m_wordToComplete;
  std::string m_wordBeingEdited;
  std::string m_querySuffix;
};

} // unnamed namespace

namespace suggestions
{

// SpellingsDB is compiled only when Xapian support is available and the
// target is not Windows (its implementation in this file relies on
// mkdtemp() and unix::FS).
#if defined(LIBZIM_WITH_XAPIAN) && ! defined(_WIN32)
#define ENABLE_SPELLINGSDB
#endif

#ifdef ENABLE_SPELLINGSDB
// Spelling dictionary backed by a writable Xapian database that lives in a
// temporary directory created for (and owned by) the object.
class SpellingsDB
{
public: // functions
  // Creates the backing database and registers every term as a spelling.
  explicit SpellingsDB(const TermCollection& terms);

  // Not copyable: the object owns its temporary directory.
  SpellingsDB(const SpellingsDB& ) = delete;
  void operator=(const SpellingsDB& ) = delete;

  ~SpellingsDB();

  // Returns at most maxCount spelling corrections for the given word.
  std::vector<std::string> getSpellingCorrections(const std::string& word,
                                                  uint32_t maxCount) const;

private: // functions
  // Creates a unique temporary directory and returns its path.
  static std::string createTempDir();

private: // data
  // NOTE: tmpDirPath_ must stay declared before impl_, since the database
  // path is derived from it in the constructor's initializer list.
  const std::string tmpDirPath_;

  // mutable, because looking up several corrections temporarily modifies
  // the database (see getSpellingCorrections()).
  mutable Xapian::WritableDatabase impl_;
};

// Creates a uniquely named temporary directory for the spellings database
// and returns its path.
//
// /dev/shm is tried first so that the database is kept in memory-backed
// storage where that location exists. Since /dev/shm is Linux-specific while
// SpellingsDB is enabled on every non-Windows platform, /tmp is used as a
// fallback instead of failing outright.
//
// Throws std::runtime_error if the directory cannot be created under any of
// the candidate locations.
std::string SpellingsDB::createTempDir()
{
  const char* const candidateParentDirs[] = { "/dev/shm", "/tmp" };
  for (const char* parentDir : candidateParentDirs) {
    std::string tmpDirPath = std::string(parentDir) + "/libzimspellingdb.XXXXXX";
    // mkdtemp() replaces the trailing XXXXXX in place; the buffer of a
    // non-empty std::string is writable and null-terminated.
    if ( mkdtemp(&tmpDirPath[0]) ) {
      return tmpDirPath;
    }
  }
  throw std::runtime_error("SpellingsDB: mkdtemp() failed");
}

// Creates a glass-backend Xapian database inside a fresh temporary directory
// and registers each of the supplied terms as a known spelling.
SpellingsDB::SpellingsDB(const TermCollection& terms)
  : tmpDirPath_(createTempDir())
  , impl_(tmpDirPath_ + "/spellingdb.xapian", Xapian::DB_BACKEND_GLASS)
{
  for (auto termIt = terms.begin(); termIt != terms.end(); ++termIt) {
    impl_.add_spelling(termIt->term);
  }
}

// Deletes the temporary directory that was created for this object by
// createTempDir().
// NOTE(review): this runs before impl_ is destroyed, i.e. while the database
// is still open — presumably fine on the POSIX platforms this class is
// limited to; confirm.
SpellingsDB::~SpellingsDB()
{
unix::FS::remove(tmpDirPath_);
}

// Returns up to maxCount spelling corrections for the given word, in the
// order in which the database proposes them.
//
// The database yields a single suggestion per query, so every obtained
// suggestion is taken out of the database before asking again; once done,
// all of them are put back.
std::vector<std::string> SpellingsDB::getSpellingCorrections(const std::string& word, uint32_t maxCount) const
{
  std::vector<std::string> corrections;

  for (;;) {
    if ( corrections.size() >= maxCount )
      break;

    const std::string candidate = impl_.get_spelling_suggestion(word);
    if ( candidate.empty() )
      break;

    corrections.push_back(candidate);

    // hide this candidate so that the next query yields a different one
    impl_.remove_spelling(candidate);
  }

  // bring the hidden candidates back
  for (auto it = corrections.begin(); it != corrections.end(); ++it) {
    impl_.add_spelling(*it);
  }

  return corrections;
}
#endif // ENABLE_SPELLINGSDB

} // namespace suggestions

// Defaulted destructor, defined out of line in this translation unit.
// NOTE(review): presumably placed here (rather than in the header) so that
// members whose types are complete only in this file, such as SpellingsDB,
// can be destroyed — confirm against the class declaration.
SuggestionDataBase::~SuggestionDataBase() = default;

namespace
{

using namespace suggestions;

// Tells whether c is an ASCII upper-case letter ('A'..'Z'). Terms starting
// with such a character are treated as prefixed Xapian terms rather than
// plain words.
bool isXapianTermPrefix(unsigned char c) {
  return 'A' <= c && c <= 'Z';
}

// Decides whether a term of the title database qualifies as an
// auto-completion candidate: it must be non-empty and must not begin with
// a Xapian term prefix character.
bool termShouldBeIncludedInAutoCompletions(const std::string& term) {
  // XXX: does it make sense to include non-words and stop words?
  if ( term.empty() ) {
    return false;
  }
  return !isXapianTermPrefix(term[0]);
}

// Collects all terms of the title database that qualify for auto-completion,
// each with its term frequency, and returns them sorted with
// TermWithFreq::dictionaryPred. Without Xapian support the result is empty.
TermCollection getAllTerms(const SuggestionDataBase& db) {
  TermCollection collectedTerms;

#ifdef LIBZIM_WITH_XAPIAN
  const Xapian::Database& titleDb = db.m_database;
  const auto termsEnd = titleDb.allterms_end();
  for (auto termIt = titleDb.allterms_begin(); termIt != termsEnd; ++termIt) {
    const std::string term = *termIt;
    if ( !termShouldBeIncludedInAutoCompletions(term) ) {
      continue;
    }
    collectedTerms.push_back(TermWithFreq{term, termIt.get_termfreq()});
  }
#endif // LIBZIM_WITH_XAPIAN

  std::sort(collectedTerms.begin(), collectedTerms.end(),
            TermWithFreq::dictionaryPred);
  return collectedTerms;
}

// Returns all suggestion terms of the database that start with termPrefix.
//
// The lookup relies on the suggestion terms being sorted with
// TermWithFreq::dictionaryPred: the first candidate is located via binary
// search and the matching terms form a contiguous run from there.
// An empty collection is returned if there is no database.
TermCollection getTermCompletions(const SuggestionDataBase& db,
                                  const std::string& termPrefix)
{
  TermCollection completions;

  if ( db.hasDatabase() ) {
    const TermCollection& sortedTerms = db.getAllSuggestionTerms();
    const auto runBegin = std::lower_bound(sortedTerms.begin(),
                                           sortedTerms.end(),
                                           TermWithFreq{termPrefix, 0},
                                           TermWithFreq::dictionaryPred);

    auto runEnd = runBegin;
    while ( runEnd != sortedTerms.end()
         && startsWith(runEnd->term, termPrefix) ) {
      ++runEnd;
    }

    completions.assign(runBegin, runEnd);
  }

  return completions;
}

} // unnamed namespace

// Returns at most maxCount spelling corrections for the given word.
//
// The spellings database is built from the suggestion terms upon first use
// (under m_spellingsDBMutex) and reused afterwards. An empty list is
// returned if SpellingsDB is not enabled in this build or if there is no
// database.
std::vector<std::string> SuggestionDataBase::getSpellingCorrections(
    const std::string& word,
    uint32_t maxCount) const
{
#ifndef ENABLE_SPELLINGSDB
  return {};
#else
  if ( !this->hasDatabase() ) {
    return {};
  }

  std::lock_guard<std::mutex> guard(m_spellingsDBMutex);
  if ( !m_spellingsDB ) {
    m_spellingsDB.reset(new SpellingsDB(this->getAllSuggestionTerms()));
  }
  return m_spellingsDB->getSpellingCorrections(word, maxCount);
#endif // ENABLE_SPELLINGSDB
}

// Returns the cached collection of suggestion terms, computing it with
// getAllTerms() (under m_suggestionTermsMutex) whenever the cache is empty.
const TermCollection& SuggestionDataBase::getAllSuggestionTerms() const
{
  std::lock_guard<std::mutex> guard(m_suggestionTermsMutex);

  const bool alreadyCollected = !m_suggestionTerms.empty();
  if ( alreadyCollected ) {
    return m_suggestionTerms;
  }

  m_suggestionTerms = getAllTerms(*this);
  return m_suggestionTerms;
}

// Returns at most maxCount auto-completion suggestions for the last word of
// the (accent-stripped) query. No completions are offered unless that word
// is longer than one byte.
//
// When there are more candidate terms than allowed, the first maxCount terms
// in TermWithFreq::freqPred order are retained (presumably the most frequent
// ones - confirm against TermWithFreq). The result is always presented in
// TermWithFreq::dictionaryPred order.
SuggestionSearch::Results SuggestionSearch::getAutocompletionSuggestions(uint32_t maxCount) const {
  const QueryInfo queryInfo(removeAccents(m_query));
  const std::string& wordPrefix = queryInfo.wordToComplete();

  SuggestionSearch::Results suggestions;
  if ( wordPrefix.size() <= 1 ) {
    return suggestions;
  }

  auto completions = getTermCompletions(*mp_internalDb, wordPrefix);
  if ( completions.size() > maxCount ) {
    std::sort(completions.begin(), completions.end(), TermWithFreq::freqPred);
    completions.resize(maxCount);
  }
  std::sort(completions.begin(), completions.end(), TermWithFreq::dictionaryPred);

  for (auto it = completions.begin(); it != completions.end(); ++it) {
    const std::string text = queryInfo.autocompletionSuggestion(it->term);
    suggestions.push_back(SuggestionItem("", "", text));
  }

  return suggestions;
}

// Returns at most maxCount spelling correction suggestions for the last word
// of the (accent-stripped) query. Nothing is returned if that word is empty.
SuggestionSearch::Results SuggestionSearch::getSpellingSuggestions(uint32_t maxCount) const {
  const QueryInfo queryInfo(removeAccents(m_query));
  const std::string& editedWord = queryInfo.wordBeingEdited();

  SuggestionSearch::Results suggestions;
  if ( editedWord.empty() ) {
    return suggestions;
  }

  const auto corrections = mp_internalDb->getSpellingCorrections(editedWord, maxCount);
  for (auto it = corrections.begin(); it != corrections.end(); ++it) {
    const std::string text = queryInfo.spellingSuggestion(*it);
    suggestions.push_back(SuggestionItem("", "", text));
  }

  return suggestions;
}
// Returns the most useful suggestions that fit within maxCount.
//
// If the estimated number of title matches is non-zero and does not exceed
// maxCount, the title suggestions are returned. Otherwise the result consists
// of auto-completion suggestions, followed by spelling corrections filling
// whatever room is left.
SuggestionSearch::Results SuggestionSearch::getSmartSuggestions(uint32_t maxCount) const {
  const uint32_t titleSuggestionCount = getEstimatedMatches();
  const bool titleSuggestionsFit = titleSuggestionCount != 0
                                && titleSuggestionCount <= maxCount;

  SuggestionSearch::Results suggestions;
  if ( titleSuggestionsFit ) {
    for ( const auto& titleSuggestion : getResults(0, maxCount) ) {
      suggestions.push_back(titleSuggestion);
    }
    return suggestions;
  }

  suggestions = getAutocompletionSuggestions(maxCount);
  if ( suggestions.size() < maxCount ) {
    const auto corrections = getSpellingSuggestions(maxCount - suggestions.size());
    suggestions.insert(suggestions.end(), corrections.begin(), corrections.end());
  }
  return suggestions;
}

const void SuggestionSearch::forceRangeSuggestion() {
#if defined(ENABLE_XAPIAN)
mp_internalDb->m_database.close();
Expand Down
Loading
Loading