diff options
author | Siegfried-Angel Gevatter Pujals <siegfried@gevatter.com> | 2012-03-05 21:40:24 +0100 |
---|---|---|
committer | Michal Hruby <michal.mhr@gmail.com> | 2012-03-05 21:40:24 +0100 |
commit | e4473a3aabb678c8b5a3ad6e2cef1950a0f414e2 (patch) | |
tree | a259f5c362b7fe8400e32d1e49c2fcc6946fb064 | |
parent | 9909a2f7e3a955a09c7b34f97b22ab36869c3f23 (diff) |
FTS++: Save hashes of URIs and use Xapian's collapse option to group by them
when querying with *_SUBJECTS result types.
-rw-r--r-- | extensions/fts++/indexer.cpp | 34 | ||||
-rw-r--r-- | extensions/fts++/indexer.h | 4 |
2 files changed, 37 insertions, 1 deletions
diff --git a/extensions/fts++/indexer.cpp b/extensions/fts++/indexer.cpp index fc7ba8da..cc4a8f72 100644 --- a/extensions/fts++/indexer.cpp +++ b/extensions/fts++/indexer.cpp @@ -23,6 +23,7 @@ #include <xapian.h> #include <queue> #include <vector> +#include <cassert> #include <gio/gio.h> #include <gio/gdesktopappinfo.h> @@ -42,6 +43,7 @@ const std::string FILTER_PREFIX_XDG_CATEGORY = "AC"; const Xapian::valueno VALUE_EVENT_ID = 0; const Xapian::valueno VALUE_TIMESTAMP = 1; +const Xapian::valueno VALUE_URI_HASH = 2; #define QUERY_PARSER_FLAGS \ Xapian::QueryParser::FLAG_PHRASE | Xapian::QueryParser::FLAG_BOOLEAN | \ @@ -101,6 +103,11 @@ void Indexer::Initialize (GError **error) this->query_parser->set_database (*this->db); this->enquire = new Xapian::Enquire (*this->db); + + assert (g_checksum_type_get_length (G_CHECKSUM_MD5) == 16); + this->checksum = g_checksum_new (G_CHECKSUM_MD5); + if (!this->checksum) + g_critical ("GChecksum initialization failed."); } catch (const Xapian::Error &xp_error) @@ -728,7 +735,11 @@ GPtrArray* Indexer::Search (const gchar *search, guint maxhits; if (result_type == RELEVANCY_RESULT_TYPE || result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS || - result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS) + result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS || + result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS || + result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS || + result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS || + result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS) { maxhits = count; } @@ -746,6 +757,14 @@ GPtrArray* Indexer::Search (const gchar *search, enquire->set_sort_by_value (VALUE_TIMESTAMP, true); } + if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS || + result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS || + result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS || + result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS) + { + 
enquire->set_collapse_key (VALUE_URI_HASH); + } + Xapian::Query q(query_parser->parse_query (query_string, QUERY_PARSER_FLAGS)); enquire->set_query (q); Xapian::MSet hits (enquire->get_mset (offset, maxhits)); @@ -989,6 +1008,19 @@ void Indexer::IndexEvent (ZeitgeistEvent *event) return; // ignore this event completely... } + // We need the subject URI so we can use Xapian's collapse key feature + // for *_SUBJECT grouping. However, to save space, we'll just save a hash. + // A better option would be using URI's id, but for that we'd need a SQL + // query that'd be subject to races. + // FIXME(?): This doesn't work for events with multiple subjects. + g_checksum_update (checksum, (guchar *) uri.c_str (), -1); + guint8 uri_hash[17]; + gsize hash_size = 16; + g_checksum_get_digest (checksum, uri_hash, &hash_size); + assert (hash_size == 16); + doc.add_value (VALUE_URI_HASH, std::string((char *) uri_hash, 16)); + g_checksum_reset (checksum); + val = zeitgeist_subject_get_text (subject); if (val && val[0] != '\0') { diff --git a/extensions/fts++/indexer.h b/extensions/fts++/indexer.h index 2d41c3ec..af64cfee 100644 --- a/extensions/fts++/indexer.h +++ b/extensions/fts++/indexer.h @@ -21,6 +21,7 @@ #define _ZGFTS_INDEXER_H_ #include <glib-object.h> +#include <glib/gchecksum.h> #include <gio/gio.h> #include <xapian.h> @@ -42,6 +43,7 @@ public: , query_parser (NULL) , enquire (NULL) , tokenizer (NULL) + , checksum (NULL) , clear_failed_id (0) { const gchar *home_dir = g_get_home_dir (); @@ -54,6 +56,7 @@ public: if (enquire) delete enquire; if (query_parser) delete query_parser; if (db) delete db; + if (checksum) { g_checksum_free (checksum); checksum = NULL; } for (AppInfoMap::iterator it = app_info_cache.begin (); it != app_info_cache.end (); ++it) @@ -120,6 +123,7 @@ private: Xapian::TermGenerator *tokenizer; AppInfoMap app_info_cache; ApplicationSet failed_lookups; + GChecksum *checksum; guint clear_failed_id; std::string home_dir_path; |