summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Siegfried-Angel Gevatter Pujals <siegfried@gevatter.com> 2012-03-05 21:40:24 +0100
committer: Michal Hruby <michal.mhr@gmail.com> 2012-03-05 21:40:24 +0100
commit: e4473a3aabb678c8b5a3ad6e2cef1950a0f414e2 (patch)
tree: a259f5c362b7fe8400e32d1e49c2fcc6946fb064
parent: 9909a2f7e3a955a09c7b34f97b22ab36869c3f23 (diff)
FTS++: Save hashes of URIs and use Xapian's collapse option to group by them
when querying with *_SUBJECT result types.
-rw-r--r-- extensions/fts++/indexer.cpp 34
-rw-r--r-- extensions/fts++/indexer.h 4
2 files changed, 37 insertions, 1 deletions
diff --git a/extensions/fts++/indexer.cpp b/extensions/fts++/indexer.cpp
index fc7ba8da..cc4a8f72 100644
--- a/extensions/fts++/indexer.cpp
+++ b/extensions/fts++/indexer.cpp
@@ -23,6 +23,7 @@
#include <xapian.h>
#include <queue>
#include <vector>
+#include <cassert>
#include <gio/gio.h>
#include <gio/gdesktopappinfo.h>
@@ -42,6 +43,7 @@ const std::string FILTER_PREFIX_XDG_CATEGORY = "AC";
const Xapian::valueno VALUE_EVENT_ID = 0;
const Xapian::valueno VALUE_TIMESTAMP = 1;
+const Xapian::valueno VALUE_URI_HASH = 2;
#define QUERY_PARSER_FLAGS \
Xapian::QueryParser::FLAG_PHRASE | Xapian::QueryParser::FLAG_BOOLEAN | \
@@ -101,6 +103,11 @@ void Indexer::Initialize (GError **error)
this->query_parser->set_database (*this->db);
this->enquire = new Xapian::Enquire (*this->db);
+
+ assert (g_checksum_type_get_length (G_CHECKSUM_MD5) == 16);
+ this->checksum = g_checksum_new (G_CHECKSUM_MD5);
+ if (!this->checksum)
+ g_critical ("GChecksum initialization failed.");
}
catch (const Xapian::Error &xp_error)
@@ -728,7 +735,11 @@ GPtrArray* Indexer::Search (const gchar *search,
guint maxhits;
if (result_type == RELEVANCY_RESULT_TYPE ||
result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_EVENTS ||
- result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS)
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_EVENTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS)
{
maxhits = count;
}
@@ -746,6 +757,14 @@ GPtrArray* Indexer::Search (const gchar *search,
enquire->set_sort_by_value (VALUE_TIMESTAMP, true);
}
+ if (result_type == ZEITGEIST_RESULT_TYPE_MOST_RECENT_SUBJECTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_RECENT_SUBJECTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_MOST_POPULAR_SUBJECTS ||
+ result_type == ZEITGEIST_RESULT_TYPE_LEAST_POPULAR_SUBJECTS)
+ {
+ enquire->set_collapse_key (VALUE_URI_HASH);
+ }
+
Xapian::Query q(query_parser->parse_query (query_string, QUERY_PARSER_FLAGS));
enquire->set_query (q);
Xapian::MSet hits (enquire->get_mset (offset, maxhits));
@@ -989,6 +1008,19 @@ void Indexer::IndexEvent (ZeitgeistEvent *event)
return; // ignore this event completely...
}
+ // We need the subject URI so we can use Xapian's collapse key feature
+ // for *_SUBJECT grouping. However, to save space, we'll just save a hash.
+ // A better option would be using URI's id, but for that we'd need a SQL
+ // query that'd be subject to races.
+ // FIXME(?): This doesn't work for events with multiple subjects.
+ g_checksum_update (checksum, (guchar *) uri.c_str (), -1);
+ guint8 uri_hash[17];
+ gsize hash_size = 16;
+ g_checksum_get_digest (checksum, uri_hash, &hash_size);
+ assert (hash_size == 16);
+ doc.add_value (VALUE_URI_HASH, std::string((char *) uri_hash, 16));
+ g_checksum_reset (checksum);
+
val = zeitgeist_subject_get_text (subject);
if (val && val[0] != '\0')
{
diff --git a/extensions/fts++/indexer.h b/extensions/fts++/indexer.h
index 2d41c3ec..af64cfee 100644
--- a/extensions/fts++/indexer.h
+++ b/extensions/fts++/indexer.h
@@ -21,6 +21,7 @@
#define _ZGFTS_INDEXER_H_
#include <glib-object.h>
+#include <glib/gchecksum.h>
#include <gio/gio.h>
#include <xapian.h>
@@ -42,6 +43,7 @@ public:
, query_parser (NULL)
, enquire (NULL)
, tokenizer (NULL)
+ , checksum (NULL)
, clear_failed_id (0)
{
const gchar *home_dir = g_get_home_dir ();
@@ -54,6 +56,7 @@ public:
if (enquire) delete enquire;
if (query_parser) delete query_parser;
if (db) delete db;
+ if (checksum) { g_checksum_free (checksum); checksum = NULL; }
for (AppInfoMap::iterator it = app_info_cache.begin ();
it != app_info_cache.end (); ++it)
@@ -120,6 +123,7 @@ private:
Xapian::TermGenerator *tokenizer;
AppInfoMap app_info_cache;
ApplicationSet failed_lookups;
+ GChecksum *checksum;
guint clear_failed_id;
std::string home_dir_path;