summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichal Hruby <michal.mhr@gmail.com>2012-03-12 15:22:16 +0100
committerMichal Hruby <michal.mhr@gmail.com>2012-03-12 15:22:16 +0100
commit3bcebed430967549337b9ab6ff6e14a3b960cd24 (patch)
tree7ea02294dd0f396f5fc6760527bedf7400a9e6a4
parent2a254ebb54bbec022447495b361c7013e8bf5b07 (diff)
Index only recognized uri schemes
-rw-r--r--extensions/fts++/indexer.cpp46
-rw-r--r--extensions/fts++/indexer.h6
2 files changed, 35 insertions, 17 deletions
diff --git a/extensions/fts++/indexer.cpp b/extensions/fts++/indexer.cpp
index bab4bbea..e7d0c6bf 100644
--- a/extensions/fts++/indexer.cpp
+++ b/extensions/fts++/indexer.cpp
@@ -106,9 +106,17 @@ void Indexer::Initialize (GError **error)
g_assert (g_checksum_type_get_length (G_CHECKSUM_MD5) == HASH_LENGTH);
this->checksum = g_checksum_new (G_CHECKSUM_MD5);
- if (!this->checksum)
- g_critical ("GChecksum initialization failed.");
+ if (!this->checksum) g_critical ("GChecksum initialization failed.");
+ GError *error = NULL;
+ /* we need to be careful with what we log, for example ubuntuone logs its
+ * weird uids and that screws up the index */
+ this->uri_schemes_regex = g_regex_new (
+ "(file|http[s]?|[s]?ftp|ssh|smb|dav[s]?|application)$", G_REGEX_OPTIMIZE,
+ (GRegexMatchFlags) 0, &error);
+
+ if (error)
+ g_critical ("Unable to initialize uri scheme regex: %s", error->message);
}
catch (const Xapian::Error &xp_error)
{
@@ -399,7 +407,7 @@ void Indexer::IndexText (std::string const& text)
tokenizer->index_text (StringUtils::AsciiFold (text), 5);
}
-void Indexer::IndexUri (std::string const& uri, std::string const& origin)
+bool Indexer::IndexUri (std::string const& uri, std::string const& origin)
{
GFile *f = g_file_new_for_uri (uri.c_str ());
@@ -407,12 +415,21 @@ void Indexer::IndexUri (std::string const& uri, std::string const& origin)
if (scheme == NULL)
{
g_warning ("Invalid URI: %s", uri.c_str ());
- return;
+ g_object_unref (f);
+ return false;
}
std::string scheme_str(scheme);
g_free (scheme);
+ // do we support this scheme?
+ if (!g_regex_match (uri_schemes_regex, scheme_str.c_str (),
+ (GRegexMatchFlags) 0, NULL))
+ {
+ g_object_unref (f);
+ return false;
+ }
+
if (scheme_str == "file")
{
// FIXME: special case some typical filenames (like photos)
@@ -462,7 +479,7 @@ void Indexer::IndexUri (std::string const& uri, std::string const& origin)
weight_index < G_N_ELEMENTS (path_weights))
{
// if this is already home directory we don't want it
- if (path_component == home_dir_path) return;
+ if (path_component == home_dir_path) break;
gchar *name = g_path_get_basename (path_component.c_str ());
@@ -481,10 +498,11 @@ void Indexer::IndexUri (std::string const& uri, std::string const& origin)
// mailto:username@server.com
size_t scheme_len = scheme_str.length () + 1;
size_t at_pos = uri.find ('@', scheme_len);
- if (at_pos == std::string::npos) return;
-
- tokenizer->index_text (uri.substr (scheme_len, at_pos - scheme_len), 5);
- tokenizer->index_text (uri.substr (at_pos + 1), 1);
+ if (at_pos != std::string::npos)
+ {
+ tokenizer->index_text (uri.substr (scheme_len, at_pos - scheme_len), 5);
+ tokenizer->index_text (uri.substr (at_pos + 1), 1);
+ }
}
else if (scheme_str.compare (0, 4, "http") == 0)
{
@@ -578,6 +596,8 @@ void Indexer::IndexUri (std::string const& uri, std::string const& origin)
}
g_object_unref (f);
+
+ return true;
}
bool Indexer::IndexActor (std::string const& actor, bool is_subject)
@@ -1035,15 +1055,11 @@ void Indexer::IndexEvent (ZeitgeistEvent *event)
if (!IndexActor (uri, true))
IndexUri (uri, origin);
}
- else if (uri.compare (0, 10, "ubuntuone:") == 0)
+ else if (!IndexUri (uri, origin))
{
- // U1 logs its uids, we don't want to index those
+ // unsupported uri scheme
return;
}
- else
- {
- IndexUri (uri, origin);
- }
}
AddDocFilters (event, doc);
diff --git a/extensions/fts++/indexer.h b/extensions/fts++/indexer.h
index f1cc2f11..1fbbb57d 100644
--- a/extensions/fts++/indexer.h
+++ b/extensions/fts++/indexer.h
@@ -57,6 +57,7 @@ public:
if (query_parser) delete query_parser;
if (db) delete db;
if (checksum) g_checksum_free (checksum);
+ if (uri_schemes_regex) g_regex_unref (uri_schemes_regex);
for (AppInfoMap::iterator it = app_info_cache.begin ();
it != app_info_cache.end (); ++it)
@@ -111,7 +112,7 @@ private:
void AddDocFilters (ZeitgeistEvent *event, Xapian::Document &doc);
void IndexText (std::string const& text);
- void IndexUri (std::string const& uri, std::string const& origin);
+ bool IndexUri (std::string const& uri, std::string const& origin);
bool IndexActor (std::string const& actor, bool is_subject);
gboolean ClearFailedLookupsCb ();
@@ -123,7 +124,8 @@ private:
Xapian::TermGenerator *tokenizer;
AppInfoMap app_info_cache;
ApplicationSet failed_lookups;
- GChecksum *checksum;
+ GChecksum *checksum;
+ GRegex *uri_schemes_regex;
guint clear_failed_id;
std::string home_dir_path;