diff options
author | Michal Hruby <michal.mhr@gmail.com> | 2012-03-12 15:22:16 +0100 |
---|---|---|
committer | Michal Hruby <michal.mhr@gmail.com> | 2012-03-12 15:22:16 +0100 |
commit | 3bcebed430967549337b9ab6ff6e14a3b960cd24 (patch) | |
tree | 7ea02294dd0f396f5fc6760527bedf7400a9e6a4 | |
parent | 2a254ebb54bbec022447495b361c7013e8bf5b07 (diff) |
Index only recognized uri schemes
-rw-r--r-- | extensions/fts++/indexer.cpp | 46 | ||||
-rw-r--r-- | extensions/fts++/indexer.h | 6 |
2 files changed, 35 insertions, 17 deletions
diff --git a/extensions/fts++/indexer.cpp b/extensions/fts++/indexer.cpp index bab4bbea..e7d0c6bf 100644 --- a/extensions/fts++/indexer.cpp +++ b/extensions/fts++/indexer.cpp @@ -106,9 +106,17 @@ void Indexer::Initialize (GError **error) g_assert (g_checksum_type_get_length (G_CHECKSUM_MD5) == HASH_LENGTH); this->checksum = g_checksum_new (G_CHECKSUM_MD5); - if (!this->checksum) - g_critical ("GChecksum initialization failed."); + if (!this->checksum) g_critical ("GChecksum initialization failed."); + GError *error = NULL; + /* we need to be careful with what we log, for example ubuntuone logs its + * weird uids and that screws up the index */ + this->uri_schemes_regex = g_regex_new ( + "(file|http[s]?|[s]?ftp|ssh|smb|dav[s]?|application)$", G_REGEX_OPTIMIZE, + (GRegexMatchFlags) 0, &error); + + if (error) + g_critical ("Unable to initialize uri scheme regex: %s", error->message); } catch (const Xapian::Error &xp_error) { @@ -399,7 +407,7 @@ void Indexer::IndexText (std::string const& text) tokenizer->index_text (StringUtils::AsciiFold (text), 5); } -void Indexer::IndexUri (std::string const& uri, std::string const& origin) +bool Indexer::IndexUri (std::string const& uri, std::string const& origin) { GFile *f = g_file_new_for_uri (uri.c_str ()); @@ -407,12 +415,21 @@ void Indexer::IndexUri (std::string const& uri, std::string const& origin) if (scheme == NULL) { g_warning ("Invalid URI: %s", uri.c_str ()); - return; + g_object_unref (f); + return false; } std::string scheme_str(scheme); g_free (scheme); + // do we support this scheme? + if (!g_regex_match (uri_schemes_regex, scheme_str.c_str (), + (GRegexMatchFlags) 0, NULL)) + { + g_object_unref (f); + return false; + } + if (scheme_str == "file") { // FIXME: special case some typical filenames (like photos) @@ -462,7 +479,7 @@ void Indexer::IndexUri (std::string const& uri, std::string const& origin) weight_index < G_N_ELEMENTS (path_weights)) { // if this is already home directory we don't want it - if (path_component == home_dir_path) return; + if (path_component == home_dir_path) break; gchar *name = g_path_get_basename (path_component.c_str ()); @@ -481,10 +498,11 @@ void Indexer::IndexUri (std::string const& uri, std::string const& origin) // mailto:username@server.com size_t scheme_len = scheme_str.length () + 1; size_t at_pos = uri.find ('@', scheme_len); - if (at_pos == std::string::npos) return; - - tokenizer->index_text (uri.substr (scheme_len, at_pos - scheme_len), 5); - tokenizer->index_text (uri.substr (at_pos + 1), 1); + if (at_pos != std::string::npos) + { + tokenizer->index_text (uri.substr (scheme_len, at_pos - scheme_len), 5); + tokenizer->index_text (uri.substr (at_pos + 1), 1); + } } else if (scheme_str.compare (0, 4, "http") == 0) { @@ -578,6 +596,8 @@ void Indexer::IndexUri (std::string const& uri, std::string const& origin) } g_object_unref (f); + + return true; } bool Indexer::IndexActor (std::string const& actor, bool is_subject) @@ -1035,15 +1055,11 @@ void Indexer::IndexEvent (ZeitgeistEvent *event) if (!IndexActor (uri, true)) IndexUri (uri, origin); } - else if (uri.compare (0, 10, "ubuntuone:") == 0) + else if (!IndexUri (uri, origin)) { - // U1 logs its uids, we don't want to index those + // unsupported uri scheme return; } - else - { - IndexUri (uri, origin); - } } AddDocFilters (event, doc); diff --git a/extensions/fts++/indexer.h b/extensions/fts++/indexer.h index f1cc2f11..1fbbb57d 100644 --- a/extensions/fts++/indexer.h +++ b/extensions/fts++/indexer.h @@ -57,6 +57,7 @@ public: if (query_parser) delete query_parser; if (db) delete db; if (checksum) g_checksum_free (checksum); + if (uri_schemes_regex) g_regex_unref (uri_schemes_regex); for (AppInfoMap::iterator it = app_info_cache.begin (); it != app_info_cache.end (); ++it) @@ -111,7 +112,7 @@ private: void AddDocFilters (ZeitgeistEvent *event, Xapian::Document &doc); void IndexText (std::string const& text); - void IndexUri (std::string const& uri, std::string const& origin); + bool IndexUri (std::string const& uri, std::string const& origin); bool IndexActor (std::string const& actor, bool is_subject); gboolean ClearFailedLookupsCb (); @@ -123,7 +124,8 @@ private: Xapian::TermGenerator *tokenizer; AppInfoMap app_info_cache; ApplicationSet failed_lookups; - GChecksum *checksum; + GChecksum *checksum; + GRegex *uri_schemes_regex; guint clear_failed_id; std::string home_dir_path; |