summaryrefslogtreecommitdiff
path: root/i18npool
diff options
context:
space:
mode:
author Mike Kaganski <mike.kaganski@collabora.com> 2019-12-18 15:23:08 +0300
committer Mike Kaganski <mike.kaganski@collabora.com> 2019-12-19 05:29:55 +0100
commit eb973a46ba0e34026db2d2929f2aa10505623872 (patch)
tree 79dbe573816d9984ee18eea5bdbaf72420f6ab1c /i18npool
parent 9ed0afc79b7da20d08153c2c3e302b061292358f (diff)
tdf#75806: use actual string leading characters for correct precondition match
Having an arbitrary prepended character prevents e.g. matching the start of a word for search continuation. Trimming the string to the start of the search breaks correct look-behind assertion matching (e.g. for regexes like `(?<!abc)abc`). As Michael Stahl suggested, we should use actual preceding characters instead of the arbitrary prefix. Let's use up to 100 preceding characters in the hope that this would be fast enough, and yet cover 99.999% of useful assertions. When the search string does not start with a look-behind assertion, use up to 3 preceding characters (to account for UTF-16 surrogate pairs). Change-Id: Ie19238ac792116c1d52fb2454d3142e35b6ed379 Reviewed-on: https://gerrit.libreoffice.org/85382 Tested-by: Jenkins Reviewed-by: Mike Kaganski <mike.kaganski@collabora.com>
Diffstat (limited to 'i18npool')
-rw-r--r--i18npool/source/search/textsearch.cxx35
1 files changed, 18 insertions, 17 deletions
diff --git a/i18npool/source/search/textsearch.cxx b/i18npool/source/search/textsearch.cxx
index 4174c6cd1e86..b0361bee89a3 100644
--- a/i18npool/source/search/textsearch.cxx
+++ b/i18npool/source/search/textsearch.cxx
@@ -310,8 +310,22 @@ SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 sta
if ( xTranslit.is() )
{
// apply normal transliteration (1<->1, 1<->0)
- css::uno::Sequence<sal_Int32> offset(endPos - startPos);
- in_str = xTranslit->transliterate( searchStr, startPos, endPos - startPos, offset );
+
+ sal_Int32 nInStartPos = startPos;
+ if (pRegexMatcher && startPos > 0)
+ {
+ // tdf#89665, tdf#75806: An optimization to avoid transliterating the whole string, yet
+ // transliterate enough of the leading text to allow sensible look-behind assertions.
+ // 100 is chosen arbitrarily in the hope that look-behind assertions would largely fit.
+ // See http://userguide.icu-project.org/strings/regexp for look-behind assertion syntax.
+ // When search regex doesn't start with an assertion, 3 is to allow startPos to be in
+ // the middle of a surrogate pair, preceded by another surrogate pair.
+ const sal_Int32 nMaxLeadingLen = aSrchPara.searchString.startsWith("(?") ? 100 : 3;
+ nInStartPos -= std::min(nMaxLeadingLen, startPos);
+ }
+
+ css::uno::Sequence<sal_Int32> offset(endPos - nInStartPos);
+ in_str = xTranslit->transliterate( searchStr, nInStartPos, endPos - nInStartPos, offset );
// JP 20.6.2001: also the start and end positions must be corrected!
sal_Int32 newStartPos =
@@ -321,19 +335,6 @@ SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 sta
? FindPosInSeq_Impl( offset, endPos )
: in_str.getLength();
- sal_Int32 nExtraOffset = 0;
- if (pRegexMatcher && startPos > 0)
- {
- // avoid matching ^ here - in_str omits a prefix of the searchStr
- // this is a really lame way to do it, but ICU only offers
- // useAnchoringBounds() to disable *both* bounds but what is needed
- // here is to disable only one bound and respect the other
- in_str = "X" + in_str;
- nExtraOffset = 1;
- newStartPos += nExtraOffset;
- newEndPos += nExtraOffset;
- }
-
sres = (this->*fnForward)( in_str, newStartPos, newEndPos );
// Map offsets back to untransliterated string.
@@ -345,7 +346,7 @@ SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 sta
const sal_Int32 nGroups = sres.startOffset.getLength();
for ( sal_Int32 k = 0; k < nGroups; k++ )
{
- const sal_Int32 nStart = sres.startOffset[k] - nExtraOffset;
+ const sal_Int32 nStart = sres.startOffset[k];
// Result offsets are negative (-1) if a group expression was
// not matched.
if (nStart >= 0)
@@ -354,7 +355,7 @@ SearchResult TextSearch::searchForward( const OUString& searchStr, sal_Int32 sta
// the position of the next character - return the
// next position behind the last found character!
// "a b c" find "b" must return 2,3 and not 2,4!!!
- const sal_Int32 nStop = sres.endOffset[k] - nExtraOffset;
+ const sal_Int32 nStop = sres.endOffset[k];
if (nStop >= 0)
{
if (nStop > 0)