Commit cbfe0b77 authored by Ard Schrijvers's avatar Ard Schrijvers

ENT-244 add unit tests for query parsing, copied improved logic from hst query...

ENT-244 add unit tests for query parsing, copied improved logic from hst query parsing to hst mode parsing, and also added a small improvement for cms searches regarding the - sign. Exclamation marks where already stripped
parent e1af5efe
......@@ -2,4 +2,5 @@
/LICENSE -text
/NOTICE -text
jcr/src/main/resources/hipposearch.cnd -text
jcr/src/test/java/org/onehippo/cms7/services/search/jcr/query/TestFullTextSearchParser.java -text svneol=unset#text/plain
jcr/src/test/resources/testproject-types.cnd -text
......@@ -31,6 +31,8 @@ public final class FullTextSearchParser {
private static final String DEFAULT_IGNORED_CHARS = "&|!(){}[]^\"~*?:\\";
private static final char MINUS_SIGN = '-';
private final static String ignoredChars = DEFAULT_IGNORED_CHARS;
private final static int minimalLength = 3;
......@@ -81,7 +83,19 @@ public final class FullTextSearchParser {
if (c == '\'') {
tb.append('\\');
}
tb.append(c);
if (c == MINUS_SIGN) {
// we do not allowe minus sign followed by a space or ignored char
if (token.length() > i + 1) {
char nextChar = token.charAt(i + 1);
if (nextChar == ' ' || ignoredChars.indexOf(nextChar) > -1) {
// not allowed position for -
} else {
tb.append(c);
}
}
} else {
tb.append(c);
}
}
}
if (tb.length() == 0) {
......@@ -141,17 +155,17 @@ public final class FullTextSearchParser {
public static int getMinimalLength() {
return minimalLength;
}
/**
* <p>
* Removes invalid chars, escapes some chars. If <code>allowSingleNonLeadingWildCard</code> is <code>true</code>, there
* is one single non leading <code>*</code> or <code>?</code> allowed. Note, that this wildcard is not allowed to be
* Removes invalid chars, escapes some chars. If <code>allowSingleNonLeadingWildCard</code> is <code>true</code>, there
* is one single non leading <code>*</code> or <code>?</code> allowed. Note, that this wildcard is not allowed to be
* leading of a new word.
* </p>
* <p>
* Recommended is to remove all wildcards
* </p>
* </p>
* @param input
* @param allowSingleNonLeadingWildCardPerTerm
* @return formatted version of <code>input</code>
......@@ -160,60 +174,88 @@ public final class FullTextSearchParser {
if(input == null) {
throw new IllegalArgumentException("Input is not allowed to be null");
}
StringBuilder sb = new StringBuilder();
StringBuffer sb = new StringBuffer();
boolean allowWildCardInCurrentTerm = allowSingleNonLeadingWildCardPerTerm;
boolean prevCharIsSpecialOrRemoved = false;
for (int i = 0; i < input.length(); i++) {
char c = input.charAt(i);
// Some of these characters break the jcr query and others like * and ? have a very negative impact
// on performance.
if ( c == '(' || c == ')' || c == '^' || c == '[' || c == ']' || c == '{'
|| c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&') {
// ~ is used for synonyms search: This is jackrabbit specific and different than lucene fuzzy
// see http://wiki.apache.org/jackrabbit/SynonymSearch. It is only allowed to be the first char
// of a word
if(c == '~') {
if(sb.length() == 0) {
sb.append(c);
} else {
char prevChar = sb.charAt(sb.length() - 1);
if(prevChar == ' ') {
sb.append(c);
}
// else we remove the ~
}
} else if(sb.length() > 0) {
// if one wildcard is allowed, it will be added but never as leading
if(c == '*' || c == '?') {
if(allowWildCardInCurrentTerm) {
// check if prev char is not a space or " or '
// i must be > 0 here
char prevChar = sb.charAt(sb.length() -1);
if(!(prevChar == '\"' || prevChar == '\'' || prevChar == ' ')) {
sb.append(c);
allowWildCardInCurrentTerm = false;
}
}
}
}
} else if (c == '\"') {
sb.append('\\');
sb.append(c);
} else if (c == '\'') {
// we strip ' because jackrabbit xpath builder breaks on \' (however it should be possible according spec)
} else if (c == ' ') {
// next term. set allowWildCardInCurrentTerm again to allowSingleNonLeadingWildCardPerTerm
allowWildCardInCurrentTerm = allowSingleNonLeadingWildCardPerTerm;
sb.append(c);
} else {
sb.append(c);
}
char c = input.charAt(i);
// Some of these characters break the jcr query and others like * and ? have a very negative impact
// on performance.
if (isSpecialChar(c)) {
if (c == '\"') {
sb.append('\\');
sb.append(c);
} else if (c == '\'') {
// we strip ' because jackrabbit xpath builder breaks on \' (however it should be possible according spec)
} else if (c == ' ') {
// next term. set allowWildCardInCurrentTerm again to allowSingleNonLeadingWildCardPerTerm
allowWildCardInCurrentTerm = allowSingleNonLeadingWildCardPerTerm;
sb.append(c);
} else {
// '~' is used for synonyms search: This is jackrabbit specific and different than lucene fuzzy
// see http://wiki.apache.org/jackrabbit/SynonymSearch. It is only allowed to be the first char
// of a word
// '!' or '-' are used to NOT a term. However, we only allow them at the beginning of a term as they
// do not make sense any where else
if (c == '~' || c == '!' || c == '-') {
if (sb.length() == 0) {
if (containsNextCharAndIsNotSpecial(input, i)) {
sb.append(c);
}
} else {
char prevChar = sb.charAt(sb.length() - 1);
if (prevChar == ' ') {
sb.append(c);
} else if (c == '-') {
// check next char : only if next char is again a non-special char we include the '-'
if (containsNextCharAndIsNotSpecial(input, i)) {
sb.append(c);
}
}
// else we remove the ~ , !, -
}
} else if (sb.length() > 0) {
// if one wildcard is allowed, it will be added but never as leading
// also if the wildcard is found after a special char (like '!', '-', '&', ' ' etc, it will be skipped as well)
if (c == '*' || c == '?') {
if (allowWildCardInCurrentTerm && !prevCharIsSpecialOrRemoved) {
// check if prev char is not a space or " or '
// i must be > 0 here
char prevChar = sb.charAt(sb.length() - 1);
if (!(prevChar == '\"' || prevChar == '\'' || prevChar == ' ')) {
sb.append(c);
allowWildCardInCurrentTerm = false;
}
}
}
}
}
prevCharIsSpecialOrRemoved = true;
} else {
sb.append(c);
prevCharIsSpecialOrRemoved = false;
}
}
String output = sb.toString();
if(!input.equals(output)) {
log.debug("Rewrote input '{}' to '{}'", input, output);
log.debug("Rewrote input '{}' to '{}'", input, output);
}
return output;
}
}
private static boolean containsNextCharAndIsNotSpecial(final String input, final int cursor) {
if ((input.length() > cursor + 1) && !isSpecialChar(input.charAt(cursor + 1))) {
return true;
}
return false;
}
public static boolean isSpecialChar(final char c) {
return c == '(' || c == ')' || c == '^' || c == '[' || c == ']' || c == '{'
|| c == '}' || c == '~' || c == '*' || c == '?' || c == '|' || c == '&'
|| c =='!' || c == '-' || c == '\"' || c == '\'' || c == ' ';
}
/**
* Rewrites any "NOT" operators in the keywords to a minus symbol (-). This is necessary because Jackrabbit doesn't
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment