123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220 |
- /* Copyright (c) 2005 MySQL AB, 2009 Sun Microsystems, Inc.
- Use is subject to license terms.
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; version 2 of the License.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */
- #ifndef _my_plugin_ftparser_h
- #define _my_plugin_ftparser_h
- #include "plugin.h"
- #ifdef __cplusplus
- extern "C" {
- #endif
- /*************************************************************************
- API for Full-text parser plugin. (MYSQL_FTPARSER_PLUGIN)
- */
- #define MYSQL_FTPARSER_INTERFACE_VERSION 0x0100 /* value a plugin can store in st_mysql_ftparser::interface_version */
- /* Parsing modes. Set in MYSQL_FTPARSER_PARAM::mode */
- enum enum_ftparser_mode /* parsing mode requested from the parser; carried in MYSQL_FTPARSER_PARAM::mode */
- {
- /*
- Fast and simple mode. This mode is used for indexing, and natural
- language queries.
- The parser is expected to return only those words that go into the
- index. Stopwords or too short/long words should not be returned. The
- 'boolean_info' argument of mysql_add_word() does not have to be set.
- */
- MYSQL_FTPARSER_SIMPLE_MODE= 0, /* index-worthy words only; boolean_info may be left unset */
- /*
- Parse with stopwords mode. This mode is used in boolean searches for
- "phrase matching."
- The parser is not allowed to ignore words in this mode. Every word
- should be returned, including stopwords and words that are too short
- or long. The 'boolean_info' argument of mysql_add_word() does not
- have to be set.
- */
- MYSQL_FTPARSER_WITH_STOPWORDS= 1, /* every word is returned, stopwords included; boolean_info may be left unset */
- /*
- Parse in boolean mode. This mode is used to parse a boolean query string.
- The parser should provide a valid MYSQL_FTPARSER_BOOLEAN_INFO
- structure in the 'boolean_info' argument to mysql_add_word().
- Usually that means that the parser should recognize boolean operators
- in the parsing stream and set appropriate fields in
- MYSQL_FTPARSER_BOOLEAN_INFO structure accordingly. As for
- MYSQL_FTPARSER_WITH_STOPWORDS mode, no word should be ignored.
- Instead, use FT_TOKEN_STOPWORD for the token type of such a word.
- */
- MYSQL_FTPARSER_FULL_BOOLEAN_INFO= 2 /* every word is returned together with a filled-in MYSQL_FTPARSER_BOOLEAN_INFO */
- };
- /*
- Token types for boolean mode searching (used for the type member of
- MYSQL_FTPARSER_BOOLEAN_INFO struct)
- FT_TOKEN_EOF: End of data.
- FT_TOKEN_WORD: Regular word.
- FT_TOKEN_LEFT_PAREN: Left parenthesis (start of group/sub-expression).
- FT_TOKEN_RIGHT_PAREN: Right parenthesis (end of group/sub-expression).
- FT_TOKEN_STOPWORD: Stopword.
- */
- enum enum_ft_token_type /* values for MYSQL_FTPARSER_BOOLEAN_INFO::type */
- {
- FT_TOKEN_EOF= 0, /* end of data */
- FT_TOKEN_WORD= 1, /* regular word */
- FT_TOKEN_LEFT_PAREN= 2, /* left parenthesis: start of a group/sub-expression */
- FT_TOKEN_RIGHT_PAREN= 3, /* right parenthesis: end of a group/sub-expression */
- FT_TOKEN_STOPWORD= 4 /* stopword */
- };
- /*
- This structure is used in boolean search mode only. It conveys
- boolean-mode metadata to the MySQL search engine for every word in
- the search query. A valid instance of this structure must be filled
- in by the plugin parser and passed as an argument in the call to
- mysql_add_word (the callback function in the MYSQL_FTPARSER_PARAM
- structure) when a query is parsed in boolean mode.
- type: The token type. Should be one of the enum_ft_token_type values.
- yesno: Whether the word must be present for a match to occur:
- >0 Must be present
- <0 Must not be present
- 0 Neither; the word is optional but its presence increases the relevance
- With the default settings of the ft_boolean_syntax system variable,
- >0 corresponds to the '+' operator, <0 corresponds to the '-' operator,
- and 0 means neither operator was used.
- weight_adjust: A weighting factor that determines how much a match
- for the word counts. Positive values increase, and negative values
- decrease, the word's relative importance in the query.
- wasign: The sign of the word's weight in the query. If it's non-negative
- the match for the word will increase document relevance, if it's
- negative - decrease (the word becomes a "noise word", the less of it the
- better).
- trunc: Corresponds to the '*' operator in the default setting of the
- ft_boolean_syntax system variable.
- */
- typedef struct st_mysql_ftparser_boolean_info /* per-word boolean-mode metadata handed to mysql_add_word() */
- {
- enum enum_ft_token_type type; /* token type; one of the enum_ft_token_type values */
- int yesno; /* >0: word must be present, <0: word must not be present, 0: optional */
- int weight_adjust; /* positive raises, negative lowers the word's importance in the query */
- char wasign; /* sign of the word's weight; negative makes a match decrease relevance */
- char trunc; /* corresponds to the '*' operator under the default ft_boolean_syntax */
- /* These are parser state and must be removed. */
- char prev; /* NOTE(review): not described in this header beyond "parser state" - confirm meaning before use */
- char *quot; /* NOTE(review): not described in this header beyond "parser state" - confirm meaning before use */
- } MYSQL_FTPARSER_BOOLEAN_INFO;
- /*
- The following flag means that buffer with a string (document, word)
- may be overwritten by the caller before the end of the parsing (that is
- before st_mysql_ftparser::deinit() call). Any code that needs the string
- to survive between two successive calls of the parsing function
- needs to save a copy of it. The flag may be set by MySQL before calling
- st_mysql_ftparser::parse(), or it may be set by a plugin before calling
- st_mysql_ftparser_param::mysql_parse() or
- st_mysql_ftparser_param::mysql_add_word().
- */
- #define MYSQL_FTFLAGS_NEED_COPY 1 /* flag value for MYSQL_FTPARSER_PARAM::flags: string buffer may be overwritten, copy it if it must persist */
- /*
- An argument of the full-text parser plugin. This structure is
- filled in by MySQL server and passed to the parsing function of the
- plugin as an in/out parameter.
- mysql_parse: A pointer to the built-in parser implementation of the
- server. It's set by the server and can be used by the parser plugin
- to invoke the MySQL default parser. If plugin's role is to extract
- textual data from .doc, .pdf or .xml content, it might extract
- plaintext from the content, and then pass the text to the default
- MySQL parser to be parsed.
- mysql_add_word: A server callback to add a new word. When parsing
- a document, the server sets this to point at a function that adds
- the word to MySQL full-text index. When parsing a search query,
- this function will add the new word to the list of words to search
- for. The boolean_info argument can be NULL for all cases except
- when mode is MYSQL_FTPARSER_FULL_BOOLEAN_INFO.
- ftparser_state: A generic pointer. The plugin can set it to point
- to information to be used internally for its own purposes.
- mysql_ftparam: This is set by the server. It is used by MySQL functions
- called via mysql_parse() and mysql_add_word() callback. The plugin
- should not modify it.
- cs: Information about the character set of the document or query string.
- doc: A pointer to the document or query string to be parsed.
- length: Length of the document or query string, in bytes.
- flags: See MYSQL_FTFLAGS_* constants above.
- mode: The parsing mode. With boolean operators, with stopwords, or
- nothing. See enum_ftparser_mode above.
- */
- typedef struct st_mysql_ftparser_param /* in/out argument filled in by the server and passed to the plugin's parsing function */
- {
- int (*mysql_parse)(struct st_mysql_ftparser_param *,
- const char *doc, int doc_len); /* server's built-in parser; a plugin may call it on extracted plaintext. NOTE(review): meaning of the int result is not stated in this header */
- int (*mysql_add_word)(struct st_mysql_ftparser_param *,
- const char *word, int word_len,
- MYSQL_FTPARSER_BOOLEAN_INFO *boolean_info); /* server callback that adds a word; boolean_info may be NULL unless mode is MYSQL_FTPARSER_FULL_BOOLEAN_INFO. NOTE(review): meaning of the int result is not stated in this header */
- void *ftparser_state; /* free for the plugin's own internal use */
- void *mysql_ftparam; /* set by the server and used by the callbacks above; the plugin should not modify it */
- const struct charset_info_st *cs; /* character set of the document or query string */
- const char *doc; /* document or query string to be parsed */
- int length; /* length of doc, in bytes */
- unsigned int flags; /* MYSQL_FTFLAGS_* constants */
- enum enum_ftparser_mode mode; /* requested parsing mode; see enum_ftparser_mode */
- } MYSQL_FTPARSER_PARAM;
- /*
- Full-text parser descriptor.
- interface_version is, e.g., MYSQL_FTPARSER_INTERFACE_VERSION.
- The parsing, initialization, and deinitialization functions are
- invoked per SQL statement for which the parser is used.
- */
- struct st_mysql_ftparser /* full-text parser descriptor; the three functions are invoked per SQL statement that uses the parser */
- {
- int interface_version; /* e.g. MYSQL_FTPARSER_INTERFACE_VERSION */
- int (*parse)(MYSQL_FTPARSER_PARAM *param); /* parsing function */
- int (*init)(MYSQL_FTPARSER_PARAM *param); /* initialization function */
- int (*deinit)(MYSQL_FTPARSER_PARAM *param); /* deinitialization function */
- };
- #ifdef __cplusplus
- }
- #endif
- #endif
|