gruut package

Submodules

gruut.const module

Shared classes, types, and enums

class gruut.const.BreakNode(node, element=None, voice='', lang='', implicit=False, time='')

Bases: gruut.const.Node

Represents a user-specified break

get_milliseconds()

Get number of milliseconds from the time string

Return type

int

time: str = ''

Length of break in seconds (123s) or milliseconds (123ms)

class gruut.const.BreakType(value)

Bases: str, enum.Enum

Types of sentence breaks

MAJOR = 'major'

Break between sentences

MINOR = 'minor'

Break between phrases

class gruut.const.BreakWordNode(node, element=None, voice='', lang='', implicit=False, break_type='', text='', text_with_ws='')

Bases: gruut.const.Node

Represents a major/minor break in the text

break_type: Union[str, gruut.const.BreakType] = ''
text: str = ''
text_with_ws: str = ''
class gruut.const.EndElement(element)

Bases: object

Wrapper for end of an XML element (used in TextProcessor)

element: xml.etree.ElementTree.Element
class gruut.const.GetPartsOfSpeech

Bases: object

Get part of speech tags for words

class gruut.const.GraphType

Bases: object

Type wrapper for networkx graph

add_edge(src, dst)

Add a new edge to the graph

add_edges_from(edges)

Add edges from iterable

add_node(node, **kwargs)

Add a new node to the graph

nodes: Dict[int, Dict[Any, Any]]

Get node data for the graph

out_degree(node)

Get number of outgoing edges from a node

Return type

int

out_edges(node)

Yield outgoing edges from a node

Return type

Iterable[Tuple[int, int]]

predecessors(node)

Yield nodes from incoming edges

Return type

Iterable[int]

remove_edges_from(edges)

Remove edges from iterable

successors(node)

Yield nodes on outgoing edges

Return type

Iterable[int]

class gruut.const.GuessPhonemes

Bases: object

Guess phonemes for word/role

class gruut.const.IgnoreNode(node, element=None, voice='', lang='', implicit=False)

Bases: gruut.const.Node

Node should be ignored

node: int
class gruut.const.InlineLexicon(lexicon_id, alphabet='', words=<factory>)

Bases: object

SSML lexicon defined inline (not standards compliant)

alphabet: str = ''
lexicon_id: str
words: Dict[str, Dict[str, Sequence[str]]]
class gruut.const.InterpretAs(value)

Bases: str, enum.Enum

Supported options for interpret-as attribute of <say-as>

CURRENCY = 'currency'

Word should be interpreted as an amount of currency

DATE = 'date'

Word should be interpreted as a date

NUMBER = 'number'

Word should be interpreted as a number

SPELL_OUT = 'spell-out'

Word should be spelled out (abc = a b c)

TIME = 'time'

Word should be interpreted as a time on the clock

WORD = 'word'

Interpret as regular word

class gruut.const.InterpretAsFormat(value)

Bases: str, enum.Enum

Supported options for format attribute of <say-as>

DATE_DMY = 'dmy'
DATE_DMY_ORDINAL = 'omy'
DATE_DM_ORDINAL = 'om'
DATE_MD = 'md'
DATE_MDY = 'mdy'
DATE_MDY_ORDINAL = 'moy'
DATE_MD_ORDINAL = 'mo'
DATE_MY = 'my'
DATE_Y = 'y'
DATE_YM = 'ym'
DATE_YMD = 'ymd'
DATE_YMD_ORDINAL = 'ymo'
NUMBER_CARDINAL = 'cardinal'

Cardinal version of number (1 = one)

NUMBER_DIGITS = 'digits'

Number as digits (12 = one two)

NUMBER_ORDINAL = 'ordinal'

Ordinal version of number (1 = first)

NUMBER_YEAR = 'year'

Number as a year (2021 = twenty twenty-one)

class gruut.const.Lexeme(grapheme='', phonemes=None, roles=None)

Bases: object

Entry of an inline lexicon

grapheme: str = ''
phonemes: Optional[Sequence[str]] = None
roles: Optional[Set[str]] = None
class gruut.const.LookupPhonemes

Bases: object

Look up phonemes for word/role in a lexicon

class gruut.const.MarkNode(node, element=None, voice='', lang='', implicit=False, name='')

Bases: gruut.const.Node

Represents a user-specified mark

name: str = ''

Name of the mark

class gruut.const.Node(node, element=None, voice='', lang='', implicit=False)

Bases: object

Base class of all text processing graph nodes

element: Optional[xml.etree.ElementTree.Element] = None
implicit: bool = False
lang: str = ''
node: int
voice: str = ''
class gruut.const.ParagraphNode(node, element=None, voice='', lang='', implicit=False)

Bases: gruut.const.Node

Represents a paragraph with SentenceNodes under it

node: int
class gruut.const.PostProcessSentence

Bases: object

Post-process each sentence node after tokenization/phonemization

class gruut.const.PunctuationWordNode(node, element=None, voice='', lang='', implicit=False, text='', text_with_ws='')

Bases: gruut.const.Node

Represents a punctuation marker in the text

text: str = ''
text_with_ws: str = ''
class gruut.const.SSMLParsingState(value)

Bases: int, enum.Enum

Current state of SSML parsing

DEFAULT = 0
IN_LEXICON = 2

Inside <lexicon>

IN_LEXICON_GRAPHEME = 3

Inside <lexicon><grapheme>…

IN_LEXICON_PHONEME = 4

Inside <lexicon><phoneme>…

IN_WORD = 1

Inside <w> or <token>

class gruut.const.Sentence(idx, text, text_with_ws, text_spoken, par_idx=0, lang='', voice='', words=<factory>, pause_before_ms=0, pause_after_ms=0, marks_before=None, marks_after=None)

Bases: object

Processed sentence from a document

idx: int

Zero-based index of sentence in paragraph

lang: str = ''

Language code

marks_after: Optional[List[str]] = None

User-defined marks that occur after this sentence

marks_before: Optional[List[str]] = None

User-defined marks that occur before this sentence

par_idx: int = 0

Zero-based index of paragraph in document

pause_after_ms: int = 0

Milliseconds to pause after this sentence

pause_before_ms: int = 0

Milliseconds to pause before this sentence

text: str

Text with normalized whitespace

text_spoken: str

Text with only spoken words and normalized whitespace

text_with_ws: str

Text with original whitespace

voice: str = ''

Voice (from SSML)

words: List[gruut.const.Word]

Words in the sentence

class gruut.const.SentenceNode(node, element=None, voice='', lang='', implicit=False)

Bases: gruut.const.Node

Represents a sentence with WordNodes under it

node: int
class gruut.const.SpeakNode(node, element=None, voice='', lang='', implicit=False)

Bases: gruut.const.Node

Top-level node for SSML

node: int
class gruut.const.TextProcessorSettings(lang, split_words=<function default_split_words>, join_str=' ', keep_whitespace=True, is_non_word=None, get_whitespace=<function default_get_whitespace>, normalize_whitespace=<function default_normalize_whitespace>, begin_punctuations=None, begin_punctuations_pattern=None, end_punctuations=None, end_punctuations_pattern=None, replacements=<factory>, abbreviations=<factory>, spell_out_words=<factory>, major_breaks=<factory>, major_breaks_pattern=None, minor_breaks=<factory>, minor_breaks_pattern=None, word_breaks=<factory>, word_breaks_pattern=None, is_maybe_number=<function has_digit>, get_ordinal=None, babel_locale=None, num2words_lang=None, default_currency='USD', currencies=<factory>, currency_symbols=<factory>, is_maybe_currency=<function has_digit>, dateparser_lang=None, is_maybe_date=<function has_digit>, default_date_format=InterpretAsFormat.DATE_MDY_ORDINAL, is_maybe_time=<function has_digit>, parse_time=None, verbalize_time=None, get_parts_of_speech=None, is_initialism=None, split_initialism=None, lookup_phonemes=None, guess_phonemes=None, pre_process_text=None, post_process_sentence=None)

Bases: object

Language specific settings for text processing

abbreviations: Dict[Union[str, re.Pattern], str]

Regex, replacement template pairs that may expand words after minor breaks are matched

babel_locale: Optional[str] = None

Locale used to parse numbers/dates/currencies (defaults to lang)

begin_punctuations: Optional[Set[str]] = None

Strings that should be split off from the beginning of a word

begin_punctuations_pattern: Optional[Union[str, re.Pattern]] = None

Regex that overrides begin_punctuations

currencies: MutableMapping[str, str]

Mapping from currency symbol ($) to currency name (USD)

currency_symbols: Sequence[str]

Ordered list of currency symbols (decreasing length)

dateparser_lang: Optional[str] = None

Language used to parse dates (defaults to lang)

default_currency: str = 'USD'

Currency name to use when interpret-as="currency" but no currency symbol is present

default_date_format: Union[str, gruut.const.InterpretAsFormat] = 'moy'

Format used to verbalize a date unless set with the format attribute of <say-as>

end_punctuations: Optional[Set[str]] = None

Strings that should be split off from the end of a word

end_punctuations_pattern: Optional[Union[str, re.Pattern]] = None

Regex that overrides end_punctuations

get_ordinal: Optional[Callable[[str], Optional[int]]] = None

Returns integer value of an ordinal string (e.g., 1st -> 1) or None if not an ordinal

get_parts_of_speech: Optional[gruut.const.GetPartsOfSpeech] = None

Optional function to get part of speech for a word

get_whitespace()

Returns leading, trailing whitespace from a string

Return type

Tuple[str, str]

guess_phonemes: Optional[gruut.const.GuessPhonemes] = None

Optional function to guess phonemes for a word/role

is_initialism: Optional[Callable[[str], bool]] = None

True if a word is an initialism (will be split with split_initialism)

is_maybe_currency()

True if a word may be an amount of currency (parsing will be attempted)

Return type

bool

is_maybe_date()

True if a word may be a date (parsing will be attempted)

Return type

bool

is_maybe_number()

True if a word may be a number (parsing will be attempted)

Return type

bool

is_maybe_time()

True if a word may be a clock time (parsing will be attempted)

Return type

bool

is_non_word: Optional[Callable[[str], bool]] = None

Returns true if text is not a word (and should be ignored in final output)

join_str: str = ' '

String used to combine text from words

keep_whitespace: bool = True

True if original whitespace should be retained

lang: str

Language code that these settings apply to (e.g., en_US)

lookup_phonemes: Optional[gruut.const.LookupPhonemes] = None

Optional function to look up phonemes for a word/role (without guessing)

major_breaks: Set[str]

Set of strings that occur at the end of a word and should break apart sentences.

major_breaks_pattern: Optional[Union[str, re.Pattern]] = None

Regex that overrides major_breaks

minor_breaks: Set[str]

Set of strings that occur at the end of a word and should break apart phrases.

minor_breaks_pattern: Optional[Union[str, re.Pattern]] = None

Regex that overrides minor_breaks

normalize_whitespace()

Normalizes whitespace in a string

Return type

str

num2words_lang: Optional[str] = None

Language used to verbalize numbers (defaults to lang)

parse_time: Optional[Callable[[str], Optional[gruut.const.Time]]] = None

Parse word text into a Time object or None

post_process_sentence: Optional[gruut.const.PostProcessSentence] = None

Optional function to post-process each sentence in the graph before post_process_graph

pre_process_text: Optional[Callable[[str], str]] = None

Optional function to process text during tokenization

replacements: Sequence[Tuple[Union[str, re.Pattern], str]]

Regex, replacement template pairs that are applied in order right after tokenization on each word

spell_out_words: Dict[str, str]

Written form, spoken form pairs that are applied with interpret-as="spell-out" in <say-as>

split_initialism: Optional[Callable[[str], Sequence[str]]] = None

Function to break apart an initialism into multiple words (called if is_initialism is True)

split_words()

Split text into words and separators

Return type

Iterable[str]

verbalize_time: Optional[Callable[[gruut.const.Time], Iterable[str]]] = None

Convert Time to words

word_breaks: Set[str]
word_breaks_pattern: Optional[Union[str, re.Pattern]] = None

Regex that overrides word_breaks

class gruut.const.Time(hours, minutes=0, period=None)

Bases: object

Parsed time from text

hours: int
minutes: int = 0
period: Optional[str] = None

A.M. or P.M.

class gruut.const.Word(idx, text, text_with_ws, leading_ws='', trailing_ws='', sent_idx=0, par_idx=0, lang='', voice='', pos=None, phonemes=None, is_major_break=False, is_minor_break=False, is_punctuation=False, is_break=None, is_spoken=None, pause_before_ms=0, pause_after_ms=0, marks_before=None, marks_after=None)

Bases: object

Processed word from a Sentence

idx: int

Zero-based index of word in sentence

is_break: Optional[bool] = None

True if major or minor break

is_major_break: bool = False

True if word is a major break (separates sentences)

is_minor_break: bool = False

True if word is a minor break (separates phrases)

is_punctuation: bool = False

True if word is punctuation that surrounds a spoken word (quotes, etc.)

is_spoken: Optional[bool] = None

True if word is something that would be spoken during reading (not punctuation or break)

lang: str = ''

Language code

leading_ws: str = ''

Whitespace before text

marks_after: Optional[List[str]] = None

User-defined marks that occur after this word

marks_before: Optional[List[str]] = None

User-defined marks that occur before this word

par_idx: int = 0

Zero-based index of paragraph in document

pause_after_ms: int = 0

Milliseconds to pause after this word

pause_before_ms: int = 0

Milliseconds to pause before this word

phonemes: Optional[Sequence[str]] = None

List of phonemes (None if not set)

pos: Optional[str] = None

Part of speech (None if not set)

sent_idx: int = 0

Zero-based index of sentence in paragraph

text: str

Text with normalized whitespace

text_with_ws: str

Text with original whitespace

trailing_ws: str = ''

Whitespace after text

voice: str = ''

Voice (from SSML)

class gruut.const.WordNode(node, element=None, voice='', lang='', implicit=False, text='', text_with_ws='', interpret_as='', format='', number=None, date=None, currency_symbol=None, currency_name=None, time=None, role=WordRole.DEFAULT, pos=None, phonemes=None, in_lexicon=None, lexicon_ids=None, is_maybe_number=True, is_maybe_date=True, is_maybe_currency=True, is_maybe_time=True, is_from_broken_word=False)

Bases: gruut.const.Node

Represents a single word

currency_name: Optional[str] = None
currency_symbol: Optional[str] = None
date: Optional[datetime.datetime] = None
format: Union[str, gruut.const.InterpretAsFormat] = ''
in_lexicon: Optional[bool] = None
interpret_as: Union[str, gruut.const.InterpretAs] = ''
is_from_broken_word: bool = False
is_maybe_currency: bool = True
is_maybe_date: bool = True
is_maybe_number: bool = True
is_maybe_time: bool = True
lexicon_ids: Optional[Sequence[str]] = None
number: Optional[decimal.Decimal] = None
phonemes: Optional[Sequence[str]] = None
pos: Optional[str] = None
role: Union[str, gruut.const.WordRole] = ''
text: str = ''
text_with_ws: str = ''
time: Optional[gruut.const.Time] = None
class gruut.const.WordRole(value)

Bases: str, enum.Enum

Role of a word. Used to disambiguate pronunciations.

DEFAULT = ''

Use default word pronunciation

LETTER = 'gruut:letter'

Word should be pronounced as a letter (a = /eɪ/ instead of /ə/)

gruut.const.default_get_whitespace(s)

Returns leading and trailing whitespace of a string

Return type

Tuple[str, str]

gruut.const.default_normalize_whitespace(s)

Replace multiple spaces with single space

Return type

str

gruut.const.default_split_words(s)

Split text on whitespace

Return type

Iterable[str]

gruut.const.has_digit(s)

True if string contains at least one digit

Return type

bool

gruut.const.maybe_compile_regex(str_or_pattern)

Compile regex pattern if it’s a string

Return type

Pattern

gruut.corpus2db module

Converts a Phonetisaurus G2P corpus to an sqlite database

gruut.corpus2db.main()

Main entry point

gruut.g2p module

Grapheme to phoneme prediction using python CRF suite.

Training requires pre-aligned corpus in Phonetisaurus format. https://github.com/AdolfVonKleist/Phonetisaurus

The format of this corpus is:

t}t e}ˈɛ s}s t}t

Each line contains a single word, with graphemes and phonemes separated by “}”. Multiple graphemes are separated by “|”:

s|h}ʃ o|w}ˈoʊ

The empty phoneme is “_”:

w}w h}_ y}ˈaɪ

Example:

python3 -m gruut.g2p train --corpus g2p.corpus --output model.crf

Pre-trained models have the following settings:

  • c1 = 0

  • c2 = 1

  • max-iterations = 100

class gruut.g2p.GraphemesToPhonemes(crf_tagger, eps_phoneme='_', phoneme_join='|')

Bases: object

Grapheme to phoneme CRF tagger

static decode_string(s)

Decodes a string encoded by encode_string

Return type

str

static encode_string(s)

Encodes string in a form that crfsuite will accept (ASCII) and can be decoded

Return type

str

static grapheme2features(word, i, add_begin=True, add_end=True, chars_backward=3, chars_forward=3, bias=1.0, encode=True)

Create feature dict for single grapheme

Return type

Dict[str, Union[str, bool, int, float]]

static word2features(word, normalize=True, **kwargs)

Create feature dicts for all graphemes in a word

gruut.g2p.do_predict(args)

CLI method for predict

gruut.g2p.do_test(args)

CLI method for test

gruut.g2p.do_train(args)

CLI method for train

gruut.g2p.main()

Main entry point

gruut.g2p.train(corpus_path, output_path, group_separator='}', item_separator='|', phoneme_join='|', eps_phoneme='_', remove_phonemes=None, c1=0.0, c2=1.0, max_iterations=100)

Train a new G2P model

gruut.g2p_phonetisaurus module

Guess word pronunciations using a Phonetisaurus FST

See bin/fst2npz.py to convert an FST to a numpy graph.

class gruut.g2p_phonetisaurus.PhonetisaurusGraph(graph, preload=False)

Bases: object

Graph of numpy arrays that represents a Phonetisaurus FST

Also contains shared cache of edges and final state probabilities. These caches are necessary to ensure that the .npz file stays small and fast to load.

g2p(words, **kwargs)

Guess phonemes for words

Return type

Iterable[Tuple[Union[str, Sequence[str]], Sequence[str], Sequence[str]]]

g2p_one(word, eps='<eps>', beam=5000, min_beam=100, beam_scale=0.6, grapheme_separator='', max_guesses=1)

Guess phonemes for word

Return type

Iterable[Tuple[Sequence[str], Sequence[str]]]

static load(graph_path, **kwargs)

Load .npz file with numpy graph

Return type

PhonetisaurusGraph

gruut.g2p_phonetisaurus.do_predict(args)

Predict phonemes for words

gruut.g2p_phonetisaurus.do_test(args)

Test performance relative to a known lexicon

gruut.g2p_phonetisaurus.main()

Main entry point

gruut.lang module

Language-specific settings

class gruut.lang.ArabicPreProcessText

Bases: object

Pre-processes text using mishkal

class gruut.lang.DelayedGraphemesToPhonemes(model_path, transform_func=None, **g2p_args)

Bases: object

Grapheme to phoneme guesser that loads on first use

class gruut.lang.DelayedPartOfSpeechTagger(model_path, **tagger_args)

Bases: object

POS tagger that loads on first use

class gruut.lang.DelayedSqlitePhonemizer(db_path, **phonemizer_args)

Bases: object

Phonemizer that loads on first use

class gruut.lang.FarsiPartOfSpeechTagger(lang_dir)

Bases: object

Add POS tags with hazm

gruut.lang.en_get_ordinal(text)

Parse English ordinal string (e.g., 1st -> 1)

Return type

Optional[int]

gruut.lang.en_is_initialism(text)

True if text is of the form TTS or T.T.S.

Return type

bool

gruut.lang.en_parse_time(text)

Parse English clock time (e.g. 4:01pm)

Return type

Optional[Time]

gruut.lang.en_verbalize_time(time)

Convert time into words

Return type

Iterable[str]

gruut.lang.fa_post_process_sentence(graph, sent_node, settings)

Add e̞ for genitive case

gruut.lang.fr_has_silent_consonant(last_char, last_phoneme)

True if last consonant is silent in French

Return type

bool

gruut.lang.fr_is_vowel(phoneme)

True if phoneme is a French vowel

Return type

bool

gruut.lang.fr_post_process_sentence(graph, sent_node, settings)

Add liaisons to phonemes

gruut.lang.get_ar_settings(lang_dir=None, **settings_args)

Create settings for Arabic

Return type

TextProcessorSettings

gruut.lang.get_cs_settings(lang_dir=None, **settings_args)

Create settings for Czech

Return type

TextProcessorSettings

gruut.lang.get_de_settings(lang_dir=None, **settings_args)

Create settings for German

Return type

TextProcessorSettings

gruut.lang.get_en_us_settings(lang_dir=None, **settings_args)

Create settings for English

Return type

TextProcessorSettings

gruut.lang.get_es_settings(lang_dir=None, **settings_args)

Create settings for Spanish

Return type

TextProcessorSettings

gruut.lang.get_fa_settings(lang_dir=None, **settings_args)

Create settings for Farsi

Return type

TextProcessorSettings

gruut.lang.get_fr_settings(lang_dir=None, **settings_args)

Create settings for French

Return type

TextProcessorSettings

gruut.lang.get_it_settings(lang_dir=None, **settings_args)

Create settings for Italian

Return type

TextProcessorSettings

gruut.lang.get_nl_settings(lang_dir=None, **settings_args)

Create settings for Dutch

Return type

TextProcessorSettings

gruut.lang.get_pt_settings(lang_dir=None, **settings_args)

Create default settings for Portuguese

Return type

TextProcessorSettings

gruut.lang.get_ru_settings(lang_dir=None, **settings_args)

Create settings for Russian

Return type

TextProcessorSettings

gruut.lang.get_settings(lang, search_dirs=None, lang_dir=None, model_prefix=None, load_pos_tagger=True, load_phoneme_lexicon=True, load_g2p_guesser=True, **settings_args)

Get settings for a specific language

Return type

TextProcessorSettings

gruut.lang.get_sv_settings(lang_dir=None, **settings_args)

Create settings for Swedish

Return type

TextProcessorSettings

gruut.lang.get_sw_settings(lang_dir=None, **settings_args)

Create settings for Swahili

Return type

TextProcessorSettings

gruut.lang.get_zh_settings(lang_dir=None, **settings_args)

Create settings for Chinese

Return type

TextProcessorSettings

gruut.lexicon2db module

Converts a text lexicon to a gruut sqlite3 database

gruut.lexicon2db.main()

Main entry point

gruut.phonemize module

Class for getting phonetic pronunciations for tokenized text

class gruut.phonemize.SqlitePhonemizer(db_conn, lexicon=None, g2p_model=None, word_transform_funcs=None, casing_func=None)

Bases: object

Phonemizes text using a lexicon from a sqlite database

DEFAULT_ROLE: str = ''

gruut.pos module

Part of speech tagging using python CRF suite.

Credit to: https://towardsdatascience.com/pos-tagging-using-crfs-ea430c5fb78b

Training requires conllu package:

pip install conllu

Training data comes from Universal Dependencies (https://universaldependencies.org/)

Example:

python3 -m gruut.pos train --conllu train.conllu --output model.crf --label xpos

Pre-trained models have the following settings:

  • c1 = 0.25

  • c2 = 0.3

  • max-iterations = 100

English model is trained with “xpos” label. French model is trained with “upos” label.

class gruut.pos.PartOfSpeechTagger(crf_tagger, **kwargs)

Bases: object

Part of speech tagger using a pre-trained CRF model

static decode_string(s)

Decodes a string encoded by encode_string

Return type

str

static encode_string(s)

Encodes string in a form that crfsuite will accept (ASCII) and can be decoded

Return type

str

static local_features(word, prefix='', bias=1.0, add_punctuation=True, add_digit=True, add_length=True, chars_front=3, chars_back=3, encode=True)

Get features for a single word

Return type

Dict[str, Union[str, bool, int, float, Sequence[str]]]

static sent2features(sentence, **kwargs)

Get features for all words in a sentence

Return type

List[Dict[str, Union[str, bool, int, float, Sequence[str]]]]

static word2features(sentence, i, add_bos=True, add_eos=True, words_backward=2, words_forward=2, **kwargs)

Get features for a word and surrounding context

Return type

Dict[str, Union[str, bool, int, float, Sequence[str]]]

gruut.pos.do_predict(args)

CLI method for predict

gruut.pos.do_print_labels(args)

Print label set from a CONLLU file

gruut.pos.do_test(args)

CLI method for testing

gruut.pos.do_train(args)

CLI method for train_model

gruut.pos.main()

Main entry point

gruut.pos.train_model(conllu_path, output_path, label='xpos', c1=0.25, c2=0.3, max_iterations=100)

Train a new model from CONLLU data

gruut.text_processor module

Tokenizes, verbalizes, and phonemizes text and SSML

class gruut.text_processor.TextProcessor(default_lang='en_US', model_prefix='', lang_dirs=None, search_dirs=None, settings=None, **kwargs)

Bases: object

Tokenizes, verbalizes, and phonemizes text and SSML

get_settings(lang=None)

Gets or creates settings for a language

Return type

TextProcessorSettings

post_process_graph(graph, root)

User-defined post-processing of entire graph

process(text, lang=None, ssml=False, pos=True, phonemize=True, post_process=True, add_speak_tag=True, detect_numbers=True, detect_currency=True, detect_dates=True, detect_times=True, verbalize_numbers=True, verbalize_currency=True, verbalize_dates=True, verbalize_times=True, max_passes=5)

Processes text or SSML

Parameters
  • text (str) – input text or SSML (ssml=True)

  • lang (Optional[str]) – default language of input text

  • ssml (bool) – True if input text is SSML

  • pos (bool) – False if part of speech tagging should be disabled

  • phonemize (bool) – False if phonemization should be disabled

  • post_process (bool) – False if sentence/graph post-processing should be disabled

  • add_speak_tag (bool) – True if <speak> should be automatically added to input text when ssml=True

  • detect_numbers (bool) – True if numbers should be annotated in text (interpret_as="number")

  • detect_currency (bool) – True if currency amounts should be annotated in text (interpret_as="currency")

  • detect_dates (bool) – True if dates should be annotated in text (interpret_as="date")

  • detect_times (bool) – True if clock times should be annotated in text (interpret_as="time")

  • verbalize_numbers (bool) – True if annotated numbers should be expanded into words

  • verbalize_currency (bool) – True if annotated currency amounts should be expanded into words

  • verbalize_dates (bool) – True if annotated dates should be expanded into words

  • verbalize_times (bool) – True if annotated clock times should be expanded into words

Returns

text graph and root node

Return type

graph, root

sentences(graph, root, major_breaks=True, minor_breaks=True, punctuations=True, explicit_lang=True, phonemes=True, break_phonemes=True, pos=True)

Processes text and returns each sentence

Return type

Iterable[Sentence]

words(graph, root, **kwargs)

Processes text and returns each word

Return type

Iterable[Word]

gruut.utils module

Utility methods for gruut

gruut.utils.attrib_no_namespace(element, name, default=None)

Search for an attribute by key without namespaces

Return type

Any

gruut.utils.find_lang_dir(lang, search_dirs=None)

Search for a language’s model directory by name.

Tries to find a directory by:

  1. Importing a module named gruut_lang_<short_lang> where short_lang is “en” for “en-us”, etc.

  2. Looking for <lang>/lexicon.db in each directory in order:

    • search_dirs

    • $XDG_CONFIG_HOME/gruut

    • A “data” directory next to the gruut module

Parameters
  • lang (str) – Full language name (e.g., en-us)

  • search_dirs (Optional[Iterable[Union[str, Path]]]) – Optional iterable of directory paths to search first

Return type

Optional[Path]

Returns

Path to the language model directory or None if it can’t be found

gruut.utils.get_currency_names(locale_str)

Try to get currency names and symbols for a Babel locale.

Return type

Dict[str, str]

Returns

Dictionary whose keys are currency symbols (like “$”) and whose values are currency names (like “USD”)

gruut.utils.grouper(iterable, n, fillvalue=None)

Collect data into fixed-length chunks or blocks

gruut.utils.leaves(graph, node)

Iterate through the leaves of a graph in depth-first order

gruut.utils.maybe_split_ipa(s)

Split on whitespace if a space is present, otherwise return string as list of graphemes

Return type

List[str]

gruut.utils.pairwise(iterable)

s -> (s0,s1), (s1,s2), (s2, s3), …

gruut.utils.pipeline_split(split_func, graph, parent_node)

Splits leaf nodes of tree into zero or more sub-nodes

Return type

bool

gruut.utils.pipeline_transform(transform_func, graph, parent_node)

Transforms leaves of tree with a custom function

Return type

bool

gruut.utils.print_graph(graph, node, indent='--', level=1, print_func=<built-in function print>)

Prints a graph to the console

gruut.utils.remove_non_word_chars(s)

Removes non-word characters from a string

Return type

str

gruut.utils.resolve_lang(lang)

Try to resolve language using aliases.

Parameters

lang (str) – Language name or alias

Return type

str

Returns

Resolved language name

gruut.utils.sliding_window(iterable, n=2)

Returns a sliding window of size n over an iterable

gruut.utils.tag_no_namespace(tag)

Remove namespace from XML tag

Return type

str

gruut.utils.text_and_elements(element, is_last=False)

Yields element, text, sub-elements, end element, and tail

Module contents

gruut module

class gruut.TextProcessor(default_lang='en_US', model_prefix='', lang_dirs=None, search_dirs=None, settings=None, **kwargs)

Bases: object

Tokenizes, verbalizes, and phonemizes text and SSML

get_settings(lang=None)

Gets or creates settings for a language

Return type

TextProcessorSettings

post_process_graph(graph, root)

User-defined post-processing of entire graph

process(text, lang=None, ssml=False, pos=True, phonemize=True, post_process=True, add_speak_tag=True, detect_numbers=True, detect_currency=True, detect_dates=True, detect_times=True, verbalize_numbers=True, verbalize_currency=True, verbalize_dates=True, verbalize_times=True, max_passes=5)

Processes text or SSML

Parameters
  • text (str) – input text or SSML (ssml=True)

  • lang (Optional[str]) – default language of input text

  • ssml (bool) – True if input text is SSML

  • pos (bool) – False if part of speech tagging should be disabled

  • phonemize (bool) – False if phonemization should be disabled

  • post_process (bool) – False if sentence/graph post-processing should be disabled

  • add_speak_tag (bool) – True if <speak> should be automatically added to input text when ssml=True

  • detect_numbers (bool) – True if numbers should be annotated in text (interpret_as="number")

  • detect_currency (bool) – True if currency amounts should be annotated in text (interpret_as="currency")

  • detect_dates (bool) – True if dates should be annotated in text (interpret_as="date")

  • detect_times (bool) – True if clock times should be annotated in text (interpret_as="time")

  • verbalize_numbers (bool) – True if annotated numbers should be expanded into words

  • verbalize_currency (bool) – True if annotated currency amounts should be expanded into words

  • verbalize_dates (bool) – True if annotated dates should be expanded into words

  • verbalize_times (bool) – True if annotated clock times should be expanded into words

Returns

text graph and root node

Return type

graph, root

sentences(graph, root, major_breaks=True, minor_breaks=True, punctuations=True, explicit_lang=True, phonemes=True, break_phonemes=True, pos=True)

Processes text and returns each sentence

Return type

Iterable[Sentence]

words(graph, root, **kwargs)

Processes text and returns each word

Return type

Iterable[Word]

class gruut.TextProcessorSettings(lang, split_words=<function default_split_words>, join_str=' ', keep_whitespace=True, is_non_word=None, get_whitespace=<function default_get_whitespace>, normalize_whitespace=<function default_normalize_whitespace>, begin_punctuations=None, begin_punctuations_pattern=None, end_punctuations=None, end_punctuations_pattern=None, replacements=<factory>, abbreviations=<factory>, spell_out_words=<factory>, major_breaks=<factory>, major_breaks_pattern=None, minor_breaks=<factory>, minor_breaks_pattern=None, word_breaks=<factory>, word_breaks_pattern=None, is_maybe_number=<function has_digit>, get_ordinal=None, babel_locale=None, num2words_lang=None, default_currency='USD', currencies=<factory>, currency_symbols=<factory>, is_maybe_currency=<function has_digit>, dateparser_lang=None, is_maybe_date=<function has_digit>, default_date_format=InterpretAsFormat.DATE_MDY_ORDINAL, is_maybe_time=<function has_digit>, parse_time=None, verbalize_time=None, get_parts_of_speech=None, is_initialism=None, split_initialism=None, lookup_phonemes=None, guess_phonemes=None, pre_process_text=None, post_process_sentence=None)

Bases: object

Language specific settings for text processing

abbreviations: Dict[Union[str, re.Pattern], str]

Regex, replacement template pairs that may expand words after minor breaks are matched

babel_locale: Optional[str] = None

Locale used to parse numbers/dates/currencies (defaults to lang)

begin_punctuations: Optional[Set[str]] = None

Strings that should be split off from the beginning of a word

begin_punctuations_pattern: Optional[Union[str, re.Pattern]] = None

Regex that overrides begin_punctuations

currencies: MutableMapping[str, str]

Mapping from currency symbol ($) to currency name (USD)

currency_symbols: Sequence[str]

Ordered list of currency symbols (decreasing length)

dateparser_lang: Optional[str] = None

Language used to parse dates (defaults to lang)

default_currency: str = 'USD'

Currency name to use when interpret-as="currency" but no currency symbol is present

default_date_format: Union[str, gruut.const.InterpretAsFormat] = 'moy'

Format used to verbalize a date unless set with the format attribute of <say-as>

end_punctuations: Optional[Set[str]] = None

Strings that should be split off from the end of a word

end_punctuations_pattern: Optional[Union[str, re.Pattern]] = None

Regex that overrides end_punctuations

get_ordinal: Optional[Callable[[str], Optional[int]]] = None

Returns integer value of an ordinal string (e.g., 1st -> 1) or None if not an ordinal

get_parts_of_speech: Optional[gruut.const.GetPartsOfSpeech] = None

Optional function to get part of speech for a word

get_whitespace()

Returns leading, trailing whitespace from a string

Return type

Tuple[str, str]

guess_phonemes: Optional[gruut.const.GuessPhonemes] = None

Optional function to guess phonemes for a word/role

is_initialism: Optional[Callable[[str], bool]] = None

True if a word is an initialism (will be split with split_initialism)

is_maybe_currency()

True if a word may be an amount of currency (parsing will be attempted)

Return type

bool

is_maybe_date()

True if a word may be a date (parsing will be attempted)

Return type

bool

is_maybe_number()

True if a word may be a number (parsing will be attempted)

Return type

bool

is_maybe_time()

True if a word may be a clock time (parsing will be attempted)

Return type

bool

is_non_word: Optional[Callable[[str], bool]] = None

Returns true if text is not a word (and should be ignored in final output)

join_str: str = ' '

String used to combine text from words

keep_whitespace: bool = True

True if original whitespace should be retained

lang: str

Language code that these settings apply to (e.g., en_US)

lookup_phonemes: Optional[gruut.const.LookupPhonemes] = None

Optional function to look up phonemes for a word/role (without guessing)

major_breaks: Set[str]

Set of strings that occur at the end of a word and should break apart sentences.

major_breaks_pattern: Optional[Union[str, re.Pattern]] = None

Regex that overrides major_breaks

minor_breaks: Set[str]

Set of strings that occur at the end of a word and should break apart phrases.

minor_breaks_pattern: Optional[Union[str, re.Pattern]] = None

Regex that overrides minor_breaks

normalize_whitespace()

Normalizes whitespace in a string

Return type

str

num2words_lang: Optional[str] = None

Language used to verbalize numbers (defaults to lang)

parse_time: Optional[Callable[[str], Optional[gruut.const.Time]]] = None

Parse word text into a Time object or None

post_process_sentence: Optional[gruut.const.PostProcessSentence] = None

Optional function to post-process each sentence in the graph before post_process_graph

pre_process_text: Optional[Callable[[str], str]] = None

Optional function to process text during tokenization

replacements: Sequence[Tuple[Union[str, re.Pattern], str]]

Regex, replacement template pairs that are applied in order right after tokenization on each word

spell_out_words: Dict[str, str]

Written form, spoken form pairs that are applied with interpret-as="spell-out" in <say-as>

split_initialism: Optional[Callable[[str], Sequence[str]]] = None

Function to break apart an initialism into multiple words (called if is_initialism is True)

split_words()

Split text into words and separators

Return type

Iterable[str]

verbalize_time: Optional[Callable[[gruut.const.Time], Iterable[str]]] = None

Convert Time to words

word_breaks: Set[str]

Set of strings that should break apart words

word_breaks_pattern: Optional[Union[str, re.Pattern]] = None

Regex that overrides word_breaks

gruut.get_supported_languages()

Set of supported gruut languages

Return type

Set[str]

gruut.is_language_supported(lang)

True if gruut supports lang

Return type

bool

gruut.sentences(text, lang='en_US', ssml=False, espeak=False, major_breaks=True, minor_breaks=True, punctuations=True, explicit_lang=True, phonemes=True, break_phonemes=True, pos=True, **process_args)

Process text and return sentences

Parameters
  • text (str) – input text or SSML (ssml=True)

  • lang (str) – default language of input text

  • ssml (bool) – True if input text is SSML

  • espeak (bool) – True if eSpeak phonemes should be used

  • major_breaks (bool) – False if sentence-breaking symbols should be excluded from output

  • minor_breaks (bool) – False if phrase-breaking symbols should be excluded from output

  • punctuations (bool) – False if word-surrounding symbols should be excluded from output

  • **process_args – keyword arguments passed to TextProcessor.process

Returns

iterable of Sentence objects

Return type

Iterable[Sentence]