Internals of Text Processing

Generic text processing engine.

Support for text processing:

  • converting to UTF-8

  • converting from UTF-8

  • stripping

  • splitting

  • concatenating

  • text_cvt

  • seq_val_t

Define:

1. NUMBER_OF_CHARS:

1. Yap_strlen:

1. Yap_chtype:

1. __android_log_print:

1. YAP_TYPE_MASK:

Functions:

1. char_kind_t Yap_wide_chtype(int ch):

< Other, not assigned

< Letter, uppercase

< Letter, lowercase

< Letter, titlecase

< Letter, modifier

< Letter, other

< Mark, nonspacing

< Mark, spacing combining

< Mark, enclosing

< Number, decimal digit

< Number, letter

< Number, other

< Punctuation, connector

< Punctuation, dash

< Punctuation, open

< Punctuation, close

< Punctuation, initial quote

< Punctuation, final quote

< Punctuation, other

< Symbol, math

< Symbol, currency

< Symbol, modifier

unsure in YAP, let's assume a,c is treated as aƧ

< Symbol, other

< Separator, space

< Separator, line

< Separator, paragraph

< Other, control

< Other, format

< Other, surrogate

< Other, private use

1. static char_kind_t chtype(Int ch):

1. const char * Yap_tokText(void *tokptr):

1. Term Yap_tokRep(void *tokptr): : represent token *tokptr in string s, maxlength is sz-1

conversion is based on token type.

represent token *tokptr in string s, maxlength is sz-1

1. Term Yap_tokFullRep(void *tokptr):

1. static seq_type_t mod_to_type(int quote, Term mod USES_REGS):

1. static seq_type_t Yap_TextType(Term t):

1. unsigned char * Yap_readText(seq_tv_t *inp USES_REGS):

1. bool write_Text(unsigned char inp, seq_tv_t out USES_REGS):

1. bool Yap_CVT_Text(seq_tv_t inp, seq_tv_t out USES_REGS):

1. bool Yap_Concat_Text(int n, seq_tv_t inp[], seq_tv_t *out USES_REGS):

1. bool Yap_Splice_Text(int n, ssize_t cuts[], seq_tv_t *inp, seq_tv_t outv[] USES_REGS):

1. unsigned char * Yap_ListOfCodesToBuffer(unsigned char buf, Term t, seq_tv_t inp USES_REGS):

1. unsigned char * Yap_ListOfCharsToBuffer(unsigned char buf, Term t, seq_tv_t inp USES_REGS):

1. static Atom Yap_AtomicToLowAtom(Term t0 USES_REGS):

1. static Atom Yap_AtomicToUpAtom(Term t0 USES_REGS):

1. static Term Yap_AtomicToLowString(Term t0 USES_REGS):

1. static Term Yap_AtomicToUpString(Term t0 USES_REGS):

1. static Term Yap_AtomicToLowListOfCodes(Term t0 USES_REGS):

1. static Term Yap_AtomicToUpListOfCodes(Term t0 USES_REGS):

1. static Term Yap_AtomicToLowListOfAtoms(Term t0 USES_REGS):

1. static Term Yap_AtomicToUpListOfAtoms(Term t0 USES_REGS):

1. static size_t Yap_AtomicToUnicodeLength(Term t0 USES_REGS):

1. static Term Yap_AtomicToListOfAtoms(Term t0 USES_REGS):

1. static Term Yap_AtomicToListOfCodes(Term t0 USES_REGS):

1. static Atom Yap_AtomicToAtom(Term t0 USES_REGS):

1. static size_t Yap_AtomToLength(Term t0 USES_REGS):

1. static size_t Yap_AtomToUnicodeLength(Term t0 USES_REGS):

1. static Term Yap_AtomToListOfAtoms(Term t0 USES_REGS):

1. static Term Yap_AtomSWIToListOfAtoms(Term t0 USES_REGS):

1. static Term Yap_AtomToListOfCodes(Term t0 USES_REGS):

1. static Term Yap_AtomSWIToListOfCodes(Term t0 USES_REGS):

1. static Term Yap_AtomToNumber(Term t0 USES_REGS):

1. static Term Yap_AtomToString(Term t0 USES_REGS):

1. static Term Yap_AtomSWIToString(Term t0 USES_REGS):

1. static Term Yap_AtomicToString(Term t0 USES_REGS):

1. static wchar_t * Yap_AtomToWide(Atom at USES_REGS):

1. static Term Yap_AtomicToTBQ(Term t0, Term mod USES_REGS):

1. static Atom Yap_CharsToAtom(const char *s, encoding_t enc USES_REGS):

1. static Term Yap_CharsToListOfAtoms(const char *s, encoding_t enc USES_REGS):

1. static Term Yap_CharsToListOfCodes(const char *s, encoding_t enc USES_REGS):

1. static Term Yap_UTF8ToListOfCodes(const char *s USES_REGS):

1. static Atom Yap_UTF8ToAtom(const unsigned char *s USES_REGS):

1. static Term Yap_CharsToDiffListOfCodes(const char *s, Term tail, encoding_t enc USES_REGS):

1. static Term Yap_UTF8ToDiffListOfCodes(const unsigned char *s, Term tail USES_REGS):

1. static Term Yap_UTF8ToDiffListOfChars(const unsigned char *s, Term tail USES_REGS):

1. static Term Yap_WCharsToDiffListOfCodes(const wchar_t *s, Term tail USES_REGS):

1. static Term Yap_CharsToString(const char *s, encoding_t enc USES_REGS):

1. static char * Yap_AtomToUTF8Text(Atom at USES_REGS):

1. static Term Yap_QuotedToTerm(int quote, const char *s, Term mod, encoding_t enc USES_REGS):

1. static Atom Yap_ListOfAtomsToAtom(Term t0 USES_REGS):

1. static Term Yap_ListOfAtomsToNumber(Term t0 USES_REGS):

1. static Term Yap_ListOfAtomsToString(Term t0 USES_REGS):

1. static Atom Yap_ListOfCodesToAtom(Term t0 USES_REGS):

1. static Term Yap_ListOfCodesToNumber(Term t0 USES_REGS):

1. static Term Yap_ListOfCodesToString(Term t0 USES_REGS):

1. static Atom Yap_ListToAtom(Term t0 USES_REGS):

1. static Term Yap_ListToAtomic(Term t0 USES_REGS):

1. static Term Yap_ListToNumber(Term t0 USES_REGS):

1. static Term Yap_ListToString(Term t0 USES_REGS):

1. static Term Yap_ListSWIToString(Term t0 USES_REGS):

1. static Atom Yap_NCharsToAtom(const char *s, size_t len, encoding_t enc USES_REGS):

1. static Term Yap_CharsToDiffListOfAtoms(const char *s, encoding_t enc, Term tail USES_REGS):

1. static Term Yap_NCharsToListOfCodes(const char *s, size_t len, encoding_t enc USES_REGS):

1. static Term Yap_NCharsToString(const char *s, size_t len, encoding_t enc USES_REGS):

1. static Atom Yap_NumberToAtom(Term t0 USES_REGS):

1. static Term Yap_NumberToListOfAtoms(Term t0 USES_REGS):

1. static Term Yap_NumberToListOfCodes(Term t0 USES_REGS):

1. static Term Yap_NumberToString(Term t0 USES_REGS):

1. static Atom Yap_NWCharsToAtom(const wchar_t *s, size_t len USES_REGS):

1. static Term Yap_NWCharsToListOfAtoms(const wchar_t *s, size_t len USES_REGS):

1. static Term Yap_NWCharsToListOfCodes(const wchar_t *s, size_t len USES_REGS):

1. static Term Yap_NWCharsToString(const wchar_t *s, size_t len USES_REGS):

1. static Atom Yap_StringToAtom(Term t0 USES_REGS):

1. static Atom Yap_StringSWIToAtom(Term t0 USES_REGS):

1. static Term Yap_StringToAtomic(Term t0 USES_REGS):

1. static size_t Yap_StringToUnicodeLength(Term t0 USES_REGS):

1. static size_t Yap_StringToListOfAtoms(Term t0 USES_REGS):

1. static size_t Yap_StringSWIToListOfAtoms(Term t0 USES_REGS):

1. static size_t Yap_StringToListOfCodes(Term t0 USES_REGS):

1. static size_t Yap_StringSWIToListOfCodes(Term t0 USES_REGS):

1. static Term Yap_StringToNumber(Term t0 USES_REGS):

1. static Atom Yap_TextToAtom(Term t0 USES_REGS):

1. static Term Yap_TextToString(Term t0 USES_REGS):

1. static void Yap_OverwriteUTF8BufferToLowCase(void *buf USES_REGS):

1. static char * Yap_TextTermToText(Term t0 USES_REGS): : Function to convert a generic text term (string, atom, list of codes, list of atoms) into a buffer.

t: the term; buf: the buffer (if NULL, a buffer is malloced and the user should reclaim it); len: buffer size; enc: encoding (UTF-8 is strongly recommended). return:
the buffer, or NULL in case of failure. If so, Yap_Error may be called.

Note: it must be called from within a push-memory context.

1. static const unsigned char * Yap_TextToUTF8Buffer(Term t0 USES_REGS):

1. static Term Yap_UTF8ToString(const char *s USES_REGS):

1. static Atom UTF32ToAtom(const wchar_t *s USES_REGS):

1. static Term Yap_WCharsToListOfCodes(const wchar_t *s USES_REGS):

1. static Term Yap_WCharsToString(const wchar_t *s USES_REGS):

1. static Atom Yap_ConcatAtoms(Term t1, Term t2 USES_REGS):

1. static Atom Yap_ConcatAtomics(Term t1, Term t2 USES_REGS):

1. static Term Yap_ConcatStrings(Term t1, Term t2 USES_REGS):

1. static Atom Yap_SpliceAtom(Term t1, Atom ats[], size_t cut, size_t max USES_REGS):

1. static Atom Yap_SubtractHeadAtom(Term t1, Term th USES_REGS):

1. static Atom Yap_SubtractTailAtom(Term t1, Term th USES_REGS):

1. static Term Yap_SpliceString(Term t1, Term ts[], size_t cut, size_t max USES_REGS):

1. static Term Yap_SubtractHeadString(Term t1, Term th USES_REGS):

1. static Term Yap_SubtractTailString(Term t1, Term th USES_REGS):

1. static Term Yap_MkTextTerm(const char *s, seq_type_t guide USES_REGS): : Convert from a text buffer (8-bit) to a term that has the same type as Tguide

s: the buffer; tguide: the guide

return:
the term

Var:

1. char* Yap_chtype:

1. char_kind_t Yap_chtype0[][]:

Typedef:

1. typedef UInt seq_type_t:

1. typedef struct text_cvt seq_tv_t:

Enum:

1. char_kind_t:

1. enum_seq_type_t: