[arch-commits] Commit in libtextcat/trunk (4 files)

andyrtr at archlinux.org andyrtr at archlinux.org
Sat May 7 14:06:49 UTC 2011


    Date: Saturday, May 7, 2011 @ 10:06:48
  Author: andyrtr
Revision: 122957

upgpkg: libtextcat 2.2-8
adopt patches from Fedora/Debian to make it usable for LibreOffice

Added:
  libtextcat/trunk/fpdb.conf
  libtextcat/trunk/libtextcat-2.2-OOo.patch
  libtextcat/trunk/libtextcat-2.2-exportapi.patch
Modified:
  libtextcat/trunk/PKGBUILD

--------------------------------+
 PKGBUILD                       |   23 +
 fpdb.conf                      |   86 +++++
 libtextcat-2.2-OOo.patch       |  634 +++++++++++++++++++++++++++++++++++++++
 libtextcat-2.2-exportapi.patch |  305 ++++++++++++++++++
 4 files changed, 1042 insertions(+), 6 deletions(-)

Modified: PKGBUILD
===================================================================
--- PKGBUILD	2011-05-07 13:26:42 UTC (rev 122956)
+++ PKGBUILD	2011-05-07 14:06:48 UTC (rev 122957)
@@ -6,18 +6,28 @@
 
 pkgname=libtextcat
 pkgver=2.2
-pkgrel=7
+pkgrel=8
 pkgdesc="Library that implements N-gram-based text categorization"
 arch=('i686' 'x86_64')
 url="http://software.wise-guys.nl/libtextcat/"
 license=('BSD')
 depends=('glibc')
 options=(!libtool)
-source=(http://software.wise-guys.nl/download/${pkgname}-${pkgver}.tar.gz)
-md5sums=('128cfc86ed5953e57fe0f5ae98b62c2e')
+source=(http://software.wise-guys.nl/download/${pkgname}-${pkgver}.tar.gz
+	# from http://hg.services.openoffice.org/hg/DEV300/raw-file/tip/libtextcat/data/new_fingerprints/fpdb.conf
+	fpdb.conf
+	libtextcat-2.2-exportapi.patch
+	libtextcat-2.2-OOo.patch)
+md5sums=('128cfc86ed5953e57fe0f5ae98b62c2e'
+         'f4fafe97d3aa184f5476e4918dba045d'
+         '4c46fcb825ec13e9f7ae3728f5f4c834'
+         '1d5f1026392365c58f7a7406e923f886')
 
 build() {
   cd ${srcdir}/${pkgname}-${pkgver}
+  patch -Np1 -i ${srcdir}/libtextcat-2.2-exportapi.patch
+  patch -Np1 -i ${srcdir}/libtextcat-2.2-OOo.patch
+  autoreconf -fi
   ./configure --prefix=/usr \
     --disable-static
   make
@@ -26,9 +36,10 @@
 package() {
   cd ${srcdir}/${pkgname}-${pkgver}
   make DESTDIR=${pkgdir} install
-
-  install -D -m644 src/textcat.h ${pkgdir}/usr/include/textcat.h
-  mkdir -p ${pkgdir}/usr/share/libtextcat/{LM,ShortTexts}
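+  # textcat.h is no longer copied by hand to ${pkgdir}/usr/include/textcat.h: 'make install' ships the headers itself (libtextcat_include_HEADERS in src/Makefile.am, under $(includedir)/libtextcat)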
+  
+  install -dm755 ${pkgdir}/usr/share/libtextcat/{LM,ShortTexts}
+  install -m644 ${srcdir}/fpdb.conf ${pkgdir}/usr/share/libtextcat
   install -m644 langclass/conf.txt ${pkgdir}/usr/share/libtextcat
   install -m644 langclass/LM/*.lm ${pkgdir}/usr/share/libtextcat/LM
   install -m644 langclass/ShortTexts/*.txt ${pkgdir}/usr/share/libtextcat/ShortTexts

Added: fpdb.conf
===================================================================
--- fpdb.conf	                        (rev 0)
+++ fpdb.conf	2011-05-07 14:06:48 UTC (rev 122957)
@@ -0,0 +1,86 @@
+#
+# A sample config file for the language models
+# provided with Gertjan van Noords language guesser
+# (http://odur.let.rug.nl/~vannoord/TextCat/)
+#
+# Notes:
+# - You may consider eliminating a couple of small languages from this
+# list because they cause false positives with big languages and are
+# bad for performance. (Do you really want to recognize Drents?)
+# - Putting the most probable languages at the top of the list
+# improves performance, because this will raise the threshold for
+# likely candidates more quickly.
+#
+
+# this file have been modified (to OOo by Jocelyn MERAND joc.mer at gmail.com) to include country and encoding
+# guess strings are made as following : language-country-encoding
+
+afrikaans.lm                         af--utf8
+albanian.lm                          sq--utf8
+amharic_utf.lm                       am--utf8
+arabic.lm                            ar--utf8
+basque.lm                            eu--utf8
+belarus.lm                           be--utf8
+bosnian.lm                           bs--utf8
+breton.lm                            br--utf8
+catalan.lm                           ca--utf8
+chinese_simplified.lm                zh-CN-utf8
+chinese_traditional.lm               zh-TW-utf8
+croatian.lm                          hr--utf8
+czech.lm                             cs--utf8
+danish.lm                            da--utf8
+dutch.lm                             nl--utf8
+english.lm                           en--utf8
+esperanto.lm                         eo--utf8
+estonian.lm                          et--utf8
+finnish.lm                           fi--utf8
+french.lm                            fr--utf8
+frisian.lm                           fy--utf8
+georgian.lm                          ka--utf8
+german.lm                            de--utf8
+greek.lm                             el--utf8
+hebrew.lm                            he--utf8
+hindi.lm                             hi--utf8
+hungarian.lm                         hu--utf8
+icelandic.lm                         is--utf8
+indonesian.lm                        id--utf8
+irish_gaelic.lm                      ga--utf8
+italian.lm                           it--utf8
+japanese.lm                          ja--utf8
+korean.lm                            ko--utf8
+latin.lm                             la--utf8
+latvian.lm                           lv--utf8
+lithuanian.lm                        lt--utf8
+luxembourgish.lm                     lb--utf8
+malay.lm                             ms--utf8
+manx_gaelic.lm                       gv--utf8
+marathi.lm                           mr--utf8
+mongolian_cyrillic.lm                mn--utf8
+nepali.lm                            ne--utf8
+norwegian.lm                         nb--utf8       # Norwegian (Bokmal)
+persian.lm                           fa--utf8       # Farsi
+polish.lm                            pl--utf8
+portuguese.lm                        pt-PT-utf8
+quechua.lm                           qu--utf8
+romanian.lm                          ro--utf8
+romansh.lm                           rm--utf8
+russian.lm                           ru--utf8
+sanskrit.lm                          sa--utf8
+scots.lm                             sco--utf8
+scots_gaelic.lm                      gd--utf8
+serbian.lm                           sr--utf-8
+serbian-latin.lm                     sh--utf-8
+slovak_ascii.lm                      sk-SK-utf8
+slovenian.lm                         sl--utf8
+spanish.lm                           es--utf8
+swahili.lm                           sw--utf8
+swedish.lm                           sv--utf8
+tagalog.lm                           tl--utf8
+tamil.lm                             ta--utf8
+thai.lm                              th--utf8
+turkish.lm                           tr--utf8
+ukrainian.lm                         uk--utf8
+vietnamese.lm                        vi--utf8
+welsh.lm                             cy--utf8
+yiddish_utf.lm                       yi--utf8
+zulu.lm                              zu--utf8

Added: libtextcat-2.2-OOo.patch
===================================================================
--- libtextcat-2.2-OOo.patch	                        (rev 0)
+++ libtextcat-2.2-OOo.patch	2011-05-07 14:06:48 UTC (rev 122957)
@@ -0,0 +1,634 @@
+diff -ruN libtextcat-2.2.part1/src/constants.h libtextcat-2.2/src/constants.h
+--- libtextcat-2.2.part1/src/constants.h	2007-07-25 10:46:49.000000000 +0100
++++ libtextcat-2.2/src/constants.h	2007-07-25 10:47:25.000000000 +0100
+@@ -39,6 +39,8 @@
+  */
+ #include <limits.h>
+ 
++#define _UTF8_
++
+ #define DESCRIPTION "out of place"
+ 
+ /* Reported matches are those fingerprints with a score less than best
+@@ -59,14 +61,21 @@
+ /* Maximum number of n-grams in a fingerprint */
+ #define MAXNGRAMS  400
+ 
+-/* Maximum size of an n-gram? */
+-#define MAXNGRAMSIZE 5
++/* Maximum number of character of an n-gram? */
++#define MAXNGRAMSYMBOL 5
++
++/* Maximum size of the string representing an n-gram (must be greater than number of symbol) */
++#ifdef _UTF8_
++#define MAXNGRAMSIZE 20
++#else
++#define MAXNGRAMSIZE MAXNGRAMSYMBOL
++#endif
+ 
+ /* Which characters are not acceptable in n-grams? */
+ #define INVALID(c) (isspace((int)c) || isdigit((int)c)) 
+ 
+ /* Minimum size (in characters) for accepting a document */
+-#define MINDOCSIZE  25
++#define MINDOCSIZE  6
+ 
+ /* Maximum penalty for missing an n-gram in fingerprint */
+ #define MAXOUTOFPLACE 400
+@@ -76,4 +85,7 @@
+ 
+ #define MAXSCORE  INT_MAX
+ 
++/* where the fingerprints files are stored */
++#define DEFAULT_FINGERPRINTS_PATH ""
++
+ #endif
+diff -ruN libtextcat-2.2.part1/src/fingerprint.c libtextcat-2.2/src/fingerprint.c
+--- libtextcat-2.2.part1/src/fingerprint.c	2007-07-25 10:46:49.000000000 +0100
++++ libtextcat-2.2/src/fingerprint.c	2007-07-25 10:47:25.000000000 +0100
+@@ -63,6 +63,10 @@
+  * - put table/heap datastructure in a separate file.
+  */
+ 
++#ifndef _UTF8_
++#define _UTF8_
++#endif
++
+ #include "config.h"
+ #include <stdio.h>
+ #ifdef HAVE_STDLIB_H
+@@ -80,10 +84,12 @@
+ #include "wg_mempool.h"
+ #include "constants.h"
+ 
++#include "utf8misc.h"
+ 
+ #define TABLESIZE  (1<<TABLEPOW)
+ #define TABLEMASK  ((TABLESIZE)-1)
+ 
++
+ typedef struct {
+ 
+ 	sint2 rank;
+@@ -134,29 +140,14 @@
+ }
+ 
+ 
+-/* checks if n-gram lex is a prefix of key and of length len */
+-inline int issame( char *lex, char *key, int len )
+-{
+-	int i;
+-	for (i=0; i<len; i++) {
+-		if ( key[i] != lex[i] ) {
+-			return 0;
+-		}
+-	}
+-	if ( lex[i] != 0 ) {
+-		return 0;
+-	}
+-	return 1;
+-}
+-
+ 
+ /* increases frequency of ngram(p,len) */
+-static inline int increasefreq( table_t *t, char *p, int len ) 
+-{	
+-	uint4 hash = simplehash( p, len ) & TABLEMASK;				
++static int increasefreq( table_t *t, char *p, int len )
++{
++	uint4 hash = simplehash( p, len ) & TABLEMASK;
+ 	entry_t *entry = t->table[ hash ];
+-	
+-	while ( entry ) {				
++
++	while ( entry ) {
+ 		if ( issame( entry->str, p, len ) ) {
+ 			/*** Found it! ***/
+ 			entry->cnt++;
+@@ -168,7 +159,7 @@
+ 	}
+ 
+ 	/*** Not found, so create ***/
+-	entry = wgmempool_alloc( t->pool, sizeof(entry_t) );
++        entry = (entry_t*)(wgmempool_alloc( t->pool, sizeof(entry_t) ));
+ 	strcpy( entry->str, p );
+ 	entry->cnt = 1;
+ 
+@@ -181,12 +172,12 @@
+ #if 0
+ 
+ /* looks up ngram(p,len) */
+-static entry_t *findfreq( table_t *t, char *p, int len ) 
+-{	
+-	uint4 hash = simplehash( p, len ) & TABLEMASK;				
++static entry_t *findfreq( table_t *t, char *p, int len )
++{
++	uint4 hash = simplehash( p, len ) & TABLEMASK;
+ 	entry_t *entry = t->table[ hash ];
+-	
+-	while ( entry ) {				
++
++	while ( entry ) {
+ 		if ( issame( entry->str, p, len ) ) {
+ 			return entry;
+ 		}
+@@ -219,7 +210,7 @@
+ #define GREATER(x,y) ((x).cnt > (y).cnt)
+ #define LESS(x,y)    ((x).cnt < (y).cnt)
+ 
+-inline static void siftup( table_t *t, unsigned int child )
++static void siftup( table_t *t, unsigned int child )
+ {
+ 	entry_t *heap = t->heap;
+ 	unsigned int parent = (child-1) >> 1;
+@@ -241,7 +232,7 @@
+ }
+ 
+ 
+-inline static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
++static void siftdown( table_t *t, unsigned int heapsize, uint4 parent )
+ {
+ 	entry_t *heap = t->heap;
+ 	unsigned int child = parent*2 + 1;
+@@ -458,21 +449,27 @@
+ 	return dest;
+ }
+ 
+-
++/**
++* this function extract all n-gram from past buffer and put them into the table "t"
++* [modified] by Jocelyn Merand to accept utf-8 multi-character symbols to be used in OpenOffice
++*/
+ static void createngramtable( table_t *t, const char *buf )
+ {
+ 	char n[MAXNGRAMSIZE+1];
+ 	const char *p = buf;
+ 	int i;
++        int pointer = 0;
+ 
+ 	/*** Get all n-grams where 1<=n<=MAXNGRAMSIZE. Allow underscores only at borders. ***/
+-	for (;;p++) {
++	while(1) {
+ 
+-		const char *q = p;
++     const char *q = &p[pointer];   /*[modified] previously p++ above (for(;;p++)) now, it's pointer wich is increased so we have to get the new pointer on the buffer*/
+ 		char *m = n;
+ 
+ 		/*** First char may be an underscore ***/
+-		*m++ = *q++;
++                int decay = charcopy(q, m); /*[modified] previously *q++ = *m++*/
++                q = &(p[pointer+decay]);    /*[modified] the old copying method do not manage multi-character symbols*/
++                m += decay; /*[modified]*/
+ 		*m = '\0';
+ 
+ 		increasefreq( t, n, 1 );
+@@ -482,19 +479,22 @@
+ 		}
+ 
+ 		/*** Let the compiler unroll this ***/
+-		for ( i=2; i<=MAXNGRAMSIZE; i++) {
++		for ( i=2; i<=MAXNGRAMSYMBOL; i++) {
+ 
+-			*m++ = *q;
++                        decay = charcopy(q, m); /*[modified] like above*/
++                        m += decay;
+ 			*m = '\0';
+ 
+ 			increasefreq( t, n, i );
+ 
+ 			if ( *q == '_' ) break;
+-			q++;
++                        q += decay;
+ 			if ( *q == '\0' ) {
+ 				return;
+ 			}
+ 		}
++
++  pointer = nextcharstart(p,pointer);   /*[modified] p[pointer] must point on the next start of symbol, but whith utf next start is not surely next char*/
+ 	}
+ 	return;
+ }
+diff -ruN libtextcat-2.2.part1/src/fingerprint.h.orig libtextcat-2.2/src/fingerprint.h.orig
+--- libtextcat-2.2.part1/src/fingerprint.h.orig	1970-01-01 01:00:00.000000000 +0100
++++ libtextcat-2.2/src/fingerprint.h.orig	2007-07-25 10:47:22.000000000 +0100
+@@ -0,0 +1,55 @@
++#ifndef _FINGERPRINT_H_
++#define _FINGERPRINT_H_
++/*
++ * Copyright (C) 2003 WiseGuys Internet B.V.
++ *
++ * THE BSD LICENSE
++ * 
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ * 
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 
++ * - Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the
++ * distribution.
++ * 
++ * - Neither the name of the WiseGuys Internet B.V. nor the names of
++ * its contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ * 
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++#include "common.h"
++
++#ifdef __cplusplus
++extern "C" {
++#endif
++
++extern void *fp_Init(const char *name);
++extern void fp_Done( void *handle );
++extern int fp_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams );
++extern int fp_Read( void *handle, const char *fname, int maxngrams );
++extern sint4 fp_Compare( void *cat, void *unknown, int cutoff );
++extern void fp_Show( void *handle );
++extern const char *fp_Name( void *handle );
++extern void fp_Print( void *handle, FILE *fp );
++
++#ifdef __cplusplus
++}
++#endif
++
++#endif
+diff -ruN libtextcat-2.2.part1/src/textcat.c libtextcat-2.2/src/textcat.c
+--- libtextcat-2.2.part1/src/textcat.c	2007-07-25 10:46:49.000000000 +0100
++++ libtextcat-2.2/src/textcat.c	2007-07-25 10:47:25.000000000 +0100
+@@ -74,6 +74,7 @@
+ typedef struct {
+ 
+ 	void **fprint;
++        char *fprint_disable;
+ 	uint4 size;
+ 	uint4 maxsize;
+ 
+@@ -112,11 +113,21 @@
+ 		fp_Done( h->fprint[i] );
+ 	}
+ 	wg_free( h->fprint );
++        wg_free( h->fprint_disable );
+ 	wg_free( h );
+ 
+ }
+ 
+-extern void *textcat_Init( const char *conffile )
++/** Replaces older function */
++extern void *textcat_Init( const char *conffile ){
++    return special_textcat_Init( conffile, DEFAULT_FINGERPRINTS_PATH );
++}
++
++/**
++ * Originaly this function had only one parameter (conffile) it has been modified since OOo use
++ * Basicaly prefix is the directory path where fingerprints are stored
++ */
++extern void *special_textcat_Init( const char *conffile, const char *prefix )
+ {
+ 	textcat_t *h;
+ 	char line[1024];
+@@ -134,11 +145,13 @@
+ 	h->size = 0;
+ 	h->maxsize = 16;
+ 	h->fprint = (void **)wg_malloc( sizeof(void*) * h->maxsize );
++ h->fprint_disable = (char *)wg_malloc( sizeof(char*) * h->maxsize );   /*added to store the state of languages*/
+ 
+ 	while ( wg_getline( line, 1024, fp ) ) {
+ 		char *p;
+ 		char *segment[4];
+-		int res;
++                char finger_print_file_name[512];
++                int res;
+ 
+ 		/*** Skip comments ***/
+ #ifdef HAVE_STRCHR
+@@ -156,17 +169,23 @@
+ 		/*** Ensure enough space ***/
+ 		if ( h->size == h->maxsize ) {
+ 			h->maxsize *= 2;
+-			h->fprint = (void *)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
++			h->fprint = (void **)wg_realloc( h->fprint, sizeof(void*) * h->maxsize );
++                        h->fprint_disable = (char *)wg_realloc( h->fprint_disable, sizeof(char*) * h->maxsize );
+ 		}
+ 
+ 		/*** Load data ***/
+ 		if ((h->fprint[ h->size ] = fp_Init( segment[1] ))==NULL) {
+ 			goto ERROR;
+ 		}
+-		if ( fp_Read( h->fprint[h->size], segment[0], 400 ) == 0 ) {
++                finger_print_file_name[0] = '\0';
++                strcat(finger_print_file_name, prefix);
++                strcat(finger_print_file_name, segment[0]);
++
++                if ( fp_Read( h->fprint[h->size], finger_print_file_name, 400 ) == 0 ) {
+ 			textcat_Done(h);
+ 			goto ERROR;
+-		}		
++		}
++                h->fprint_disable[h->size] = 0xF0;  /*0xF0 is the code for enabled languages, 0x0F is for disabled*/
+ 		h->size++;
+ 	}
+ 
+@@ -203,11 +222,18 @@
+ 		result = _TEXTCAT_RESULT_SHORT;
+ 		goto READY;
+ 	}
+-	
++
+ 	/*** Calculate the score for each category. ***/
+ 	for (i=0; i<h->size; i++) {
+-		int score = fp_Compare( h->fprint[i], unknown, threshold );
+-		candidates[i].score = score;
++                int score;
++                if(h->fprint_disable[i] & 0x0F){    /*if this language is disabled*/
++                    score = MAXSCORE;
++                }
++                else{
++                    score = fp_Compare( h->fprint[i], unknown, threshold );
++                    /*printf("Score for %s : %i\n", fp_Name(h->fprint[i]), score);*/
++                }
++                candidates[i].score = score;
+ 		candidates[i].name = fp_Name( h->fprint[i] );
+ 		if ( score < minscore ) {
+ 			minscore = score;
+diff -ruN libtextcat-2.2.part1/src/textcat.h libtextcat-2.2/src/textcat.h
+--- libtextcat-2.2.part1/src/textcat.h	2007-07-25 10:46:49.000000000 +0100
++++ libtextcat-2.2/src/textcat.h	2007-07-25 10:48:18.000000000 +0100
+@@ -55,10 +54,19 @@
+  * Returns: handle on success, NULL on error. (At the moment, the
+  * only way errors can occur, is when the library cannot read the
+  * conffile, or one of the fingerprint files listed in it.)
++ *
++ * Replace older function (and has exacly the same behaviour)
++ * see below
+  */
+ extern void *textcat_Init( const char *conffile );
+ 
+ /**
++ * Originaly this function had only one parameter (conffile) it has been modified since OOo must be able to load alternativ DB
++ * Basicaly prefix is the directory path where fingerprints are stored
++ */
++extern void *special_textcat_Init( const char *conffile, const char *prefix );
++
++/**
+  * textcat_Done() - Free up resources for handle
+  */
+ extern void textcat_Done( void *handle );
+diff -ruN libtextcat-2.2.part1/src/utf8misc.c libtextcat-2.2/src/utf8misc.c
+--- libtextcat-2.2.part1/src/utf8misc.c	1970-01-01 01:00:00.000000000 +0100
++++ libtextcat-2.2/src/utf8misc.c	2007-07-25 10:48:57.000000000 +0100
+@@ -0,0 +1,132 @@
++/***************************************************************************
++ *   Copyright (C) 2006 by Jocelyn Merand                                  *
++ *   joc.mer at gmail.com                                                     *
++ *                                                                         *
++ * THE BSD LICENSE
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the
++ * distribution.
++ *
++ * - Neither the name of the WiseGuys Internet B.V. nor the names of
++ * its contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ ***************************************************************************/
++
++#ifndef _UTF8_MISC_H_
++#include "utf8misc.h"
++#endif
++
++
++int nextcharstart(const char *str, int position){
++    int pointer = position;
++
++    if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/
++
++        /*then str[pointer] is an escape character*/
++
++    char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count (by bit translation) following characters (only the weightest part)*/
++
++    while(escape_char & ESCAPE_MASK && str[pointer]){/*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/
++        escape_char = escape_char <<1;
++        ++pointer;
++    }
++    }
++    if(str[pointer]){   /*finaly, if we are not on the \0 character, we jump to the next character*/
++        ++pointer;
++    }
++    return pointer;
++}
++
++
++int charcopy(const char *str, char *dest){
++
++    int pointer = 0;
++    if(str[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/
++
++        /*then str[pointer] is an escape character*/
++
++        char escape_char = ((str[pointer] & WEIGHT_MASK) << 1); /*and we use it to count following characters (only the weightest part)*/
++
++        while(escape_char & ESCAPE_MASK && str[pointer]){   /*every step, we move the byte of 1 bit left, when first bit is 0, it's finished*/
++            dest[pointer] = str[pointer];
++            escape_char = escape_char <<1;
++            ++pointer;
++        }
++    }
++    if(str[pointer]){
++        dest[pointer] = str[pointer];
++        ++pointer;
++    }
++
++    return pointer;
++}
++
++
++int issame( char *lex, char *key, int len )
++{
++    /*printf("[%s] prefix of [%s] with length %i", lex, key, len);*/
++    int char_counter = 0;
++    int pointer = 0;
++    while(char_counter < len) {
++
++        if(key[pointer] & ESCAPE_MASK){ /*if the first bit of the current char is 1*/
++
++            /*then key[pointer] is an escap character*/
++
++            char escape_char = ((key[pointer] & WEIGHT_MASK) << 1);     /*and we use it to count (only the weightest part)*/
++
++            while(escape_char & ESCAPE_MASK && key[pointer] == lex[pointer] ){
++                escape_char = escape_char <<1;
++                ++pointer;
++            }
++        }
++        ++char_counter; /*and we are on a new utf8 character*/
++        if ( key[pointer] != lex[pointer] ) {
++            return 0;
++            /*printf(" NO\n", lex, key, len);*/
++        }
++        ++pointer;
++    }
++    if ( lex[pointer] != '\0' ) {
++        return 0;
++        /*printf(" NO\n");*/
++    }
++
++    /*printf(" YES\n");*/
++
++    return 1;
++}
++
++
++extern int utfstrlen(const char* str){
++    int char_counter = 0;
++    int pointer = 0;
++    while(str[pointer]) {
++        pointer = nextcharstart(str, pointer);
++
++        ++char_counter; /*and we are on a new utf8 character*/
++    }
++    return char_counter;
++}
++
+diff -ruN libtextcat-2.2.part1/src/utf8misc.h libtextcat-2.2/src/utf8misc.h
+--- libtextcat-2.2.part1/src/utf8misc.h	1970-01-01 01:00:00.000000000 +0100
++++ libtextcat-2.2/src/utf8misc.h	2007-07-25 10:48:57.000000000 +0100
+@@ -0,0 +1,88 @@
++/***************************************************************************
++ *   Copyright (C) 2006 by Jocelyn Merand                                  *
++ *   joc.mer at gmail.com                                                     *
++ *                                                                         *
++ * THE BSD LICENSE
++ *
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ *
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ *
++ * - Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the
++ * distribution.
++ *
++ * - Neither the name of the WiseGuys Internet B.V. nor the names of
++ * its contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ *
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ ***************************************************************************/
++
++#ifndef _UTF8_MISC_H_
++#define _UTF8_MISC_H_
++
++/**
++ * These variables are used in character processing functions
++ * These have been added to manage utf-8 symbols, particularly escape chars
++ */
++#ifdef _UTF8_
++#define ESCAPE_MASK 0x80
++#define WEIGHT_MASK 0xF0
++#else
++#define ESCAPE_MASK 0xFF
++#define WEIGHT_MASK 0x00
++#endif
++
++
++/*
++ * Is used to jump to the next start of char
++ * of course it's only usefull when encoding is utf-8
++ * This function have been added by Jocelyn Merand to use libtextcat in OOo
++ */
++int nextcharstart(const char *str, int position);
++
++
++/*Copy the char in str to dest
++ * of course it's only usefull when encoding is utf8 and the symbol is encoded with more than 1 char
++ * return the number of char jumped
++ * This function have been added by Jocelyn Merand to use libtextcat in OOo
++ */
++int charcopy(const char *str, char *dest);
++
++
++/* checks if n-gram lex is a prefix of key and of length len
++* if _UTF8_ is defined, it uses escap characters and len is not realy the length of lex
++* in this case, len is the number of utf-8 char strlen("€") == 3 but len == 1
++*/
++int issame( char *lex, char *key, int len );
++
++
++/* Counts the number of characters
++* if _UTF8_ is defined, it uses escap characters and the result is not realy the length of str
++* in this case, the result is the number of utf-8 char strlen("€") == 3 but utfstrlen("€") == 1
++*/
++#ifdef __cplusplus
++extern "C" {
++#endif
++extern int utfstrlen(const char* str);
++#ifdef __cplusplus
++}
++#endif
++
++#endif
++
+--- libtextcat-2.2.part2/src/Makefile.am	2007-07-25 10:55:02.000000000 +0100
++++ libtextcat-2.2/src/Makefile.am	2007-07-25 10:55:52.000000000 +0100
+@@ -12,11 +12,11 @@
+ 
+ libtextcat_includedir = $(includedir)/libtextcat
+ libtextcat_include_HEADERS = \
+-	common.h constants.h fingerprint.h textcat.h
++	common.h constants.h fingerprint.h textcat.h utf8misc.h
+ 
+ lib_LTLIBRARIES =	libtextcat.la
+ libtextcat_la_SOURCES = \
+-	common.c fingerprint.c textcat.c wg_mempool.c
++	common.c fingerprint.c textcat.c wg_mempool.c utf8misc.c
+ 
+ bin_PROGRAMS =		createfp
+ createfp_SOURCES =	createfp.c

Added: libtextcat-2.2-exportapi.patch
===================================================================
--- libtextcat-2.2-exportapi.patch	                        (rev 0)
+++ libtextcat-2.2-exportapi.patch	2011-05-07 14:06:48 UTC (rev 122957)
@@ -0,0 +1,305 @@
+diff -ruN libtextcat-2.2.orig/src/common.c libtextcat-2.2/src/common.c
+--- libtextcat-2.2.orig/src/common.c	2007-06-27 17:02:34.000000000 +0100
++++ libtextcat-2.2/src/common.c	2007-06-27 17:45:16.000000000 +0100
+@@ -45,7 +45,7 @@
+ #endif
+ #include <stdarg.h>
+ #include <ctype.h>
+-#include "common.h"
++#include "common_impl.h"
+ 
+ extern void wgmem_error( const char *fmt, ... )
+ {
+@@ -55,8 +55,6 @@
+         va_start(ap, fmt);
+         vfprintf(stderr, fmt, ap);
+         va_end(ap);
+-
+-	exit(-1);
+ }
+ 
+ 
+diff -ruN libtextcat-2.2.orig/src/common_impl.h libtextcat-2.2/src/common_impl.h
+--- libtextcat-2.2.orig/src/common_impl.h	1970-01-01 01:00:00.000000000 +0100
++++ libtextcat-2.2/src/common_impl.h	2007-06-27 17:45:16.000000000 +0100
+@@ -0,0 +1,66 @@
++#ifndef _COMMON_IMPL_H_
++#define _COMMON_IMPL_H_
++/**
++ * common_impl.h -- a mixed bag of helper functions 
++ *
++ * Copyright (C) 2003 WiseGuys Internet B.V.
++ *
++ * THE BSD LICENSE
++ * 
++ * Redistribution and use in source and binary forms, with or without
++ * modification, are permitted provided that the following conditions
++ * are met:
++ * 
++ * - Redistributions of source code must retain the above copyright
++ * notice, this list of conditions and the following disclaimer.
++ * 
++ * - Redistributions in binary form must reproduce the above copyright
++ * notice, this list of conditions and the following disclaimer in the
++ * documentation and/or other materials provided with the
++ * distribution.
++ * 
++ * - Neither the name of the WiseGuys Internet B.V. nor the names of
++ * its contributors may be used to endorse or promote products derived
++ * from this software without specific prior written permission.
++ * 
++ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
++ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
++ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
++ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
++ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
++ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
++ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
++ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
++ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
++ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
++ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
++ */
++
++#include "config.h"
++#ifndef HAVE_MALLOC
++#error "This library needs a GNU like malloc to compile. 'configure' says there isn't one."
++#endif
++#ifndef HAVE_REALLOC
++#error "This library needs a GNU like realloc to compile. 'configure' says there isn't one."
++#endif
++#ifndef HAVE_STRDUP
++#error "This library needs a GNU like strdup to compile. 'configure' says there isn't one."
++#endif
++#ifdef HAVE_INTTYPES_H
++#include <inttypes.h>
++#else
++#ifdef HAVE_STDINT_H
++#include <stdint.h>
++#endif
++#endif
++#ifdef HAVE_SYS_TIME_H
++#include <sys/time.h>
++#endif
++#include "common.h"
++
++#define WGMIN(x,y)         ((x)<=(y)?(x):(y))
++#define WGMAX(x,y)         ((x)<=(y)?(y):(x))
++#define __STR__(x)         #x
++#define WGSTR(x)           __STR__(x)
++
++#endif
+diff -ruN libtextcat-2.2.orig/src/createfp.c libtextcat-2.2/src/createfp.c
+--- libtextcat-2.2.orig/src/createfp.c	2007-06-27 17:02:34.000000000 +0100
++++ libtextcat-2.2/src/createfp.c	2007-06-27 17:45:16.000000000 +0100
+@@ -44,7 +44,7 @@
+ #endif
+ 
+ #include "fingerprint.h"
+-#include "common.h"
++#include "common_impl.h"
+ 
+ #define BLOCKSIZE 4096
+ 
+diff -ruN libtextcat-2.2.orig/src/fingerprint.c libtextcat-2.2/src/fingerprint.c
+--- libtextcat-2.2.orig/src/fingerprint.c	2007-06-27 17:02:34.000000000 +0100
++++ libtextcat-2.2/src/fingerprint.c	2007-06-27 17:45:16.000000000 +0100
+@@ -76,7 +76,7 @@
+ #endif
+ #include <ctype.h>
+ 
+-#include "common.h"
++#include "common_impl.h"
+ #include "wg_mempool.h"
+ #include "constants.h"
+ 
+diff -ruN libtextcat-2.2.orig/src/Makefile.am libtextcat-2.2/src/Makefile.am
+--- libtextcat-2.2.orig/src/Makefile.am	2007-06-27 17:02:34.000000000 +0100
++++ libtextcat-2.2/src/Makefile.am	2007-06-27 17:47:40.000000000 +0100
+@@ -8,7 +8,11 @@
+ AM_LDFLAGS =	-g
+ 
+ noinst_HEADERS = \
+-	common.h constants.h fingerprint.h textcat.h wg_mempool.h
++	common_impl.h wg_mempool.h
++
++libtextcat_includedir = $(includedir)/libtextcat
++libtextcat_include_HEADERS = \
++	common.h constants.h fingerprint.h textcat.h
+ 
+ lib_LTLIBRARIES =	libtextcat.la
+ libtextcat_la_SOURCES = \
+diff -ruN libtextcat-2.2.orig/src/testtextcat.c libtextcat-2.2/src/testtextcat.c
+--- libtextcat-2.2.orig/src/testtextcat.c	2007-06-27 17:02:34.000000000 +0100
++++ libtextcat-2.2/src/testtextcat.c	2007-06-27 17:45:16.000000000 +0100
+@@ -47,7 +47,7 @@
+ #endif
+ 
+ #include "textcat.h"
+-#include "common.h"
++#include "common_impl.h"
+ 
+ #define BLOCKSIZE 4096
+ 
+diff -ruN libtextcat-2.2.orig/src/textcat.c libtextcat-2.2/src/textcat.c
+--- libtextcat-2.2.orig/src/textcat.c	2007-06-27 17:02:34.000000000 +0100
++++ libtextcat-2.2/src/textcat.c	2007-06-27 17:45:16.000000000 +0100
+@@ -65,7 +65,7 @@
+ #include <alloca.h>
+ #endif
+ 
+-#include "common.h"
++#include "common_impl.h"
+ #include "fingerprint.h"
+ #include "textcat.h"
+ #include "constants.h"
+diff -ruN libtextcat-2.2.orig/src/wg_mempool.c libtextcat-2.2/src/wg_mempool.c
+--- libtextcat-2.2.orig/src/wg_mempool.c	2007-06-27 17:02:34.000000000 +0100
++++ libtextcat-2.2/src/wg_mempool.c	2007-06-27 17:45:16.000000000 +0100
+@@ -41,7 +41,7 @@
+ #ifdef HAVE_STRING_H
+ #include <string.h>
+ #endif
+-#include "common.h"
++#include "common_impl.h"
+ 
+ typedef struct memblock_s {
+ 	char *pool;             
+diff -ru libtextcat-2.2.orig/src/common.h libtextcat-2.2/src/common.h
+--- libtextcat-2.2.orig/src/common.h	2003-05-22 14:02:29.000000000 +0100
++++ libtextcat-2.2/src/common.h	2007-06-28 09:10:42.000000000 +0100
+@@ -1,7 +1,7 @@
+ #ifndef _COMMON_H_
+ #define _COMMON_H_
+ /**
+- * common.h -- a mixed bag of helper functions 
++ * common.h
+  *
+  * Copyright (C) 2003 WiseGuys Internet B.V.
+  *
+@@ -36,56 +36,25 @@
+  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  */
+ 
+-#include "config.h"
+-#ifndef HAVE_MALLOC
+-#error "This library needs a GNU like malloc to compile. 'configure' says there isn't one."
+-#endif
+-#ifndef HAVE_REALLOC
+-#error "This library needs a GNU like realloc to compile. 'configure' says there isn't one."
+-#endif
+-#ifndef HAVE_STRDUP
+-#error "This library needs a GNU like strdup to compile. 'configure' says there isn't one."
+-#endif
+ #include <stdio.h>
+-#ifdef HAVE_INTTYPES_H
+ #include <inttypes.h>
+-#else
+-#ifdef HAVE_STDINT_H
+-#include <stdint.h>
+-#endif
+-#endif
+-#ifdef HAVE_SYS_TIME_H
+-#include <sys/time.h>
+-#endif
+ #include <time.h>
+ 
+-#define WGMIN(x,y)         ((x)<=(y)?(x):(y))
+-#define WGMAX(x,y)         ((x)<=(y)?(y):(x))
+-#define __STR__(x)         #x
+-#define WGSTR(x)           __STR__(x)
+-
+-#ifdef HAVE_INTTYPES_H
+-typedef uint32_t	uint4;
+-typedef uint16_t	uint2;
+-typedef uint8_t		uchar;
+-
+-typedef int32_t		sint4;
+-typedef int16_t		sint2;
+-typedef int8_t		schar;
+-
+-typedef int8_t		boole;
+-#else
+-typedef unsigned long	uint4;
+-typedef unsigned int	uint2;
+-typedef unsigned char	uchar;
+-
+-typedef long		sint4;
+-typedef int		sint2;
+-typedef char		schar;
+-
+-typedef char		boole;
++#include <sys/time.h>
++#ifdef __cplusplus
++extern "C" {
+ #endif
+ 
++typedef uint32_t        uint4;
++typedef uint16_t        uint2;
++typedef uint8_t         uchar;
++
++typedef int32_t         sint4;
++typedef int16_t         sint2;
++typedef int8_t          schar;
++
++typedef int8_t          boole;
++
+ typedef struct wgtimer_s {
+         struct timeval start;
+         struct timeval stop;
+@@ -108,6 +76,8 @@
+ extern char *wg_strgmov( char *dest, const char *src, const char *destlimit );
+ extern char *wg_trim( char *dest, const char *src );
+ 
++#ifdef __cplusplus
++}
++#endif
+       
+ #endif
+-
+diff -ru libtextcat-2.2.orig/src/fingerprint.h libtextcat-2.2/src/fingerprint.h
+--- libtextcat-2.2.orig/src/fingerprint.h	2003-05-19 13:16:31.000000000 +0100
++++ libtextcat-2.2/src/fingerprint.h	2007-06-28 09:11:17.000000000 +0100
+@@ -35,6 +35,10 @@
+  */
+ #include "common.h"
+ 
++#ifdef __cplusplus
++extern "C" {
++#endif
++
+ extern void *fp_Init(const char *name);
+ extern void fp_Done( void *handle );
+ extern int fp_Create( void *handle, const char *buffer, uint4 bufsize, uint4 maxngrams );
+@@ -44,4 +48,8 @@
+ extern const char *fp_Name( void *handle );
+ extern void fp_Print( void *handle, FILE *fp );
+ 
++#ifdef __cplusplus
++}
++#endif
++
+ #endif
+--- libtextcat-2.2.orig/src/textcat.h	2007-06-28 09:19:26.000000000 +0100
++++ libtextcat-2.2/src/textcat.h	2007-06-28 09:20:19.000000000 +0100
+@@ -37,6 +37,10 @@
+  */
+ #include <stdio.h>
+ 
++#ifdef __cplusplus
++extern "C" {
++#endif
++
+ #define _TEXTCAT_RESULT_UNKOWN        "UNKNOWN"
+ #define _TEXTCAT_RESULT_SHORT         "SHORT"
+ 
+@@ -77,4 +81,9 @@
+  * textcat_Version() - Returns a string describing the version of this classifier.
+  */
+ extern char *textcat_Version();
++
++#ifdef __cplusplus
++}
++#endif
++
+ #endif




More information about the arch-commits mailing list