File: wstr.c

package info (click to toggle)
cook 2.19-2
links: PTS
area: main
in suites: woody
size: 7,316 kB
ctags: 3,969
sloc: ansic: 50,331; sh: 13,190; makefile: 4,542; yacc: 3,174; perl: 224; awk: 154
file content (906 lines) | stat: -rw-r--r-- 17,568 bytes
parent folder | download | duplicates (3)
/*
 *	cook - file construction tool
 *	Copyright (C) 1997, 1999, 2001 Peter Miller;
 *	All rights reserved.
 *
 *	This program is free software; you can redistribute it and/or modify
 *	it under the terms of the GNU General Public License as published by
 *	the Free Software Foundation; either version 2 of the License, or
 *	(at your option) any later version.
 *
 *	This program is distributed in the hope that it will be useful,
 *	but WITHOUT ANY WARRANTY; without even the implied warranty of
 *	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *	GNU General Public License for more details.
 *
 *	You should have received a copy of the GNU General Public License
 *	along with this program; if not, write to the Free Software
 *	Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111, USA.
 *
 *
 * MANIFEST: wide string manipulation functions
 *
 * A literal pool is maintained.  Each string has a reference count.  The
 * string stays in the literal pool for as long as it has a positive
 * reference count.  To determine if a string is already in the literal pool,
 * linear dynamic hashing is used to guarantee an O(1) search.  Making all equal
 * strings the same item in the literal pool means that string equality is
 * a pointer test, and thus very fast.
 */

#include <ac/limits.h>
#include <ac/stdarg.h>
#include <ac/stddef.h>
#include <ac/stdio.h>
#include <ac/stdlib.h>
#include <ac/string.h>
#include <ac/wchar.h>
#include <ac/wctype.h>

#include <error.h>
#include <language.h>
#include <mem.h>
#include <str.h>
#include <trace.h> /* for assert */
#include <wstr.h>


static	wstring_ty	**hash_table;
static	wstr_hash_ty	hash_modulus;
static	wstr_hash_ty	hash_cutover;
static	wstr_hash_ty	hash_cutover_mask;
static	wstr_hash_ty	hash_cutover_split_mask;
static	wstr_hash_ty	hash_split;
static	wstr_hash_ty	hash_load;
static	int		changed;

#define MAX_HASH_LEN 20


/*
 * NAME
 *	hash_generate - hash string to number
 *
 * SYNOPSIS
 *	wstr_hash_ty hash_generate(wchar_t *s, size_t n);
 *
 * DESCRIPTION
 *	The hash_generate function is used to make a number from a string.
 *
 * RETURNS
 *	wstr_hash_ty - the magic number
 *
 * CAVEAT
 *	Only the last MAX_HASH_LEN characters are used.
 *	It is important that wstr_hash_ty be unsigned (int or long).
 */

static wstr_hash_ty hash_generate _((const wchar_t *, size_t));

static wstr_hash_ty
hash_generate(s, n)
	const wchar_t	*s;
	size_t		n;
{
	wstr_hash_ty	retval;

	if (n > MAX_HASH_LEN)
	{
		s += n - MAX_HASH_LEN;
		n = MAX_HASH_LEN;
	}

	retval = 0;
	while (n > 0)
	{
		retval = (retval + (retval << 1)) ^ *s++;
		--n;
	}
	return retval;
}


/*
 * NAME
 *	wstr_initialize - start up string table
 *
 * SYNOPSIS
 *	void wstr_initialize(void);
 *
 * DESCRIPTION
 *	The wstr_initialize function is used to create the hash table and
 *	initialize it to empty.
 *
 * RETURNS
 *	void
 *
 * CAVEAT
 *	This function must be called before any other defined in this file.
 */

static void wstr_initialize _((void));

static void
wstr_initialize()
{
	wstr_hash_ty	j;

	if (hash_modulus)
		return;
	hash_modulus = 1<<8; /* MUST be a power of 2 */
	hash_cutover = hash_modulus;
	hash_split = hash_modulus - hash_cutover;
	hash_cutover_mask = hash_cutover - 1;
	hash_cutover_split_mask = (hash_cutover * 2) - 1;
	hash_load = 0;
	hash_table =
		(wstring_ty **)mem_alloc(hash_modulus * sizeof(wstring_ty *));
	for (j = 0; j < hash_modulus; ++j)
		hash_table[j] = 0;
}


/*
 * NAME
 *	split - reduce table loading
 *
 * SYNOPSIS
 *	void split(void);
 *
 * DESCRIPTION
 *	The split function is used to reduce the load factor on the hash table.
 *
 * RETURNS
 *	void
 *
 * CAVEAT
 *	A load factor of about 80% is suggested.
 */

static void split _((void));

static void
split()
{
	wstring_ty	*p;
	wstring_ty	*p2;
	wstr_hash_ty	idx;

	/*
	 * get the list to be split across buckets
	 */
	p = hash_table[hash_split];
	hash_table[hash_split] = 0;

	/*
	 * increase the modulus by one
	 */
	hash_modulus++;
	hash_table =
		mem_change_size
		(
			hash_table,
			hash_modulus * sizeof(wstring_ty *)
		);
	hash_table[hash_modulus - 1] = 0;
	hash_split = hash_modulus - hash_cutover;
	if (hash_split >= hash_cutover)
	{
		hash_cutover = hash_modulus;
		hash_split = 0;
		hash_cutover_mask = hash_cutover - 1;
		hash_cutover_split_mask = (hash_cutover * 2) - 1;
	}

	/*
	 * now redistribute the list elements
	 */
	while (p)
	{
		p2 = p;
		p = p->wstr_next;

		idx = p2->wstr_hash & hash_cutover_mask;
		if (idx < hash_split)
			idx = p2->wstr_hash & hash_cutover_split_mask;
		p2->wstr_next = hash_table[idx];
		hash_table[idx] = p2;
	}
}


/*
 * NAME
 *	wstr_from_c - make string from C string
 *
 * SYNOPSIS
 *	wstring_ty *wstr_from_c(char *);
 *
 * DESCRIPTION
 *	The wstr_from_c function is used to make a string from a NUL
 *	terminated C string.  The conversion from multi-byte to wide
 *	characters is done in the current locale.
 *
 * RETURNS
 *	wstring_ty* - a pointer to a string in dynamic memory.  Use
 *	wstr_free when finished with.
 *
 * CAVEAT
 *	The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_from_c(s)
	const char	*s;
{
	return wstr_n_from_c(s, strlen(s));
}


/*
 * NAME
 *	wstr_from_wc - make string from a wide C string
 *
 * SYNOPSIS
 *	wstring_ty *wstr_from_wc(wchar_t *);
 *
 * DESCRIPTION
 *	The wstr_from_c function is used to make a string from a NUL
 *	terminated wide C string.
 *
 * RETURNS
 *	wstring_ty* - a pointer to a string in dynamic memory.  Use
 *	wstr_free when finished with.
 *
 * CAVEAT
 *	The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_from_wc(ws)
	const wchar_t	*ws;
{
	return wstr_n_from_wc(ws, wcslen(ws));
}


/*
 * NAME
 *	wstr_n_from_c - make string
 *
 * SYNOPSIS
 *	wstring_ty *wstr_n_from_c(char *s, size_t n);
 *
 * DESCRIPTION
 *	The wstr_n_from_c function is used to make a string from an
 *	array of characters.  No NUL terminator is assumed.  The
 *	conversion from muti-byte to wide characters is done in the
 *	current locale.
 *
 * RETURNS
 *	wstring_ty* - a pointer to a string in dynamic memory.  Use
 *	wstr_free when finished with.
 *
 * CAVEAT
 *	The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_n_from_c(s, length)
	const char	*s;
	size_t		length;
{
#if __STDC__ >= 1
	static char	escapes[] = "\aa\bb\ff\nn\rr\tt\vv";
#else
	static char	escapes[] = "\bb\ff\nn\rr\tt";
#endif
	static wchar_t	*buf;
	static size_t	bufmax;
	size_t		remainder;
	const char	*ip;
	wchar_t		*op;

	/*
	 * Do the conversion "long hand".  This is because some
	 * implementations of the mbstowcs function barf when they see
	 * invalid multi byte character sequences.  This function
	 * renders them as C escape sequences and keeps going.
	 */
	if (bufmax < length)
	{
		bufmax = length;
		/* the 4 is the longest escape sequence */
		buf = mem_change_size(buf, bufmax * sizeof(wchar_t) * 4);
	}

	/*
	 * change the locale to the native language default
	 */
	language_human();

	/*
	 * Reset the mbtowc internal state.
	 */
	mbtowc((wchar_t *)0, (char *)0, 0);

	/*
	 * scan the string and extract the wide characters
	 */
	ip = s;
	op = buf;
	remainder = length;
	while (remainder > 0)
	{
		int		n;

		n = mbtowc(op, ip, remainder);
		if (n == 0)
			break;
		if (n < 0)
		{
			char	*esc;

			/*
			 * Invalid multi byte sequence, replace the
			 * first character with a C escape sequence.
			 */
			esc = strchr(escapes, *ip);
			if (esc)
			{
				*op++ = '\\';
				*op++ = esc[1];
			}
			else
			{
				*op++ = '\\';
				*op++ = '0' + ((*ip >> 6) & 7);
				*op++ = '0' + ((*ip >> 3) & 7);
				*op++ = '0' + ( *ip       & 7);
			}
			++ip;
			--remainder;

			/*
			 * The mbtowc function's internal state will now
			 * be "error" or broken, or otherwise useless.
			 * Reset it so that we can keep going.
			 */
			mbtowc((wchar_t *)0, (char *)0, 0);
		}
		else
		{
			/*
			 * the one wchar_t used n chars
			 */
			ip += n;
			remainder -= n;
			++op;
		}
	}

	/*
	 * change the locale back to the C locale
	 */
	language_C();

	/*
	 * build the result from the image in ``buf''
	 */
	return wstr_n_from_wc(buf, op - buf);
}


/*
 * NAME
 *	wstr_to_mbs - wide string to multi-byte C string
 *
 * SYNOPSIS
 *	void wstr_to_mbs(wstring_ty *s, char **rslt, size_t *rslt_len);
 *
 * DESCRIPTION
 *	The wstr_to_mbs function convers a wide character string into a
 *	multi-byte C string.  The conversion is done in the current
 *	locale.  The result is NUL terminated, however the result length
 *	does not include the NUL.
 *
 * CAVEAT
 *	DO NOT free the result.  The result will change between calls,
 *	so copy it if you need to keep it.
 */

void
wstr_to_mbs(s, result_p, result_length_p)
	wstring_ty	*s;
	char		**result_p;
	size_t		*result_length_p;
{
	static char	*buf;
	static size_t	bufmax;
	int		n;
	wchar_t		*ip;
	size_t		remainder;
	char		*op;
	size_t		buflen;

	/*
	 * Do the conversion "long hand".  This is because the wcstombs
	 * function barfs when it sees an invalid wchar_t.  This
	 * function treats them literally and keeps going.
	 *
	 * MB_LEN_MAX is used because the standard guarantees that it is
	 * a non-zero constant.  MB_CUR_MAX is an expression, and
	 * experience has shown (e.g. SunOS 4.1.3) this it may be zero!
	 */
	buflen = (s->wstr_length + 1) * MB_LEN_MAX;
	assert(buflen > 0);
	if (buflen > bufmax)
	{
		bufmax = buflen;
		buf = mem_change_size(buf, bufmax);
	}

	/*
	 * perform the conversion in the native language default
	 */
	language_human();

	/*
	 * The wctomb function has internal state.  It needs to be reset.
	 */
	wctomb((char *)0, (wchar_t)0);

	ip = s->wstr_text;
	remainder = s->wstr_length;
	op = buf;
	while (remainder > 0)
	{
		n = wctomb(op, *ip);
		if (n <= 0)
		{
			/*
			 * Copy the character literally.
			 * Throw away anything that will not fit.
			 */
			*op++ = *ip++;
			if (!op[-1])
				op[-1] = '?';
			--remainder;

			/*
			 * The wctomb function's internal state will now
			 * be "error" or broken, or otherwise useless.
			 * Reset it so that we can keep going.
			 */
			wctomb((char *)0, (wchar_t)0);
		}
		else
		{
			op += n;
			++ip;
			--remainder;
		}
	}

	/*
	 * The final NUL could require shift state end characters,
	 * meaning that n could be more than 1.
	 */
	n = wctomb(op, (wchar_t)0);
	if (n <= 0)
		*op = 0;
	else
	{
		op += n - 1;
		assert(*op == 0);
	}

	/*
	 * restore the locale to the C locale
	 */
	language_C();

	/*
	 * set the output side effects
	 */
	*result_p = buf;
	*result_length_p = op - buf;
}


/*
 * NAME
 *	wstr_n_from_wc - make string
 *
 * SYNOPSIS
 *	wstring_ty *wstr_n_from_wc(wchar_t *s, size_t n);
 *
 * DESCRIPTION
 *	The wstr_n_from_c function is used to make a string from an
 *	array of wide characters.  No NUL terminator is assumed.
 *
 * RETURNS
 *	wstring_ty* - a pointer to a string in dynamic memory.  Use
 *	wstr_free when finished with.
 *
 * CAVEAT
 *	The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_n_from_wc(s, length)
	const wchar_t	*s;
	size_t		length;
{
	wstr_hash_ty	hash;
	wstr_hash_ty	idx;
	wstring_ty	*p;

	if (!hash_modulus)
		wstr_initialize();
	hash = hash_generate(s, length);

	idx = hash & hash_cutover_mask;
	if (idx < hash_split)
		idx = hash & hash_cutover_split_mask;

	for (p = hash_table[idx]; p; p = p->wstr_next)
	{
		if
		(
			p->wstr_hash == hash
		&&
			p->wstr_length == length
		&&
			!memcmp(p->wstr_text, s, length * sizeof(wchar_t))
		)
		{
			p->wstr_references++;
			return p;
		}
	}

	p =
		(wstring_ty *)
		mem_alloc(sizeof(wstring_ty) + length * sizeof(wchar_t));
	p->wstr_hash = hash;
	p->wstr_length = length;
	p->wstr_references = 1;
	p->wstr_next = hash_table[idx];
	hash_table[idx] = p;
	memcpy(p->wstr_text, s, length * sizeof(wchar_t));
	p->wstr_text[length] = 0;

	hash_load++;
	while (hash_load * 10 > hash_modulus * 8)
		split();
	++changed;
	return p;
}


/*
 * NAME
 *	wstr_copy - make a copy of a string
 *
 * SYNOPSIS
 *	wstring_ty *wstr_copy(wstring_ty *s);
 *
 * DESCRIPTION
 *	The wstr_copy function is used to make a copy of a string.
 *
 * RETURNS
 *	wstring_ty* - a pointer to a string in dynamic memory.
 *	Use wstr_free when finished with.
 *
 * CAVEAT
 *	The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_copy(s)
	wstring_ty	*s;
{
	s->wstr_references++;
	return s;
}


/*
 * NAME
 *	wstr_free - release a string
 *
 * SYNOPSIS
 *	void wstr_free(wstring_ty *s);
 *
 * DESCRIPTION
 *	The wstr_free function is used to indicate that a string hash been
 *	finished with.
 *
 * RETURNS
 *	void
 *
 * CAVEAT
 *	This is the only way to release strings DO NOT use the free function.
 */

void
wstr_free(s)
	wstring_ty	*s;
{
	wstr_hash_ty	idx;
	wstring_ty	**spp;

	if (!s)
		return;
	if (s->wstr_references > 1)
	{
		s->wstr_references--;
		return;
	}
	++changed;

	/*
	 * find the hash bucket it was in,
	 * and remove it
	 */
	idx = s->wstr_hash & hash_cutover_mask;
	if (idx < hash_split)
		idx = s->wstr_hash & hash_cutover_split_mask;
	for (spp = &hash_table[idx]; *spp; spp = &(*spp)->wstr_next)
	{
		if (*spp == s)
		{
			*spp = s->wstr_next;
			free(s);
			--hash_load;
			return;
		}
	}

	/*
	 * should never reach here!
	 */
	fatal_raw("attempted to free non-existent wstring (bug)");
}


/*
 * NAME
 *	wstr_catenate - join two strings
 *
 * SYNOPSIS
 *	wstring_ty *wstr_catenate(wstring_ty *, wstring_ty *);
 *
 * DESCRIPTION
 *	The wstr_catenate function is used to concatenate two strings to form a
 *	new string.
 *
 * RETURNS
 *	wstring_ty* - a pointer to a string in dynamic memory.
 *	Use wstr_free when finished with.
 *
 * CAVEAT
 *	The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_catenate(s1, s2)
	wstring_ty	*s1;
	wstring_ty	*s2;
{
	static wchar_t	*tmp;
	static size_t	tmplen;
	wstring_ty	*s;
	size_t		length;

	length = s1->wstr_length + s2->wstr_length;
	if (length > tmplen)
	{
		tmplen = length;
		tmp = mem_change_size(tmp, tmplen * sizeof(wchar_t));
	}
	memcpy(tmp, s1->wstr_text, s1->wstr_length * sizeof(wchar_t));
	memcpy
	(
		tmp + s1->wstr_length,
		s2->wstr_text,
		s2->wstr_length * sizeof(wchar_t)
	);
	s = wstr_n_from_wc(tmp, length);
	return s;
}


/*
 * NAME
 *	wstr_cat_three - join three strings
 *
 * SYNOPSIS
 *	wstring_ty *wstr_cat_three(wstring_ty *, wstring_ty *, wstring_ty *);
 *
 * DESCRIPTION
 *	The wstr_cat_three function is used to concatenate three strings to form
 *	a new string.
 *
 * RETURNS
 *	wstring_ty* - a pointer to a string in dynamic memory.
 *	Use wstr_free when finished with.
 *
 * CAVEAT
 *	The contents of the structure pointed to MUST NOT be altered.
 */

wstring_ty *
wstr_cat_three(s1, s2, s3)
	wstring_ty	*s1;
	wstring_ty	*s2;
	wstring_ty	*s3;
{
	static wchar_t	*tmp;
	static size_t	tmplen;
	wstring_ty	*s;
	size_t		length;

	length = s1->wstr_length + s2->wstr_length + s3->wstr_length;
	if (tmplen < length)
	{
		tmplen = length;
		tmp = mem_change_size(tmp, tmplen * sizeof(wchar_t));
	}
	memcpy(tmp, s1->wstr_text, s1->wstr_length * sizeof(wchar_t));
	memcpy
	(
		tmp + s1->wstr_length,
		s2->wstr_text,
		s2->wstr_length * sizeof(wchar_t)
	);
	memcpy
	(
		tmp + s1->wstr_length + s2->wstr_length,
		s3->wstr_text,
		s3->wstr_length * sizeof(wchar_t)
	);
	s = wstr_n_from_wc(tmp, length);
	return s;
}


wstring_ty *
wstr_to_upper(ws)
	const wstring_ty *ws;
{
	static wchar_t	*buffer;
	static size_t	buflen;
	size_t		j;

	if (ws->wstr_length > buflen)
	{
		buflen = ws->wstr_length;
		buffer = mem_change_size(buffer, buflen * sizeof(wchar_t));
	}
	language_human();
	for (j = 0; j < ws->wstr_length; ++j)
	{
		wchar_t		c;

		c = ws->wstr_text[j];
		if (iswlower(c))
			c = towupper(c);
		buffer[j] = c;
	}
	language_C();
	return wstr_n_from_wc(buffer, ws->wstr_length);
}


wstring_ty *
wstr_to_lower(ws)
	const wstring_ty *ws;
{
	static wchar_t	*buffer;
	static size_t	buflen;
	size_t		j;

	if (ws->wstr_length > buflen)
	{
		buflen = ws->wstr_length;
		buffer = mem_change_size(buffer, buflen * sizeof(wchar_t));
	}
	language_human();
	for (j = 0; j < ws->wstr_length; ++j)
	{
		wchar_t		c;

		c = ws->wstr_text[j];
		if (iswupper(c))
			c = towlower(c);
		buffer[j] = c;
	}
	language_C();
	return wstr_n_from_wc(buffer, ws->wstr_length);
}


wstring_ty *
wstr_to_ident(ws)
	const wstring_ty *ws;
{
	static wchar_t	*buffer;
	static size_t	buflen;
	size_t		j;

	if (ws->wstr_length == 0)
		return wstr_from_c("_");
	if (ws->wstr_length > buflen)
	{
		buflen = ws->wstr_length;
		buffer = mem_change_size(buffer, buflen * sizeof(wchar_t));
	}
	language_human();
	for (j = 0; j < ws->wstr_length; ++j)
	{
		wchar_t		c;

		c = ws->wstr_text[j];
		if (!iswalnum(c))
			c = '_';
		buffer[j] = c;
	}
	if (iswdigit(buffer[0]))
		buffer[0] = '_';
	language_C();
	return wstr_n_from_wc(buffer, ws->wstr_length);
}


/*
 * NAME
 *	wstr_equal - test equality of strings
 *
 * SYNOPSIS
 *	int wstr_equal(wstring_ty *, wstring_ty *);
 *
 * DESCRIPTION
 *	The wstr_equal function is used to test if two strings are equal.
 *
 * RETURNS
 *	int; zero if the strings are not equal, nonzero if the strings are
 *	equal.
 *
 * CAVEAT
 *	This function is implemented as a macro in strings.h
 */

#ifndef wstr_equal

int
wstr_equal(s1, s2)
	wstring_ty	*s1;
	wstring_ty	*s2;
{
	return (s1 == s2);
}

#endif


wstring_ty *
str_to_wstr(s)
	string_ty	*s;
{
	return wstr_n_from_c(s->str_text, s->str_length);
}


string_ty *
wstr_to_str(ws)
	wstring_ty	*ws;
{
	char		*text;
	size_t		length;

	wstr_to_mbs(ws, &text, &length);
	return str_n_from_c(text, length);
	trace(("to silence warnings\n"));
}