#include "utf8.h"
/* vim:ts=4:sw=4:noet
 * (tabspace=4)
 * 
 * Copyright (C) 2004, 2005 Walter Doekes, <walter@djcvt.net>.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#include "texts.h"
#ifdef _WIN32
#	define WINDOWS_LEAN_AND_MEAN
#	include <windows.h>
#endif /* _WIN32 */
#include <malloc.h>
#include <string.h>

/*
 * UTF-8 conversion table:
 * 
 * 0x00000000 - 0x0000007F
 *   0xxxxxxx
 * 0x00000080 - 0x000007FF
 *   110xxxxx 10xxxxxx
 * 0x00000800 - 0x0000FFFF
 *   1110xxxx 10xxxxxx 10xxxxxx
 * 0x00010000 - 0x001FFFFF
 *   11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x00200000 - 0x03FFFFFF
 *   111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 0x04000000 - 0x7FFFFFFF
 *   1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 * 
 * The 5- and 6-byte forms are listed for reference only: the portable
 * (non-Windows) converters below handle sequences of at most 4 bytes and
 * report anything longer as an error.
 */

/*
 * Convert the null-terminated wide string src to UTF-8 in a freshly
 * malloc()ed buffer and store that buffer in *dest.
 *
 * Returns whatever wcstoutf8() returns for the conversion: the number of
 * bytes produced (excluding the terminating null) or (size_t)-1.
 * (size_t)-1 is also returned when the buffer size would overflow or the
 * allocation fails.  On every failure path *dest is set to NULL so the
 * caller is never left holding a freed pointer.  On success the caller
 * owns *dest and releases it with free().
 */
size_t wcstoautf8(unsigned char** dest, const wchar_t* src) {
	/* At most 4 bytes of UTF-8 per wide character, plus the terminating
	 * 0 (null).  Computed in size_t with an explicit guard; the product
	 * could overflow an int for long inputs. */
	size_t srclen = wcslen(src);
	size_t len;
	size_t ret;
	*dest = NULL;
	if(srclen > ((size_t)-1 - 1) / 4) {
#ifdef USE_SETERROR2
		set_error("malloc", -1);
#endif /* USE_SETERROR2 */
		return (size_t)-1;
	}
	len = srclen * 4 + 1;
	*dest = (unsigned char*)malloc(len);
	if(!*dest) {
#ifdef USE_SETERROR2
		set_error("malloc", -1);
#endif /* USE_SETERROR2 */
		return (size_t)-1;
	}
	ret = wcstoutf8(*dest, src, len);
	if(ret == (size_t)-1) {
		free(*dest);
		*dest = NULL; /* do not hand a dangling pointer back */
	}
	return ret;
}

/*
 * Convert the null-terminated wide string src to UTF-8.
 *
 * dest: output buffer, or NULL to only count the bytes the conversion
 *       would produce.
 * src:  null-terminated wide string.
 * n:    size of dest in bytes.
 *
 * Returns the number of bytes produced, not counting the terminating null,
 * or (size_t)-1 on error.
 *
 * Windows: the work is delegated to WideCharToMultiByte() with CP_UTF8,
 * converting the string including its terminator; a zero result from that
 * call is reported as (size_t)-1.
 *
 * Elsewhere: each wide character is encoded as 1 to 4 bytes.  When dest is
 * given, output stops before the first character that does not fit
 * completely, so a multi-byte sequence is never split.  A terminating null
 * is written whenever there is still room for it.  A value outside
 * 0..0x1FFFFF (including negative values where wchar_t is signed) is an
 * error; dest, if given, is then terminated at the point of failure.
 */
size_t wcstoutf8(unsigned char* dest, const wchar_t* src, size_t n) {
#ifdef _WIN32
	int ret;
	int len = wcslen(src);
	if(len == 0) {
		if(dest && n != 0)
			*dest = '\0';
		return 0;
	}
	++len;
	ret = WideCharToMultiByte(
			CP_UTF8,	/* code page */
			0,			/* performance and mapping flags */
			src,		/* wide-character string */
			len,		/* number of chars in string */
			dest,		/* buffer for new string */
			n,			/* size of buffer */
			NULL,		/* default for unmappable chars */
			NULL		/* set when default char used */
	);
#ifdef USE_SETERROR2
	if(ret == 0)
		set_error("WideCharToMultiByte", -1);
#endif /* USE_SETERROR2 */
	return (size_t)(ret == 0 ? -1 : ret - 1);
#else /* !_WIN32 */	
	size_t count = 0;
	while(*src != L'\0' && (!dest || count < n)) {
		/* Classify an unsigned copy: where wchar_t is a signed type a
		 * negative value would otherwise satisfy "< 0x80" and be written
		 * out as a single garbage byte.  As an unsigned value it fails
		 * every range test below and is reported as invalid. */
		unsigned long ch = (unsigned long)*src;
		unsigned int mask;
		size_t len, i;
		if(ch < 0x80) {
			len = 1;
			mask = 0x0;  /* 0xxxxxxx */
		} else if(ch < 0x800) {
			len = 2;
			mask = 0xc0; /* 110xxxxx */
		} else if(ch < 0x10000) {
			len = 3;
			mask = 0xe0; /* 1110xxxx */
		} else if(ch < 0x200000) {
			len = 4;
			mask = 0xf0; /* 11110xxx */
		} else {
			/* count < n holds here whenever dest is set, so the write
			 * stays inside the buffer. */
			if(dest)
				*dest = '\0';
#ifdef USE_SETERROR2
			set_error("wcstoutf8", UTF8_INVALID_UNICODE);
#endif /* USE_SETERROR2 */
			return (size_t)-1;
		}
		if(dest) {
			/* n - count cannot underflow: the loop condition guarantees
			 * count < n. */
			if(len > n - count) {
				/* Character does not fit; terminate and stop here. */
				*dest = '\0';
				return count;
			}
			/* Fill the continuation bytes back to front, 6 bits each. */
			for(i = len - 1; i > 0; --i) {
				dest[i] = (unsigned char)((ch & 0x3f) | 0x80); /* 00111111, 10000000 */
				ch >>= 6;
			}
			/* The remaining high bits go into the lead byte. */
			dest[0] = (unsigned char)(ch | mask);
			dest += len;
		}
		++src;
		count += len;
	}
	if(dest && count < n)
		*dest = '\0';
	return count;
#endif /* !_WIN32 */	
}

/*
 * Convert the null-terminated UTF-8 string src to a wide string in a
 * freshly malloc()ed buffer and store that buffer in *dest.
 *
 * Returns whatever utf8towcs() returns for the conversion: the number of
 * wide characters produced (excluding the terminating null) or (size_t)-1.
 * (size_t)-1 is also returned when the buffer size would overflow or the
 * allocation fails.  On every failure path *dest is set to NULL so the
 * caller is never left holding a freed pointer.  On success the caller
 * owns *dest and releases it with free().
 */
size_t utf8toawcs(wchar_t** dest, const unsigned char* src) {
	/* At most 1 wide character per UTF-8 byte, plus the terminating
	 * 0 (null).  Kept in size_t; an int could overflow for long inputs. */
	size_t len = strlen((const char*)src) + 1;
	size_t ret;
	*dest = NULL;
	if(len > (size_t)-1 / sizeof(wchar_t)) {
#ifdef USE_SETERROR2
		set_error("malloc", -1);
#endif /* USE_SETERROR2 */
		return (size_t)-1;
	}
	*dest = (wchar_t*)malloc(len * sizeof(wchar_t));
	if(!*dest) {
#ifdef USE_SETERROR2
		set_error("malloc", -1);
#endif /* USE_SETERROR2 */
		return (size_t)-1;
	}
	ret = utf8towcs(*dest, src, len);
	if(ret == (size_t)-1) {
		free(*dest);
		*dest = NULL; /* do not hand a dangling pointer back */
	}
	return ret;
}

/*
 * Convert the null-terminated UTF-8 string src to wide characters.
 *
 * dest: output buffer, or NULL to only count the wide characters the
 *       conversion would produce.
 * src:  null-terminated UTF-8 string.
 * n:    size of dest in wide characters.
 *
 * Returns the number of wide characters produced, not counting the
 * terminating null, or (size_t)-1 on error.
 *
 * Windows: the work is delegated to MultiByteToWideChar() with CP_UTF8 and
 * MB_ERR_INVALID_CHARS, converting the string including its terminator; a
 * zero result from that call is reported as (size_t)-1.
 *
 * Elsewhere: sequences of 1 to 4 bytes are decoded.  An invalid lead byte
 * or a missing/invalid continuation byte is an error in both converting
 * and counting mode; dest, if given, is then terminated at the point of
 * failure.  When dest is given, at most n characters are written and a
 * terminating null is added whenever there is still room for it.
 * NOTE: overlong encodings and surrogate values are not rejected here.
 */
size_t utf8towcs(wchar_t* dest, const unsigned char* src, size_t n) {
#ifdef _WIN32
	int ret;
	int len = strlen(src);
	if(len == 0) {
		if(dest && n != 0)
			*dest = L'\0';
		return 0;
	}
	++len;
	ret = MultiByteToWideChar(
			CP_UTF8,				/* code page */
			MB_ERR_INVALID_CHARS,	/* character-type options */
			src,					/* string to map */
			len,					/* number of bytes in string */
			dest,					/* wide-character buffer */
			n						/* size of buffer */
	);
#ifdef USE_SETERROR2
	if(ret == 0)
		set_error("MultiByteToWideChar", -1);
#endif /* USE_SETERROR2 */
	return (size_t)(ret == 0 ? -1 : ret - 1);
#else /* !_WIN32 */
	size_t count = 0;
	while(*src != '\0' && (!dest || count < n)) {
		unsigned long ch;
		int mask, len, i;
		if(*src < 0x80) { /* 0xxxxxxx */
			len = 1;
			mask = 0x7f; /* 01111111 */
		} else if((*src & 0xe0) == 0xc0) { /* 11100000, 110xxxxx */
			len = 2;
			mask = 0x1f; /* 00011111 */
		} else if((*src & 0xf0) == 0xe0) { /* 11110000, 1110xxxx */
			len = 3;
			mask = 0x0f; /* 00001111 */
		} else if((*src & 0xf8) == 0xf0) { /* 11111000, 11110xxx */
			len = 4;
			mask = 0x07; /* 00000111 */
		} else {
			if(dest)
				*dest = L'\0';
#ifdef USE_SETERROR2
			set_error("utf8towcs", UTF8_INVALID_UTF8);
#endif /* USE_SETERROR2 */
			return (size_t)-1;
		}
		/* Check the continuation bytes even when only counting.  The
		 * terminating null is not a continuation byte, so a truncated
		 * sequence is caught here and "src += len" below can never step
		 * past the end of the string. */
		ch = (unsigned long)(*src & mask);
		for(i = 1; i < len; ++i) {
			if((src[i] & 0xc0) != 0x80) { /* 11000000, 10000000 */
				if(dest)
					*dest = L'\0';
#ifdef USE_SETERROR2
				set_error("utf8towcs", UTF8_INVALID_UTF8);
#endif /* USE_SETERROR2 */
				return (size_t)-1;
			}
			ch = (ch << 6) | (unsigned long)(src[i] & 0x3f); /* 00111111 */
		}
		if(dest) {
			*dest = (wchar_t)ch;
			++dest;
		}
		src += len;
		++count;
	}
	if(dest && count < n)
		*dest = L'\0';
	return count;
#endif /* !_WIN32 */
}

#ifdef TEST_UTF8_C

#include <assert.h>
#include <stdio.h>
#include <string.h>

/*
 * Self-test driver (built only with TEST_UTF8_C).
 *
 * With an argument: the argument is parsed with atoi() as a single code
 * point, converted to UTF-8 and printed.
 * Without arguments: both converters are exercised on a fixed four
 * character sample with every buffer size from 0 up to two more than the
 * required length, printing each result, and the final round trip is
 * checked with assert().
 */
int main(int argc, char** argv) {
	wchar_t src[] = {0x61,0xe4,0xfb02,0x394,0x0};
	char cdest[16];
	/* The converters work on unsigned char; alias the buffer once instead
	 * of passing a char* where an unsigned char* is expected. */
	unsigned char* ucdest = (unsigned char*)cdest;
	wchar_t wdest[16];
	unsigned char* ca;
	wchar_t* wa;
	size_t ret;
	int i, j;

	if(argc > 1) {
		src[0] = (wchar_t)atoi(argv[1]);
		src[1] = L'\0';
		wcstoutf8(ucdest, src, 16);
		printf("UTF8 of %x = \"%s\"\r\n", (unsigned int)src[0], cdest);
		return 0;
	}
		
	
	printf("(be aware of hidden chars, pipe it through od/cat-A/less)\r\n");
	printf("src = {61,e4,fb02,394} (a, a-diaer, Latin-fl, Greek-D)\r\n");
	
	printf("\r\nwcstoutf8:\r\n\r\n");
	strcpy(cdest, "(untouched)");
	ret = wcstoutf8(NULL, src, 0);
	/* size_t is printed through unsigned long: "%u" with a size_t
	 * argument is a format/argument mismatch. */
	printf("wsctoutf8(NULL, src, 0) = %lu  <---\r\n", (unsigned long)ret);
	j = (int)ret;
	for(i = 0; i <= j + 2; ++i) {
		ret = wcstoutf8(ucdest, src, i);
		printf("wsctoutf8(\"%s\", src, %i) = %lu%s\r\n", cdest, i,
				(unsigned long)ret, i == j + 1 ? "  <---" : "");
	}
	assert(strcmp(cdest, "\x61\xc3\xa4\xef\xac\x82\xce\x94") == 0);
	/* Call outside assert(): with NDEBUG the assert expression is not
	 * evaluated and ca would be freed without ever being set. */
	ret = wcstoautf8(&ca, src);
	assert(ret != (size_t)-1);
	if(ret == (size_t)-1)
		return 1;
	assert(strcmp(cdest, (const char*)ca) == 0);
	free(ca);

	printf("\r\nutf8towcs:\r\n\r\n");
	wcscpy(wdest, L"\xff\xff\xff\xff\xff");
	ret = utf8towcs(NULL, ucdest, 0);
	printf("utf8towcs(NULL, cdest, 0) = %lu  <---\r\n", (unsigned long)ret);
	j = (int)ret;
	for(i = 0; i <= j + 2; ++i) {
		ret = utf8towcs(wdest, ucdest, i);
		/* wchar_t is cast to match the "%lx" conversions. */
		printf("utf8towcs({%lx,%lx,%lx,%lx,%lx}, cdest, %i) = %lu%s\r\n",
				(unsigned long)wdest[0], (unsigned long)wdest[1],
				(unsigned long)wdest[2], (unsigned long)wdest[3],
				(unsigned long)wdest[4], i, (unsigned long)ret,
				i == j + 1 ? "  <---" : "");
	}
	assert(memcmp(src, wdest, sizeof(wchar_t) * 5) == 0);
	/* Same reasoning as above: keep the side effect out of assert(). */
	ret = utf8toawcs(&wa, ucdest);
	assert(ret != (size_t)-1);
	if(ret == (size_t)-1)
		return 1;
	assert(wcscmp(wdest, wa) == 0);
	free(wa);
	
	return 0;
}

/*** OUTPUT PIPED THROUGH cat -v ***

(be aware of hidden chars, pipe it through od/cat-A/less)^M
src = {61,e4,fb02,394} (a, a-diaer, Latin-fl, Greek-D)^M
^M
wcstoutf8:^M
^M
wsctoutf8(NULL, src, 0) = 8  <---^M
wsctoutf8("(untouched)", src, 0) = 0^M
wsctoutf8("auntouched)", src, 1) = 1^M
wsctoutf8("a", src, 2) = 1^M
wsctoutf8("aM-CM-$touched)", src, 3) = 3^M
wsctoutf8("aM-CM-$", src, 4) = 3^M
wsctoutf8("aM-CM-$", src, 5) = 3^M
wsctoutf8("aM-CM-$M-oM-,M-^Bched)", src, 6) = 6^M
wsctoutf8("aM-CM-$M-oM-,M-^B", src, 7) = 6^M
wsctoutf8("aM-CM-$M-oM-,M-^BM-NM-^Ted)", src, 8) = 8^M
wsctoutf8("aM-CM-$M-oM-,M-^BM-NM-^T", src, 9) = 8  <---^M
wsctoutf8("aM-CM-$M-oM-,M-^BM-NM-^T", src, 10) = 8^M
^M
utf8towcs:^M
^M
utf8towcs(NULL, cdest, 0) = 4  <---^M
utf8towcs({ff,ff,ff,ff,ff}, cdest, 0) = 0^M
utf8towcs({61,ff,ff,ff,ff}, cdest, 1) = 1^M
utf8towcs({61,e4,ff,ff,ff}, cdest, 2) = 2^M
utf8towcs({61,e4,fb02,ff,ff}, cdest, 3) = 3^M
utf8towcs({61,e4,fb02,394,ff}, cdest, 4) = 4^M
utf8towcs({61,e4,fb02,394,0}, cdest, 5) = 4  <---^M
utf8towcs({61,e4,fb02,394,0}, cdest, 6) = 4^M

***********************************/

#endif // TEST_UTF8_C
