2011-09-15 08:07:42 +00:00
|
|
|
/*
|
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
|
* as published by the Free Software Foundation; either version 2
|
|
|
|
|
* of the License, or (at your option) any later version.
|
|
|
|
|
*
|
|
|
|
|
* This program is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
|
|
|
* GNU General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU General Public License
|
|
|
|
|
* along with this program; if not, write to the Free Software Foundation,
|
|
|
|
|
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
|
|
|
|
*
|
|
|
|
|
* The Original Code is Copyright (C) 2011 Blender Foundation.
|
2011-10-21 01:46:03 +00:00
|
|
|
* Code from gutf8.c Copyright (C) 1999 Tom Tromey
|
|
|
|
|
* Copyright (C) 2000 Red Hat, Inc.
|
2011-09-15 08:07:42 +00:00
|
|
|
* All rights reserved.
|
|
|
|
|
*/
|
2011-09-15 16:37:36 +00:00
|
|
|
|
2019-02-18 08:08:12 +11:00
|
|
|
/** \file
|
|
|
|
|
* \ingroup bli
|
2019-02-06 15:52:04 +11:00
|
|
|
*/
|
2011-09-15 16:37:36 +00:00
|
|
|
|
2020-03-19 09:33:03 +01:00
|
|
|
#include <stdio.h>
|
|
|
|
|
#include <stdlib.h>
|
2011-09-15 08:07:42 +00:00
|
|
|
#include <string.h>
|
2011-10-20 09:47:05 +00:00
|
|
|
#include <wchar.h>
|
|
|
|
|
#include <wctype.h>
|
2013-03-12 07:25:53 +00:00
|
|
|
#include <wcwidth.h>
|
2011-09-15 08:07:42 +00:00
|
|
|
|
2012-10-31 04:28:49 +00:00
|
|
|
#include "BLI_utildefines.h"
|
|
|
|
|
|
|
|
|
|
#include "BLI_string_utf8.h" /* own include */
|
2021-01-26 14:56:39 -07:00
|
|
|
#ifdef WIN32
|
|
|
|
|
# include "utfconv.h"
|
|
|
|
|
#endif
|
2013-05-12 06:33:21 +00:00
|
|
|
#ifdef __GNUC__
|
|
|
|
|
# pragma GCC diagnostic error "-Wsign-conversion"
|
|
|
|
|
#endif
|
|
|
|
|
|
2013-07-15 03:54:57 +00:00
|
|
|
// #define DEBUG_STRSIZE
|
|
|
|
|
|
2017-01-01 02:15:42 +01:00
|
|
|
/* Table copied from glib's `gutf8.c`: indexed by the first byte of a sequence,
 * it gives the number of bytes that sequence occupies.
 *
 * NOTE: the last two values (0xfe and 0xff) are forbidden in UTF-8,
 * so they are considered 1 byte long too. */
static const size_t utf8_skip_data[256] = {
    /* 0x00..0x7f: single byte (ASCII). */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0x80..0xbf: also stepped over one byte at a time. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    /* 0xc0..0xdf: 2 byte sequences. */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    /* 0xe0..0xef: 3 byte sequences. */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0xf0..0xf7: 4 bytes, 0xf8..0xfb: 5 bytes, 0xfc..0xfd: 6 bytes, 0xfe..0xff: 1 byte. */
    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 1, 1,
};
|
|
|
|
|
|
2011-09-15 08:07:42 +00:00
|
|
|
/* from libswish3, originally called u8_isvalid(),
|
|
|
|
|
* modified to return the index of the bad character (byte index not utf).
|
|
|
|
|
* http://svn.swish-e.org/libswish3/trunk/src/libswish3/utf8.c r3044 - campbell */
|
|
|
|
|
|
|
|
|
|
/* based on the valid_utf8 routine from the PCRE library by Philip Hazel
|
2012-03-03 20:19:11 +00:00
|
|
|
*
|
|
|
|
|
* length is in bytes, since without knowing whether the string is valid
|
|
|
|
|
* it's hard to know how many characters there are! */
|
2011-09-15 08:07:42 +00:00
|
|
|
|
2017-01-01 02:15:42 +01:00
|
|
|
/**
|
|
|
|
|
* Find first utf-8 invalid byte in given \a str, of \a length bytes.
|
|
|
|
|
*
|
|
|
|
|
* \return the offset of the first invalid byte.
|
|
|
|
|
*/
|
2021-08-23 15:01:50 +10:00
|
|
|
ptrdiff_t BLI_str_utf8_invalid_byte(const char *str, size_t length)
|
2011-09-15 08:07:42 +00:00
|
|
|
{
|
2017-01-01 02:15:42 +01:00
|
|
|
const unsigned char *p, *perr, *pend = (const unsigned char *)str + length;
|
2011-09-15 08:07:42 +00:00
|
|
|
unsigned char c;
|
|
|
|
|
int ab;
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2017-01-01 02:15:42 +01:00
|
|
|
for (p = (const unsigned char *)str; p < pend; p++, length--) {
|
2011-09-15 08:07:42 +00:00
|
|
|
c = *p;
|
2017-01-01 02:15:42 +01:00
|
|
|
perr = p; /* Erroneous char is always the first of an invalid utf8 sequence... */
|
2019-03-27 13:16:10 +11:00
|
|
|
if (ELEM(c, 0xfe, 0xff, 0x00)) {
|
|
|
|
|
/* Those three values are not allowed in utf8 string. */
|
2017-01-01 02:15:42 +01:00
|
|
|
goto utf8_error;
|
2019-03-27 13:16:10 +11:00
|
|
|
}
|
|
|
|
|
if (c < 128) {
|
2011-09-15 08:07:42 +00:00
|
|
|
continue;
|
2019-03-27 13:16:10 +11:00
|
|
|
}
|
|
|
|
|
if ((c & 0xc0) != 0xc0) {
|
2011-09-15 08:07:42 +00:00
|
|
|
goto utf8_error;
|
2019-03-27 13:16:10 +11:00
|
|
|
}
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2019-01-15 23:15:58 +11:00
|
|
|
/* Note that since we always increase p (and decrease length) by one byte in main loop,
|
|
|
|
|
* we only add/subtract extra utf8 bytes in code below
|
2017-01-01 02:15:42 +01:00
|
|
|
* (ab number, aka number of bytes remaining in the utf8 sequence after the initial one). */
|
2017-01-03 15:30:59 +01:00
|
|
|
ab = (int)utf8_skip_data[c] - 1;
|
2017-01-01 02:15:42 +01:00
|
|
|
if (length <= ab) {
|
2011-09-15 08:07:42 +00:00
|
|
|
goto utf8_error;
|
2017-01-01 02:15:42 +01:00
|
|
|
}
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2011-09-15 08:07:42 +00:00
|
|
|
/* Check top bits in the second byte */
|
2017-01-01 02:15:42 +01:00
|
|
|
p++;
|
|
|
|
|
length--;
|
2019-03-27 13:16:10 +11:00
|
|
|
if ((*p & 0xc0) != 0x80) {
|
2011-09-15 08:07:42 +00:00
|
|
|
goto utf8_error;
|
2019-03-27 13:16:10 +11:00
|
|
|
}
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2011-09-15 08:07:42 +00:00
|
|
|
/* Check for overlong sequences for each different length */
|
|
|
|
|
switch (ab) {
|
2017-01-01 02:15:42 +01:00
|
|
|
case 1:
|
|
|
|
|
/* Check for xx00 000x */
|
2019-03-27 13:16:10 +11:00
|
|
|
if ((c & 0x3e) == 0) {
|
|
|
|
|
goto utf8_error;
|
|
|
|
|
}
|
2017-01-01 02:15:42 +01:00
|
|
|
continue; /* We know there aren't any more bytes to check */
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2017-01-01 02:15:42 +01:00
|
|
|
case 2:
|
|
|
|
|
/* Check for 1110 0000, xx0x xxxx */
|
2019-03-27 13:16:10 +11:00
|
|
|
if (c == 0xe0 && (*p & 0x20) == 0) {
|
|
|
|
|
goto utf8_error;
|
|
|
|
|
}
|
2017-01-01 02:15:42 +01:00
|
|
|
/* Some special cases, see section 5 of utf-8 decoder stress-test by Markus Kuhn
|
|
|
|
|
* (https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt). */
|
|
|
|
|
/* From section 5.1 (and 5.2) */
|
|
|
|
|
if (c == 0xed) {
|
2019-03-27 13:16:10 +11:00
|
|
|
if (*p == 0xa0 && *(p + 1) == 0x80) {
|
|
|
|
|
goto utf8_error;
|
|
|
|
|
}
|
|
|
|
|
if (*p == 0xad && *(p + 1) == 0xbf) {
|
|
|
|
|
goto utf8_error;
|
|
|
|
|
}
|
|
|
|
|
if (*p == 0xae && *(p + 1) == 0x80) {
|
|
|
|
|
goto utf8_error;
|
|
|
|
|
}
|
|
|
|
|
if (*p == 0xaf && *(p + 1) == 0xbf) {
|
|
|
|
|
goto utf8_error;
|
|
|
|
|
}
|
|
|
|
|
if (*p == 0xb0 && *(p + 1) == 0x80) {
|
|
|
|
|
goto utf8_error;
|
|
|
|
|
}
|
|
|
|
|
if (*p == 0xbe && *(p + 1) == 0x80) {
|
|
|
|
|
goto utf8_error;
|
|
|
|
|
}
|
|
|
|
|
if (*p == 0xbf && *(p + 1) == 0xbf) {
|
|
|
|
|
goto utf8_error;
|
|
|
|
|
}
|
2017-01-01 02:15:42 +01:00
|
|
|
}
|
|
|
|
|
/* From section 5.3 */
|
|
|
|
|
if (c == 0xef) {
|
2019-03-27 13:16:10 +11:00
|
|
|
if (*p == 0xbf && *(p + 1) == 0xbe) {
|
|
|
|
|
goto utf8_error;
|
|
|
|
|
}
|
|
|
|
|
if (*p == 0xbf && *(p + 1) == 0xbf) {
|
|
|
|
|
goto utf8_error;
|
2017-01-01 02:15:42 +01:00
|
|
|
}
|
2019-04-17 06:17:24 +02:00
|
|
|
}
|
2017-01-01 02:15:42 +01:00
|
|
|
break;
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2017-01-01 02:15:42 +01:00
|
|
|
case 3:
|
|
|
|
|
/* Check for 1111 0000, xx00 xxxx */
|
2019-03-27 13:16:10 +11:00
|
|
|
if (c == 0xf0 && (*p & 0x30) == 0) {
|
|
|
|
|
goto utf8_error;
|
|
|
|
|
}
|
2017-01-01 02:15:42 +01:00
|
|
|
break;
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2017-01-01 02:15:42 +01:00
|
|
|
case 4:
|
|
|
|
|
/* Check for 1111 1000, xx00 0xxx */
|
2019-03-27 13:16:10 +11:00
|
|
|
if (c == 0xf8 && (*p & 0x38) == 0) {
|
|
|
|
|
goto utf8_error;
|
|
|
|
|
}
|
2017-01-01 02:15:42 +01:00
|
|
|
break;
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2017-01-01 02:15:42 +01:00
|
|
|
case 5:
|
|
|
|
|
/* Check for 1111 1100, xx00 00xx */
|
2019-03-27 13:16:10 +11:00
|
|
|
if (c == 0xfc && (*p & 0x3c) == 0) {
|
|
|
|
|
goto utf8_error;
|
|
|
|
|
}
|
2017-01-01 02:15:42 +01:00
|
|
|
break;
|
2011-09-15 08:07:42 +00:00
|
|
|
}
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2011-09-15 08:07:42 +00:00
|
|
|
/* Check for valid bytes after the 2nd, if any; all must start 10 */
|
|
|
|
|
while (--ab > 0) {
|
2017-01-01 02:15:42 +01:00
|
|
|
p++;
|
|
|
|
|
length--;
|
2019-03-27 13:16:10 +11:00
|
|
|
if ((*p & 0xc0) != 0x80) {
|
|
|
|
|
goto utf8_error;
|
2011-09-15 08:07:42 +00:00
|
|
|
}
|
|
|
|
|
}
|
2019-04-17 06:17:24 +02:00
|
|
|
}
|
|
|
|
|
|
2011-09-15 08:07:42 +00:00
|
|
|
return -1;
|
|
|
|
|
|
|
|
|
|
utf8_error:
|
|
|
|
|
|
2017-01-20 13:03:21 +01:00
|
|
|
return ((const char *)perr - (const char *)str);
|
2011-09-15 08:07:42 +00:00
|
|
|
}
|
|
|
|
|
|
2017-01-20 13:03:21 +01:00
|
|
|
/**
 * Strip every byte reported by #BLI_str_utf8_invalid_byte from \a str (in-place),
 * the remaining bytes are shifted down so the string stays nil terminated.
 *
 * \param str: the string to clean, `str[length]` must be the nil terminator.
 * \param length: the string length in bytes (without the terminator).
 * \return number of stripped bytes.
 */
int BLI_str_utf8_invalid_strip(char *str, size_t length)
{
  int stripped = 0;

  BLI_assert(str[length] == '\0');

  for (;;) {
    const ptrdiff_t offset = BLI_str_utf8_invalid_byte(str, length);
    if (offset == -1) {
      /* Nothing invalid in what remains. */
      break;
    }

    /* Step to the bad byte, `length` becomes the number of bytes that follow it. */
    str += offset;
    length -= (size_t)(offset + 1);
    stripped++;

    if (length == 0) {
      /* The bad byte is the last one: terminate the string here. */
      *str = '\0';
      break;
    }

    /* Close the gap and keep looking, +1 to move the nil terminator too. */
    memmove(str, str + 1, length + 1);
  }

  return stripped;
}
|
2011-09-15 11:49:36 +00:00
|
|
|
|
2021-02-05 16:23:34 +11:00
|
|
|
/**
 * Compatible with #BLI_strncpy, but ensure no partial UTF8 chars.
 *
 * Copies whole sequences (sized by #utf8_skip_data) while they fit,
 * always leaving room for the nil terminator which is written last.
 *
 * \note All three arguments must be modifiable variables:
 * `dst` & `src` are advanced, `maxncpy` is reduced by the number of bytes copied.
 * On exit `dst` points at the terminator.
 *
 * \note The byte is cast to `unsigned char` before indexing #utf8_skip_data,
 * where `char` is signed a byte >= 0x80 would otherwise index before the start of the table.
 */
#define BLI_STR_UTF8_CPY(dst, src, maxncpy) \
  { \
    size_t utf8_size; \
    while (*src != '\0' && (utf8_size = utf8_skip_data[(unsigned char)*src]) < maxncpy) { \
      maxncpy -= utf8_size; \
      /* Copy `utf8_size` bytes by entering the fall-through chain at the matching case. */ \
      switch (utf8_size) { \
        case 6: \
          *dst++ = *src++; \
          ATTR_FALLTHROUGH; \
        case 5: \
          *dst++ = *src++; \
          ATTR_FALLTHROUGH; \
        case 4: \
          *dst++ = *src++; \
          ATTR_FALLTHROUGH; \
        case 3: \
          *dst++ = *src++; \
          ATTR_FALLTHROUGH; \
        case 2: \
          *dst++ = *src++; \
          ATTR_FALLTHROUGH; \
        case 1: \
          *dst++ = *src++; \
      } \
    } \
    *dst = '\0'; \
  } \
  (void)0
|
2011-11-21 11:53:29 +00:00
|
|
|
|
2012-11-23 15:12:13 +00:00
|
|
|
/**
 * Copy \a src into \a dst without splitting a multi-byte character,
 * \a dst is always nil terminated.
 *
 * \param maxncpy: size of \a dst in bytes (including the terminator), must not be zero.
 * \return \a dst.
 */
char *BLI_strncpy_utf8(char *__restrict dst, const char *__restrict src, size_t maxncpy)
{
  /* The copy macro advances `dst`, so keep the start of the buffer for the return value. */
  char *const dst_start = dst;

  BLI_assert(maxncpy != 0);

#ifdef DEBUG_STRSIZE
  memset(dst, 0xff, sizeof(*dst) * maxncpy);
#endif

  /* NOTE: invalid UTF8 in `src` is not detected or handled here. */
  BLI_STR_UTF8_CPY(dst, src, maxncpy);

  return dst_start;
}
|
|
|
|
|
|
2014-12-28 15:58:13 +11:00
|
|
|
/**
 * A version of #BLI_strncpy_utf8 that returns the number of bytes copied.
 *
 * \param maxncpy: size of \a dst in bytes (including the terminator), must not be zero.
 * \return the length of \a dst in bytes (without the terminator).
 */
size_t BLI_strncpy_utf8_rlen(char *__restrict dst, const char *__restrict src, size_t maxncpy)
{
  const char *const dst_start = dst;

  BLI_assert(maxncpy != 0);

#ifdef DEBUG_STRSIZE
  memset(dst, 0xff, sizeof(*dst) * maxncpy);
#endif

  /* NOTE: invalid UTF8 in `src` is not detected or handled here. */
  BLI_STR_UTF8_CPY(dst, src, maxncpy);

  /* The macro leaves `dst` on the terminator, the distance from the start is the length. */
  return (size_t)(dst - dst_start);
}
|
|
|
|
|
|
2011-11-21 11:53:29 +00:00
|
|
|
#undef BLI_STR_UTF8_CPY
|
2011-10-20 09:47:05 +00:00
|
|
|
|
|
|
|
|
/* --------------------------------------------------------------------------*/
|
2021-06-24 15:56:58 +10:00
|
|
|
/* wchar_t / utf8 functions */
|
2011-10-20 09:47:05 +00:00
|
|
|
|
2012-11-23 15:12:13 +00:00
|
|
|
size_t BLI_strncpy_wchar_as_utf8(char *__restrict dst,
|
|
|
|
|
const wchar_t *__restrict src,
|
|
|
|
|
const size_t maxncpy)
|
2011-10-20 09:47:05 +00:00
|
|
|
{
|
2012-10-31 04:24:55 +00:00
|
|
|
BLI_assert(maxncpy != 0);
|
2021-08-28 22:44:55 +10:00
|
|
|
size_t len = 0;
|
2013-07-15 03:54:57 +00:00
|
|
|
#ifdef DEBUG_STRSIZE
|
|
|
|
|
memset(dst, 0xff, sizeof(*dst) * maxncpy);
|
|
|
|
|
#endif
|
2021-08-28 22:44:55 +10:00
|
|
|
while (*src && len < maxncpy) {
|
|
|
|
|
len += BLI_str_utf8_from_unicode((uint)*src++, dst + len, maxncpy - len);
|
2015-02-26 11:20:47 +01:00
|
|
|
}
|
2012-10-22 08:15:51 +00:00
|
|
|
dst[len] = '\0';
|
2021-08-28 22:44:55 +10:00
|
|
|
/* Return the correct length when part of the final byte did not fit into the string. */
|
|
|
|
|
while ((len > 0) && UNLIKELY(dst[len - 1] == '\0')) {
|
|
|
|
|
len--;
|
|
|
|
|
}
|
2011-10-20 09:47:05 +00:00
|
|
|
return len;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* wchar len in utf8 */
|
|
|
|
|
size_t BLI_wstrlen_utf8(const wchar_t *src)
|
|
|
|
|
{
|
|
|
|
|
size_t len = 0;
|
|
|
|
|
|
2012-03-24 06:18:31 +00:00
|
|
|
while (*src) {
|
2021-08-28 22:44:55 +10:00
|
|
|
len += BLI_str_utf8_from_unicode_len((uint)*src++);
|
2011-10-20 09:47:05 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return len;
|
|
|
|
|
}
|
|
|
|
|
|
2013-07-23 12:49:30 +00:00
|
|
|
/**
 * Count the characters in the nil terminated string \a strc,
 * stepping by #BLI_str_utf8_size_safe so invalid bytes count as one character each.
 *
 * \param r_len_bytes: set to the number of bytes stepped over (without the terminator).
 * \return the number of characters (not bytes).
 */
size_t BLI_strlen_utf8_ex(const char *strc, size_t *r_len_bytes)
{
  const char *strc_orig = strc;
  size_t len = 0;

  while (*strc != '\0') {
    int step = BLI_str_utf8_size_safe(strc);

    /* A lead byte may announce more bytes than the string contains.
     * Stop at the terminator instead of jumping over it and reading past the end. */
    for (int i = 1; i < step; i++) {
      if (strc[i] == '\0') {
        step = i;
        break;
      }
    }

    strc += step;
    len++;
  }

  *r_len_bytes = (size_t)(strc - strc_orig);
  return len;
}
|
|
|
|
|
|
2011-10-20 09:47:05 +00:00
|
|
|
/**
 * \return the number of characters (not bytes) in the nil terminated string \a strc,
 * see #BLI_strlen_utf8_ex.
 */
size_t BLI_strlen_utf8(const char *strc)
{
  /* Required by the `_ex` version, the value isn't used. */
  size_t len_bytes_unused;
  const size_t len = BLI_strlen_utf8_ex(strc, &len_bytes_unused);
  return len;
}
|
|
|
|
|
|
2013-07-23 12:49:30 +00:00
|
|
|
/**
 * Count the characters in \a strc, reading no more than \a maxlen bytes.
 * A character that would extend past \a maxlen is not counted.
 *
 * \param strc: the string to measure the length.
 * \param maxlen: the string length (in bytes).
 * \param r_len_bytes: set to the number of bytes stepped over.
 * \return the number of characters (not bytes).
 */
size_t BLI_strnlen_utf8_ex(const char *strc, const size_t maxlen, size_t *r_len_bytes)
{
  size_t len = 0;
  const char *strc_orig = strc;
  const char *strc_end = strc + maxlen;

  /* Check the bounds before reading: `strc_end` itself must never be dereferenced. */
  while (strc < strc_end && *strc != '\0') {
    size_t step = (size_t)BLI_str_utf8_size_safe(strc);

    /* Compare sizes rather than forming `strc + step`, which may point past the buffer. */
    if (step > (size_t)(strc_end - strc)) {
      break;
    }

    /* A lead byte may announce more bytes than the string contains,
     * stop at the terminator instead of stepping over it. */
    for (size_t i = 1; i < step; i++) {
      if (strc[i] == '\0') {
        step = i;
        break;
      }
    }

    strc += step;
    len++;
  }

  *r_len_bytes = (size_t)(strc - strc_orig);
  return len;
}
|
|
|
|
|
|
2012-10-27 02:47:39 +00:00
|
|
|
/**
 * A version of #BLI_strnlen_utf8_ex that doesn't report the byte length.
 *
 * \param strc: the string to measure the length.
 * \param maxlen: the string length (in bytes)
 * \return the unicode length (not in bytes!)
 */
size_t BLI_strnlen_utf8(const char *strc, const size_t maxlen)
{
  /* Required by the `_ex` version, the value isn't used. */
  size_t len_bytes_unused;
  const size_t len = BLI_strnlen_utf8_ex(strc, maxlen, &len_bytes_unused);
  return len;
}
|
|
|
|
|
|
2012-11-23 15:12:13 +00:00
|
|
|
size_t BLI_strncpy_wchar_from_utf8(wchar_t *__restrict dst_w,
|
|
|
|
|
const char *__restrict src_c,
|
|
|
|
|
const size_t maxncpy)
|
2011-10-20 09:47:05 +00:00
|
|
|
{
|
2021-01-26 14:56:39 -07:00
|
|
|
#ifdef WIN32
|
2021-08-29 12:06:15 +10:00
|
|
|
conv_utf_8_to_16(src_c, dst_w, maxncpy);
|
|
|
|
|
/* NOTE: it would be more efficient to calculate the length as part of #conv_utf_8_to_16. */
|
|
|
|
|
return wcslen(dst_w);
|
2021-01-26 14:56:39 -07:00
|
|
|
#else
|
|
|
|
|
return BLI_str_utf8_as_utf32((char32_t *)dst_w, src_c, maxncpy);
|
2013-07-15 03:54:57 +00:00
|
|
|
#endif
|
2011-10-20 09:47:05 +00:00
|
|
|
}
|
|
|
|
|
|
2021-06-24 15:56:58 +10:00
|
|
|
/* end wchar_t / utf8 functions */
|
2011-10-20 09:47:05 +00:00
|
|
|
/* --------------------------------------------------------------------------*/
|
|
|
|
|
|
2013-03-12 07:25:53 +00:00
|
|
|
/* Count the columns that a character/string occupies, based on `wcwidth.c`. */

/**
 * \return the column width of \a ucs as reported by #mk_wcwidth.
 */
int BLI_wcwidth(char32_t ucs)
{
  const int columns = mk_wcwidth(ucs);
  return columns;
}
|
|
|
|
|
|
2019-11-22 12:26:54 -03:00
|
|
|
/**
 * \return the column width of the string \a pwcs as reported by #mk_wcswidth,
 * \a n is passed on unchanged.
 */
int BLI_wcswidth(const char32_t *pwcs, size_t n)
{
  const int columns = mk_wcswidth(pwcs, n);
  return columns;
}
|
|
|
|
|
|
|
|
|
|
int BLI_str_utf8_char_width(const char *p)
|
|
|
|
|
{
|
2017-10-28 17:48:45 +11:00
|
|
|
uint unicode = BLI_str_utf8_as_unicode(p);
|
2019-03-27 13:16:10 +11:00
|
|
|
if (unicode == BLI_UTF8_ERR) {
|
2013-03-12 07:25:53 +00:00
|
|
|
return -1;
|
2019-03-27 13:16:10 +11:00
|
|
|
}
|
2013-03-12 07:25:53 +00:00
|
|
|
|
2019-11-22 12:26:54 -03:00
|
|
|
return BLI_wcwidth((char32_t)unicode);
|
2013-03-12 07:25:53 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int BLI_str_utf8_char_width_safe(const char *p)
|
|
|
|
|
{
|
|
|
|
|
int columns;
|
|
|
|
|
|
2017-10-28 17:48:45 +11:00
|
|
|
uint unicode = BLI_str_utf8_as_unicode(p);
|
2019-03-27 13:16:10 +11:00
|
|
|
if (unicode == BLI_UTF8_ERR) {
|
2013-03-12 07:25:53 +00:00
|
|
|
return 1;
|
2019-03-27 13:16:10 +11:00
|
|
|
}
|
2013-03-12 07:25:53 +00:00
|
|
|
|
2019-11-22 12:26:54 -03:00
|
|
|
columns = BLI_wcwidth((char32_t)unicode);
|
2013-03-12 07:25:53 +00:00
|
|
|
|
|
|
|
|
return (columns < 0) ? 1 : columns;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* --------------------------------------------------------------------------*/
|
|
|
|
|
|
2012-11-09 03:36:38 +00:00
|
|
|
/* copied from glib's gutf8.c, added 'Err' arg */

/* NOTE(campbell): glib uses uint for unicode, best we do the same,
 * though we don't typedef it. */

/**
 * Classify the lead byte `Char` of a UTF-8 sequence.
 *
 * \param Char: the first byte of the sequence (as an unsigned value).
 * \param Mask: set to the bit-mask that extracts the payload bits from `Char`,
 * left untouched when `Char` is not a valid lead byte.
 * \param Len: set to the sequence length in bytes (1..6),
 * or to `Err` when `Char` is not a valid lead byte.
 * \param Err: value to store in `Len` on failure.
 *
 * \note Every argument is parenthesized so expressions can be passed safely,
 * `Char` is evaluated more than once so it must not have side effects.
 */
#define UTF8_COMPUTE(Char, Mask, Len, Err) \
  if ((Char) < 128) { \
    (Len) = 1; \
    (Mask) = 0x7f; \
  } \
  else if (((Char) & 0xe0) == 0xc0) { \
    (Len) = 2; \
    (Mask) = 0x1f; \
  } \
  else if (((Char) & 0xf0) == 0xe0) { \
    (Len) = 3; \
    (Mask) = 0x0f; \
  } \
  else if (((Char) & 0xf8) == 0xf0) { \
    (Len) = 4; \
    (Mask) = 0x07; \
  } \
  else if (((Char) & 0xfc) == 0xf8) { \
    (Len) = 5; \
    (Mask) = 0x03; \
  } \
  else if (((Char) & 0xfe) == 0xfc) { \
    (Len) = 6; \
    (Mask) = 0x01; \
  } \
  else { \
    (Len) = (Err); /* -1 is the typical error value or 1 to skip */ \
  } \
  (void)0
|
2011-10-20 09:47:05 +00:00
|
|
|
|
2011-10-21 01:33:06 +00:00
|
|
|
/**
 * Same as the glib define but with an added `Err` argument.
 *
 * Decode `Len` bytes starting at `Chars` into `Out`:
 * the payload bits of the first byte are selected with `Mask`,
 * every following byte contributes its low 6 bits.
 * When a following byte doesn't have the top bits `10`, `Out` is set to `Err` and decoding stops.
 *
 * `Index` is a scratch variable used to step over the bytes.
 */
#define UTF8_GET(Out, Chars, Index, Mask, Len, Err) \
  (Out) = (Chars)[0] & (Mask); \
  (Index) = 1; \
  while ((Index) < (Len)) { \
    if (((Chars)[(Index)] & 0xc0) != 0x80) { \
      (Out) = (Err); \
      break; \
    } \
    (Out) = ((Out) << 6) | ((Chars)[(Index)] & 0x3f); \
    ++(Index); \
  } \
  (void)0
|
2011-10-20 09:47:05 +00:00
|
|
|
|
2011-10-23 13:52:51 +00:00
|
|
|
/* uses glib functions but not from glib */

/**
 * Get the size in bytes of the single UTF-8 character at \a p, from its first byte only.
 *
 * \return the size (1..6), or -1 when the first byte can't start a sequence.
 */
int BLI_str_utf8_size(const char *p)
{
  const unsigned char lead = (unsigned char)*p;
  int seq_len;
  int mask_unused = 0;

  UTF8_COMPUTE(lead, mask_unused, seq_len, -1);

  (void)mask_unused; /* quiet warning */

  return seq_len;
}
|
|
|
|
|
|
|
|
|
|
/**
 * A version of #BLI_str_utf8_size to use when we want to skip errors.
 *
 * \return the size (1..6), a first byte that can't start a sequence is reported as 1 byte.
 */
int BLI_str_utf8_size_safe(const char *p)
{
  const unsigned char lead = (unsigned char)*p;
  int seq_len;
  int mask_unused = 0;

  UTF8_COMPUTE(lead, mask_unused, seq_len, 1);

  (void)mask_unused; /* quiet warning */

  return seq_len;
}
|
|
|
|
|
|
2011-10-21 00:01:22 +00:00
|
|
|
/* was g_utf8_get_char */
|
|
|
|
|
/**
|
|
|
|
|
* BLI_str_utf8_as_unicode:
|
2018-12-12 12:50:58 +11:00
|
|
|
* \param p: a pointer to Unicode character encoded as UTF-8
|
2011-10-21 00:01:22 +00:00
|
|
|
*
|
|
|
|
|
* Converts a sequence of bytes encoded as UTF-8 to a Unicode character.
|
2012-11-26 00:59:11 +00:00
|
|
|
* If \a p does not point to a valid UTF-8 encoded character, results are
|
2011-10-21 00:01:22 +00:00
|
|
|
* undefined. If you are not sure that the bytes are complete
|
|
|
|
|
* valid Unicode characters, you should use g_utf8_get_char_validated()
|
|
|
|
|
* instead.
|
|
|
|
|
*
|
|
|
|
|
* Return value: the resulting character
|
2019-03-19 15:17:46 +11:00
|
|
|
*/
|
2017-10-28 17:48:45 +11:00
|
|
|
uint BLI_str_utf8_as_unicode(const char *p)
|
2011-10-21 00:01:22 +00:00
|
|
|
{
|
2013-05-12 06:33:21 +00:00
|
|
|
int i, len;
|
2017-10-28 17:48:45 +11:00
|
|
|
uint mask = 0;
|
|
|
|
|
uint result;
|
2013-02-19 15:58:38 +00:00
|
|
|
const unsigned char c = (unsigned char)*p;
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2014-03-25 10:10:00 +11:00
|
|
|
UTF8_COMPUTE(c, mask, len, -1);
|
2019-03-27 13:16:10 +11:00
|
|
|
if (UNLIKELY(len == -1)) {
|
2012-03-24 07:36:32 +00:00
|
|
|
return BLI_UTF8_ERR;
|
2019-03-27 13:16:10 +11:00
|
|
|
}
|
2014-03-25 10:10:00 +11:00
|
|
|
UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2012-03-24 07:36:32 +00:00
|
|
|
return result;
|
2011-10-21 00:01:22 +00:00
|
|
|
}
|
|
|
|
|
|
2021-07-03 23:08:40 +10:00
|
|
|
/**
|
2021-08-25 15:18:57 +10:00
|
|
|
* UTF8 decoding that steps over the index (unless an error is encountered).
|
2021-08-24 13:25:26 +10:00
|
|
|
*
|
|
|
|
|
* \param p: The text to step over.
|
|
|
|
|
* \param p_len: The length of `p`.
|
|
|
|
|
* \param index: Index of `p` to step over.
|
2021-08-25 15:18:57 +10:00
|
|
|
* \return the code-point or #BLI_UTF8_ERR if there is a decoding error.
|
2021-08-24 13:25:26 +10:00
|
|
|
*
|
|
|
|
|
* \note The behavior for clipped text (where `p_len` limits decoding trailing bytes)
|
|
|
|
|
* must have the same behavior is encountering a nil byte,
|
|
|
|
|
* so functions that only use the first part of a string has matching behavior to functions
|
|
|
|
|
* that null terminate the text.
|
2021-07-03 23:08:40 +10:00
|
|
|
*/
|
2021-08-25 15:18:57 +10:00
|
|
|
uint BLI_str_utf8_as_unicode_step_or_error(const char *__restrict p,
|
|
|
|
|
const size_t p_len,
|
|
|
|
|
size_t *__restrict index)
|
2011-10-21 00:01:22 +00:00
|
|
|
{
|
2013-05-12 06:33:21 +00:00
|
|
|
int i, len;
|
2017-10-28 17:48:45 +11:00
|
|
|
uint mask = 0;
|
|
|
|
|
uint result;
|
2021-08-25 15:18:57 +10:00
|
|
|
const unsigned char c = (unsigned char)*(p += *index);
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2021-08-24 13:25:26 +10:00
|
|
|
BLI_assert(*index < p_len);
|
|
|
|
|
BLI_assert(c != '\0');
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2014-03-25 10:10:00 +11:00
|
|
|
UTF8_COMPUTE(c, mask, len, -1);
|
2021-08-25 15:18:57 +10:00
|
|
|
if (UNLIKELY(len == -1) || (*index + (size_t)len > p_len)) {
|
|
|
|
|
return BLI_UTF8_ERR;
|
2011-10-21 01:33:06 +00:00
|
|
|
}
|
2021-08-25 15:18:57 +10:00
|
|
|
UTF8_GET(result, p, i, mask, len, BLI_UTF8_ERR);
|
|
|
|
|
if (UNLIKELY(result == BLI_UTF8_ERR)) {
|
|
|
|
|
return BLI_UTF8_ERR;
|
2021-08-24 13:25:26 +10:00
|
|
|
}
|
2013-05-12 06:33:21 +00:00
|
|
|
*index += (size_t)len;
|
2021-08-24 13:25:26 +10:00
|
|
|
BLI_assert(*index <= p_len);
|
2011-10-21 00:01:22 +00:00
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
2021-08-25 15:18:57 +10:00
|
|
|
/**
|
|
|
|
|
* UTF8 decoding that steps over the index (unless an error is encountered).
|
|
|
|
|
*
|
|
|
|
|
* \param p: The text to step over.
|
|
|
|
|
* \param p_len: The length of `p`.
|
|
|
|
|
* \param index: Index of `p` to step over.
|
|
|
|
|
* \return the code-point `(p + *index)` if there is a decoding error.
|
|
|
|
|
*
|
|
|
|
|
* \note Falls back to `LATIN1` for text drawing.
|
|
|
|
|
*/
|
|
|
|
|
uint BLI_str_utf8_as_unicode_step(const char *__restrict p,
|
|
|
|
|
const size_t p_len,
|
|
|
|
|
size_t *__restrict index)
|
|
|
|
|
{
|
|
|
|
|
uint result = BLI_str_utf8_as_unicode_step_or_error(p, p_len, index);
|
|
|
|
|
if (UNLIKELY(result == BLI_UTF8_ERR)) {
|
|
|
|
|
result = (uint)p[*index];
|
|
|
|
|
*index += 1;
|
|
|
|
|
}
|
|
|
|
|
BLI_assert(*index <= p_len);
|
|
|
|
|
return result;
|
|
|
|
|
}
|
|
|
|
|
|
2011-10-21 00:01:22 +00:00
|
|
|
/* was g_unichar_to_utf8 */
|
2021-08-28 22:44:55 +10:00
|
|
|
|
|
|
|
|
/**
 * Compute the values needed to encode a code-point as UTF-8.
 *
 * \param Char: The code-point (evaluated more than once, so it must be free of side effects).
 * \param First: L-value that receives the marker bits of the leading byte (0 for one byte).
 * \param Len: L-value that receives the encoded length in bytes (1 to 6).
 *
 * Arguments are parenthesized and the body is wrapped in `do { } while (0)` so the macro
 * behaves as a single statement, also inside an un-braced `if` / `else`.
 */
#define UTF8_VARS_FROM_CHAR32(Char, First, Len) \
  do { \
    if ((Char) < 0x80) { \
      (First) = 0; \
      (Len) = 1; \
    } \
    else if ((Char) < 0x800) { \
      (First) = 0xc0; \
      (Len) = 2; \
    } \
    else if ((Char) < 0x10000) { \
      (First) = 0xe0; \
      (Len) = 3; \
    } \
    else if ((Char) < 0x200000) { \
      (First) = 0xf0; \
      (Len) = 4; \
    } \
    else if ((Char) < 0x4000000) { \
      (First) = 0xf8; \
      (Len) = 5; \
    } \
    else { \
      (First) = 0xfc; \
      (Len) = 6; \
    } \
  } while (0)
|
|
|
|
|
|
|
|
|
|
size_t BLI_str_utf8_from_unicode_len(const uint c)
|
|
|
|
|
{
|
|
|
|
|
/* If this gets modified, also update the copy in g_string_insert_unichar() */
|
|
|
|
|
uint len = 0;
|
|
|
|
|
uint first;
|
|
|
|
|
|
|
|
|
|
UTF8_VARS_FROM_CHAR32(c, first, len);
|
|
|
|
|
(void)first;
|
|
|
|
|
|
|
|
|
|
return len;
|
|
|
|
|
}
|
|
|
|
|
|
2011-10-21 00:01:22 +00:00
|
|
|
/**
|
|
|
|
|
* BLI_str_utf8_from_unicode:
|
2021-08-28 22:44:55 +10:00
|
|
|
*
|
2015-05-20 12:54:45 +10:00
|
|
|
* \param c: a Unicode character code
|
2021-08-28 22:44:55 +10:00
|
|
|
* \param outbuf: output buffer, must have at least `outbuf_len` bytes of space.
|
|
|
|
|
* If the length required by `c` exceeds `outbuf_len`,
|
|
|
|
|
* the bytes available bytes will be zeroed and `outbuf_len` returned.
|
2011-10-21 00:01:22 +00:00
|
|
|
*
|
|
|
|
|
* Converts a single character to UTF-8.
|
|
|
|
|
*
|
2021-08-28 22:44:55 +10:00
|
|
|
* \return number of bytes written.
|
2019-03-19 15:17:46 +11:00
|
|
|
*/
|
2021-08-28 22:44:55 +10:00
|
|
|
size_t BLI_str_utf8_from_unicode(uint c, char *outbuf, const size_t outbuf_len)
|
|
|
|
|
|
2011-10-21 00:01:22 +00:00
|
|
|
{
|
|
|
|
|
/* If this gets modified, also update the copy in g_string_insert_unichar() */
|
2017-10-28 17:48:45 +11:00
|
|
|
uint len = 0;
|
|
|
|
|
uint first;
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2021-08-28 22:44:55 +10:00
|
|
|
UTF8_VARS_FROM_CHAR32(c, first, len);
|
|
|
|
|
|
|
|
|
|
if (UNLIKELY(outbuf_len < len)) {
|
|
|
|
|
/* NULL terminate instead of writing a partial byte. */
|
|
|
|
|
memset(outbuf, 0x0, outbuf_len);
|
|
|
|
|
return outbuf_len;
|
2011-10-21 00:01:22 +00:00
|
|
|
}
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2021-08-28 22:44:55 +10:00
|
|
|
for (uint i = len - 1; i > 0; i--) {
|
|
|
|
|
outbuf[i] = (c & 0x3f) | 0x80;
|
|
|
|
|
c >>= 6;
|
2011-10-21 00:01:22 +00:00
|
|
|
}
|
2021-08-28 22:44:55 +10:00
|
|
|
outbuf[0] = c | first;
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2011-10-21 00:01:22 +00:00
|
|
|
return len;
|
|
|
|
|
}
|
|
|
|
|
|
2019-11-22 12:26:54 -03:00
|
|
|
size_t BLI_str_utf8_as_utf32(char32_t *__restrict dst_w,
|
|
|
|
|
const char *__restrict src_c,
|
|
|
|
|
const size_t maxncpy)
|
|
|
|
|
{
|
|
|
|
|
const size_t maxlen = maxncpy - 1;
|
|
|
|
|
size_t len = 0;
|
|
|
|
|
|
|
|
|
|
BLI_assert(maxncpy != 0);
|
|
|
|
|
|
|
|
|
|
#ifdef DEBUG_STRSIZE
|
|
|
|
|
memset(dst_w, 0xff, sizeof(*dst_w) * maxncpy);
|
|
|
|
|
#endif
|
|
|
|
|
|
2021-08-25 15:19:00 +10:00
|
|
|
const size_t src_c_len = strlen(src_c);
|
|
|
|
|
const char *src_c_end = src_c + src_c_len;
|
|
|
|
|
size_t index = 0;
|
2021-08-25 15:36:40 +10:00
|
|
|
while ((index < src_c_len) && (len != maxlen)) {
|
2021-08-25 15:19:00 +10:00
|
|
|
const uint unicode = BLI_str_utf8_as_unicode_step_or_error(src_c, src_c_len, &index);
|
2019-11-22 12:26:54 -03:00
|
|
|
if (unicode != BLI_UTF8_ERR) {
|
|
|
|
|
*dst_w = unicode;
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
*dst_w = '?';
|
2021-08-25 15:19:00 +10:00
|
|
|
const char *src_c_next = BLI_str_find_next_char_utf8(src_c + index, src_c_end);
|
2021-08-27 16:42:31 +10:00
|
|
|
index = (size_t)(src_c_next - src_c);
|
2019-11-22 12:26:54 -03:00
|
|
|
}
|
|
|
|
|
dst_w++;
|
|
|
|
|
len++;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
*dst_w = 0;
|
|
|
|
|
|
|
|
|
|
return len;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
size_t BLI_str_utf32_as_utf8(char *__restrict dst,
|
|
|
|
|
const char32_t *__restrict src,
|
|
|
|
|
const size_t maxncpy)
|
|
|
|
|
{
|
|
|
|
|
BLI_assert(maxncpy != 0);
|
2021-08-28 22:44:55 +10:00
|
|
|
size_t len = 0;
|
2019-11-22 12:26:54 -03:00
|
|
|
#ifdef DEBUG_STRSIZE
|
|
|
|
|
memset(dst, 0xff, sizeof(*dst) * maxncpy);
|
|
|
|
|
#endif
|
2021-08-28 22:44:55 +10:00
|
|
|
while (*src && len < maxncpy) {
|
|
|
|
|
len += BLI_str_utf8_from_unicode((uint)*src++, dst + len, maxncpy - len);
|
2019-11-22 12:26:54 -03:00
|
|
|
}
|
|
|
|
|
dst[len] = '\0';
|
2021-08-28 22:44:55 +10:00
|
|
|
/* Return the correct length when part of the final byte did not fit into the string. */
|
|
|
|
|
while ((len > 0) && UNLIKELY(dst[len - 1] == '\0')) {
|
|
|
|
|
len--;
|
|
|
|
|
}
|
2019-11-22 12:26:54 -03:00
|
|
|
return len;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* utf32 len in utf8 */
|
|
|
|
|
size_t BLI_str_utf32_as_utf8_len(const char32_t *src)
|
|
|
|
|
{
|
|
|
|
|
size_t len = 0;
|
|
|
|
|
|
|
|
|
|
while (*src) {
|
2021-08-28 22:44:55 +10:00
|
|
|
len += BLI_str_utf8_from_unicode_len((uint)*src++);
|
2019-11-22 12:26:54 -03:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return len;
|
|
|
|
|
}
|
|
|
|
|
|
2011-10-21 00:01:22 +00:00
|
|
|
/* was g_utf8_find_prev_char */
|
2011-10-16 12:25:42 +00:00
|
|
|
/**
 * BLI_str_find_prev_char_utf8:
 * \param p: pointer to some position within the string starting at \a str_start
 * \param str_start: pointer to the beginning of a UTF-8 encoded string
 *
 * Given a position \a p within the UTF-8 encoded string \a str_start, find the start
 * of the previous UTF-8 character starting before \a p. Returns \a str_start if no
 * UTF-8 characters are present in \a str_start before \a p.
 *
 * \a p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
 *
 * \return A pointer to the found character.
 */
const char *BLI_str_find_prev_char_utf8(const char *p, const char *str_start)
{
  BLI_assert(p >= str_start);
  /* Only step back while a previous byte exists, so the pointer never moves before
   * `str_start` (which happened when every byte before `p` was a continuation byte). */
  while (p > str_start) {
    p--;
    /* Anything other than a continuation byte (10xxxxxx) starts a character. */
    if ((*p & 0xc0) != 0x80) {
      return p;
    }
  }
  /* `p` equals `str_start` here, unless the caller violated `p >= str_start`,
   * in which case `p` is returned unchanged. */
  return p;
}
|
|
|
|
|
|
2011-10-21 00:01:22 +00:00
|
|
|
/* was g_utf8_find_next_char */
|
2011-10-16 12:25:42 +00:00
|
|
|
/**
 * BLI_str_find_next_char_utf8:
 * \param p: a pointer to a position within a UTF-8 encoded string
 * \param str_end: a pointer to the byte following the end of the string.
 *
 * Finds the start of the next UTF-8 character in the string after \a p
 *
 * \a p does not have to be at the beginning of a UTF-8 character. No check
 * is made to see if the character found is actually valid other than
 * it starts with an appropriate byte.
 *
 * \return a pointer to the found character, or a pointer to the end of the text
 * (\a str_end or the null terminating character '\0', whichever is reached first).
 */
const char *BLI_str_find_next_char_utf8(const char *p, const char *str_end)
{
  BLI_assert(p <= str_end);

  /* Already at the end of the text: there is nothing to step over. */
  if ((p >= str_end) || (*p == '\0')) {
    return p;
  }

  /* Step over the current byte, then over any continuation bytes (10xxxxxx). */
  do {
    p++;
  } while ((p < str_end) && ((*p & 0xc0) == 0x80));

  return p;
}
|
2014-07-04 14:14:06 +02:00
|
|
|
|
2017-10-28 17:48:45 +11:00
|
|
|
size_t BLI_str_partition_utf8(const char *str,
|
|
|
|
|
const uint delim[],
|
|
|
|
|
const char **sep,
|
|
|
|
|
const char **suf)
|
2014-07-04 14:14:06 +02:00
|
|
|
{
|
2015-06-27 10:22:29 +02:00
|
|
|
return BLI_str_partition_ex_utf8(str, NULL, delim, sep, suf, false);
|
2014-07-04 14:14:06 +02:00
|
|
|
}
|
|
|
|
|
|
2017-10-28 17:48:45 +11:00
|
|
|
size_t BLI_str_rpartition_utf8(const char *str,
|
|
|
|
|
const uint delim[],
|
|
|
|
|
const char **sep,
|
|
|
|
|
const char **suf)
|
2014-07-04 14:14:06 +02:00
|
|
|
{
|
2015-06-27 10:22:29 +02:00
|
|
|
return BLI_str_partition_ex_utf8(str, NULL, delim, sep, suf, true);
|
2014-07-04 14:14:06 +02:00
|
|
|
}
|
|
|
|
|
|
2015-06-27 10:22:29 +02:00
|
|
|
size_t BLI_str_partition_ex_utf8(const char *str,
|
2017-10-28 17:48:45 +11:00
|
|
|
const char *end,
|
|
|
|
|
const uint delim[],
|
|
|
|
|
const char **sep,
|
|
|
|
|
const char **suf,
|
|
|
|
|
const bool from_right)
|
2014-07-04 14:14:06 +02:00
|
|
|
{
|
2015-06-27 10:22:29 +02:00
|
|
|
const size_t str_len = end ? (size_t)(end - str) : strlen(str);
|
2021-08-27 16:42:31 +10:00
|
|
|
if (end == NULL) {
|
|
|
|
|
end = str + str_len;
|
|
|
|
|
}
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2015-06-27 10:22:29 +02:00
|
|
|
/* Note that here, we assume end points to a valid utf8 char! */
|
2021-08-27 16:42:31 +10:00
|
|
|
BLI_assert((end >= str) && (BLI_str_utf8_as_unicode(end) != BLI_UTF8_ERR));
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2014-07-04 14:14:06 +02:00
|
|
|
*suf = (char *)(str + str_len);
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2021-08-27 16:42:31 +10:00
|
|
|
size_t index;
|
|
|
|
|
for (*sep = (char *)(from_right ? BLI_str_find_prev_char_utf8(end, str) : str), index = 0;
|
|
|
|
|
from_right ? (*sep > str) : ((*sep < end) && (**sep != '\0'));
|
|
|
|
|
*sep = (char *)(from_right ? (str != *sep ? BLI_str_find_prev_char_utf8(*sep, str) : NULL) :
|
|
|
|
|
str + index)) {
|
2021-08-25 15:19:00 +10:00
|
|
|
size_t index_ofs = 0;
|
|
|
|
|
const uint c = BLI_str_utf8_as_unicode_step_or_error(*sep, (size_t)(end - *sep), &index_ofs);
|
|
|
|
|
index += index_ofs;
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2014-07-04 14:14:06 +02:00
|
|
|
if (c == BLI_UTF8_ERR) {
|
|
|
|
|
*suf = *sep = NULL;
|
|
|
|
|
break;
|
|
|
|
|
}
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2021-08-27 16:42:31 +10:00
|
|
|
for (const uint *d = delim; *d != '\0'; d++) {
|
2014-07-04 14:14:06 +02:00
|
|
|
if (*d == c) {
|
|
|
|
|
/* *suf is already correct in case from_right is true. */
|
2019-03-27 13:16:10 +11:00
|
|
|
if (!from_right) {
|
2014-07-04 14:14:06 +02:00
|
|
|
*suf = (char *)(str + index);
|
2019-03-27 13:16:10 +11:00
|
|
|
}
|
2014-07-04 14:14:06 +02:00
|
|
|
return (size_t)(*sep - str);
|
|
|
|
|
}
|
|
|
|
|
}
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2014-07-04 14:14:06 +02:00
|
|
|
*suf = *sep; /* Useful in 'from_right' case! */
|
|
|
|
|
}
|
2019-04-17 06:17:24 +02:00
|
|
|
|
2014-07-04 14:14:06 +02:00
|
|
|
*suf = *sep = NULL;
|
|
|
|
|
return str_len;
|
|
|
|
|
}
|
2019-08-06 17:16:27 +10:00
|
|
|
|
|
|
|
|
/* -------------------------------------------------------------------- */
|
|
|
|
|
/** \name Offset Conversion in Strings
|
|
|
|
|
* \{ */
|
|
|
|
|
|
|
|
|
|
/**
 * Convert a byte offset in \a str into a character index.
 *
 * \param str: UTF-8 text.
 * \param offset: Byte offset into \a str.
 * \return The number of characters stepped over to reach (or pass) \a offset,
 * zero when \a offset is not positive.
 */
int BLI_str_utf8_offset_to_index(const char *str, int offset)
{
  int index = 0, pos = 0;
  /* Compare with `<` (as #BLI_str_utf8_offset_to_column does): with `!=` a negative offset,
   * or one pointing inside a multi-byte character, was stepped over without ever matching.
   * The `_safe` size is used for the same reason, so `pos` always moves forward
   * (NOTE(review): assumes it never returns a value below one - confirm). */
  while (pos < offset) {
    pos += BLI_str_utf8_size_safe(str + pos);
    index++;
  }
  return index;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Convert a character index in \a str into a byte offset.
 *
 * \param str: Nil terminated UTF-8 text.
 * \param index: Character index into \a str.
 * \return The byte offset of the character at \a index,
 * zero when \a index is not positive.
 */
int BLI_str_utf8_offset_from_index(const char *str, int index)
{
  int offset = 0, pos = 0;
  /* Mirrors #BLI_str_utf8_offset_from_column: stop at the nil terminator instead of reading
   * past the end of the text when `index` exceeds the number of characters, and compare
   * with `<` so a negative index ends the loop immediately instead of never matching.
   * The `_safe` size keeps `offset` moving forward over invalid bytes
   * (NOTE(review): assumes it never returns a value below one - confirm). */
  while ((pos < index) && (str[offset] != '\0')) {
    offset += BLI_str_utf8_size_safe(str + offset);
    pos++;
  }
  return offset;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Convert a byte offset in \a str into a column.
 *
 * \param str: UTF-8 text.
 * \param offset: Byte offset into \a str.
 * \return The summed width (see #BLI_str_utf8_char_width_safe)
 * of the characters starting before \a offset.
 */
int BLI_str_utf8_offset_to_column(const char *str, int offset)
{
  int column = 0;
  for (int pos = 0; pos < offset; pos += BLI_str_utf8_size_safe(str + pos)) {
    column += BLI_str_utf8_char_width_safe(str + pos);
  }
  return column;
}
|
|
|
|
|
|
|
|
|
|
/**
 * Convert a column in \a str into a byte offset.
 *
 * \param str: Nil terminated UTF-8 text.
 * \param column: The column to find.
 * \return The byte offset reached after stepping over as many characters as fit
 * within \a column (a character that would extend beyond it is not stepped over).
 */
int BLI_str_utf8_offset_from_column(const char *str, int column)
{
  int offset = 0;
  int columns_used = 0;

  for (;;) {
    const char *cursor = str + offset;
    if ((*cursor == '\0') || (columns_used >= column)) {
      break;
    }

    const int width = BLI_str_utf8_char_width_safe(cursor);
    if (columns_used + width > column) {
      /* This character would end past the requested column. */
      break;
    }

    offset += BLI_str_utf8_size_safe(cursor);
    columns_used += width;
  }

  return offset;
}
|
|
|
|
|
|
|
|
|
|
/** \} */
|