This repository has been archived on 2023-10-09. You can view files and clone it, but cannot push or open issues or pull requests.
Files
blender-archive/source/blender/blenlib/intern/string.c
Campbell Barton f28b5e672e python/utf8 compatibility fixes. (as discussed on the mailing list)
- User input has non-UTF-8 characters stripped from all text input other than file paths.

- Python has the same limitation: it will raise an error on non-UTF-8 strings. The exception is paths, which use unicode escape literals, so it is still possible to deal with saving to these file paths from Python.

- new string functions
  BLI_utf8_invalid_byte(str, len) returns the index of the first invalid UTF-8 byte, or -1 on success.
  BLI_utf8_invalid_strip(str, len) strips non utf-8 chars.
2010-08-28 12:34:22 +00:00

462 lines
11 KiB
C

/* string.c
*
* various string, file, list operations.
*
*
* $Id$
*
* ***** BEGIN GPL LICENSE BLOCK *****
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version 2
* of the License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
*
* The Original Code is Copyright (C) 2001-2002 by NaN Holding BV.
* All rights reserved.
*
* The Original Code is: all of this file.
*
* Contributor(s): none yet.
*
* ***** END GPL LICENSE BLOCK *****
*
*/
#include <string.h>
#include <stdlib.h>
#include <stdarg.h>
#include <ctype.h>
#include "MEM_guardedalloc.h"
#include "BLI_dynstr.h"
#include "BLI_string.h"
/* Copy the first 'len' bytes of 'str' into a buffer of len+1 bytes obtained
 * from MEM_mallocN() and NUL-terminate it.
 *
 * - str: source bytes (at least 'len' of them are read)
 * - len: number of bytes to copy
 *
 * Returns the newly allocated copy.
 */
char *BLI_strdupn(const char *str, int len)
{
    char *copy = MEM_mallocN(len + 1, "strdup");
    char *terminator = copy + len;

    memcpy(copy, str, len);
    *terminator = '\0';

    return copy;
}
/* Duplicate the whole NUL-terminated string 'str'.
 * The length is measured with strlen() and the copy is made by BLI_strdupn().
 */
char *BLI_strdup(const char *str)
{
    const int len = strlen(str);

    return BLI_strdupn(str, len);
}
/* Concatenate 'str1' and 'str2' into a newly allocated string.
 *
 * The buffer comes from MEM_mallocN() and is sized to hold both strings
 * plus the terminating NUL.
 */
char *BLI_strdupcat(const char *str1, const char *str2)
{
    const int total = strlen(str1) + strlen(str2);
    char *joined = MEM_mallocN(total + 1, "strdupcat");

    /* strcpy() returns its destination, so the append can be chained */
    strcat(strcpy(joined, str1), str2);

    return joined;
}
/* Bounded string copy that always NUL-terminates the destination.
 *
 * - dst: destination buffer, at least 'maxncpy' bytes large
 * - src: NUL-terminated source string
 * - maxncpy: size of 'dst' in bytes, including room for the terminator
 *
 * At most maxncpy-1 bytes of 'src' are copied. When 'maxncpy' is zero or
 * negative there is no room for even the terminator, so 'dst' is left
 * untouched (previously this case handed a negative size to memcpy).
 *
 * Returns 'dst'.
 */
char *BLI_strncpy(char *dst, const char *src, int maxncpy)
{
    size_t limit, cpylen = 0;

    if (maxncpy <= 0)
        return dst;

    /* only scan as much of the source as could be copied */
    limit = (size_t)(maxncpy - 1);
    while (cpylen < limit && src[cpylen] != '\0')
        cpylen++;

    memcpy(dst, src, cpylen);
    dst[cpylen] = '\0';

    return dst;
}
/* printf-style formatting into a fixed-size buffer, always NUL-terminated.
 *
 * - buffer: destination buffer
 * - count: size of 'buffer' in bytes
 * - format, ...: printf-style format string and arguments
 *
 * Returns whatever vsnprintf() returned. When that value is negative or
 * not smaller than 'count', the output was truncated and the terminator is
 * forced into the last byte of the buffer.
 *
 * A 'count' of zero leaves the buffer untouched (previously this wrote to
 * buffer[count-1], i.e. out of bounds).
 */
int BLI_snprintf(char *buffer, size_t count, const char *format, ...)
{
    int n;
    va_list arg;

    va_start(arg, format);
    n = vsnprintf(buffer, count, format, arg);
    va_end(arg);

    if (count > 0) {
        /* compare in size_t only once 'n' is known to be non-negative */
        if (n >= 0 && (size_t)n < count)
            buffer[n] = '\0';
        else
            buffer[count - 1] = '\0';
    }

    return n;
}
/* printf-style formatting into a string of whatever length is needed.
 *
 * The text is assembled in a temporary DynStr, converted to a C string with
 * BLI_dynstr_get_cstring(), and the DynStr is freed before returning.
 *
 * Returns the C string produced by BLI_dynstr_get_cstring().
 */
char *BLI_sprintfN(const char *format, ...)
{
    DynStr *accum = BLI_dynstr_new();
    char *result;
    va_list args;

    va_start(args, format);
    BLI_dynstr_vappendf(accum, format, args);
    va_end(args);

    result = BLI_dynstr_get_cstring(accum);
    BLI_dynstr_free(accum);

    return result;
}
/* Makes a copy of the text within the "" that appear after some text 'blahblah'
 * i.e. for string 'pose["apples"]' with prefix 'pose[', it should grab "apples"
 *
 * - str: is the entire string to chop
 * - prefix: is the part of the string to leave out
 *
 * The character directly after the prefix is skipped on the assumption that
 * it is the opening quote; the copy runs up to (not including) the next ".
 *
 * Returns a newly allocated string made by BLI_strdupn(). When the prefix is
 * not found, nothing follows it, or there is no closing quote, a newly
 * allocated empty string is returned, so the result can be released the same
 * way in every case.
 */
char *BLI_getQuotedStr (const char *str, const char *prefix)
{
    const char *startMatch, *endMatch;

    /* find the prefix; strstr() gives NULL when it is absent */
    startMatch= strstr(str, prefix);
    if (startMatch != NULL) {
        /* step over the prefix itself */
        startMatch += strlen(prefix);

        /* only skip the opening quote when there is a character to skip,
         * otherwise we would walk past the end of the string */
        if (*startMatch != '\0') {
            startMatch++;

            /* the slice ends at the next quote after the starting point */
            endMatch= strchr(startMatch, '"');
            if (endMatch != NULL)
                return BLI_strdupn(startMatch, (int)(endMatch-startMatch));
        }
    }

    /* nothing usable was found */
    return BLI_strdupn("", 0);
}
/* Replaces all occurrences of oldText with newText in str, returning a new
 * string that doesn't contain the 'replaced' occurrences.
 *
 * - str: string to search. NOTE: each match position is temporarily
 *   overwritten with '\0' and then restored while the result is assembled,
 *   so 'str' must point to writable memory.
 * - oldText: text to look for
 * - newText: text to put in its place
 *
 * Returns NULL when 'str' is NULL or empty. Returns a plain copy of 'str'
 * when 'oldText' or 'newText' is NULL, when 'oldText' is empty, or when there
 * is no match. Otherwise returns the string built through a DynStr.
 */
// A rather wasteful string-replacement utility, though this shall do for now...
// Feel free to replace this with an even safer + nicer alternative
char *BLI_replacestr(char *str, const char *oldText, const char *newText)
{
    DynStr *ds= NULL;
    int lenOld;
    char *match;

    /* sanity checks */
    if ((str == NULL) || (str[0]==0))
        return NULL;
    else if ((oldText == NULL) || (newText == NULL) || (oldText[0]==0))
        return BLI_strdup(str);

    /* only measure oldText once it is known to be non-NULL */
    lenOld= strlen(oldText);

    /* while we can still find a match for the old substring that we're searching for,
     * keep dicing and replacing
     */
    while ( (match = strstr(str, oldText)) ) {
        /* the assembly buffer only gets created when we actually need to rebuild the string */
        if (ds == NULL)
            ds= BLI_dynstr_new();

        /* if the match position does not match the current position in the string,
         * copy the text up to this position and advance the current position in the string
         */
        if (str != match) {
            /* replace the token at the 'match' position with \0 so that the copied string will be ok,
             * add the segment of the string from str to match to the buffer, then restore the value at match
             */
            match[0]= 0;
            BLI_dynstr_append(ds, str);
            match[0]= oldText[0];

            /* now our current position should be set on the start of the match */
            str= match;
        }

        /* add the replacement text to the accumulation buffer */
        BLI_dynstr_append(ds, newText);

        /* advance the current position of the string up to the end of the replaced segment */
        str += lenOld;
    }

    /* finish off and return a new string that has had all occurrences replaced */
    if (ds) {
        char *newStr;

        /* add what's left of the string to the assembly buffer
         * - we've been adjusting str to point at the end of the replaced segments
         */
        BLI_dynstr_append(ds, str);

        /* convert to new c-string (MEM_malloc'd), and free the buffer */
        newStr= BLI_dynstr_get_cstring(ds);
        BLI_dynstr_free(ds);

        return newStr;
    }

    /* no match: just create a new copy of the entire string - we avoid going
     * through the assembly buffer for what should be a bit more efficiency...
     */
    return BLI_strdup(str);
}
/* Exact (case-sensitive) string equality.
 * Returns 1 when strcmp() reports the strings as equal, 0 otherwise.
 */
int BLI_streq(const char *a, const char *b)
{
    if (strcmp(a, b) != 0)
        return 0;

    return 1;
}
/* Case-insensitive string equality.
 * Returns 1 when BLI_strcasecmp() reports the strings as equal, 0 otherwise.
 */
int BLI_strcaseeq(const char *a, const char *b)
{
    const int order = BLI_strcasecmp(a, b);

    return (order == 0);
}
/* strcasestr not available in MSVC
 *
 * Case-insensitive substring search.
 *
 * - s: string to search in
 * - find: string to search for
 *
 * Returns a pointer to the first position in 's' where 'find' matches when
 * case is ignored, 's' itself when 'find' is empty, or NULL when there is no
 * match.
 *
 * Characters are converted to unsigned char before being handed to
 * tolower(): passing a negative plain char (any byte >= 0x80 where char is
 * signed) is undefined behaviour.
 */
char *BLI_strcasestr(const char *s, const char *find)
{
    char c, sc;
    size_t len;

    if ((c = *find++) != 0) {
        c= tolower((unsigned char)c);
        /* 'find' now points past its first character; 'len' is the rest */
        len = strlen(find);
        do {
            /* scan for the next occurrence of the first character... */
            do {
                if ((sc = *s++) == 0)
                    return (NULL);
                sc= tolower((unsigned char)sc);
            } while (sc != c);
            /* ...then check that the remainder matches as well */
        } while (BLI_strncasecmp(s, find, (int)len) != 0);
        /* step back onto the first character of the match */
        s--;
    }
    return ((char *) s);
}
/* Case-insensitive string comparison.
 *
 * Both strings are compared character by character after tolower().
 * Returns -1 when 's1' sorts before 's2', 1 when it sorts after, and 0 when
 * the strings are equal ignoring case.
 *
 * Each character is converted to unsigned char before calling tolower():
 * passing a negative plain char (any byte >= 0x80 where char is signed) is
 * undefined behaviour. The comparison itself is still done on plain char so
 * the resulting order is unchanged.
 */
int BLI_strcasecmp(const char *s1, const char *s2) {
    int i;

    for (i=0; ; i++) {
        char c1 = tolower((unsigned char)s1[i]);
        char c2 = tolower((unsigned char)s2[i]);

        if (c1<c2) {
            return -1;
        } else if (c1>c2) {
            return 1;
        } else if (c1==0) {
            /* both strings ended at the same place */
            break;
        }
    }

    return 0;
}
/* Case-insensitive comparison of at most 'n' characters.
 *
 * Returns -1 when 's1' sorts before 's2', 1 when it sorts after, and 0 when
 * the first 'n' characters are equal ignoring case, when both strings end
 * earlier at the same place, or when 'n' is zero or negative.
 *
 * Each character is converted to unsigned char before calling tolower():
 * passing a negative plain char (any byte >= 0x80 where char is signed) is
 * undefined behaviour. The comparison itself is still done on plain char so
 * the resulting order is unchanged.
 */
int BLI_strncasecmp(const char *s1, const char *s2, int n) {
    int i;

    for (i=0; i<n; i++) {
        char c1 = tolower((unsigned char)s1[i]);
        char c2 = tolower((unsigned char)s2[i]);

        if (c1<c2) {
            return -1;
        } else if (c1>c2) {
            return 1;
        } else if (c1==0) {
            /* both strings ended before 'n' characters were compared */
            break;
        }
    }

    return 0;
}
/* natural string compare, keeping numbers in order
 *
 * Characters are compared case-insensitively, except that where both strings
 * have a digit at the current position the two runs of digits are compared
 * by numeric value, so "file2" sorts before "file10".
 *
 * Returns -1 when 's1' sorts before 's2', 1 when it sorts after, 0 when they
 * compare equal.
 *
 * The numeric values are compared as 'long', exactly as strtol() returns
 * them; narrowing them to 'int' could wrap large numbers around and invert
 * the order. Runs too large for 'long' are clamped by strtol() and then
 * compare as equal numbers.
 *
 * Characters are converted to unsigned char before being passed to
 * tolower()/isdigit(): passing a negative plain char is undefined behaviour.
 */
int BLI_natstrcmp(const char *s1, const char *s2)
{
    int d1= 0, d2= 0;

    /* if both chars are numeric, do a strtol().
       then increase string deltas as long as they are
       numeric, else do a tolower and char compare */
    while(1) {
        char c1 = tolower((unsigned char)s1[d1]);
        char c2 = tolower((unsigned char)s2[d2]);

        if( isdigit((unsigned char)c1) && isdigit((unsigned char)c2) ) {
            const long val1= strtol(s1+d1, (char **)NULL, 10);
            const long val2= strtol(s2+d2, (char **)NULL, 10);

            if (val1<val2) {
                return -1;
            } else if (val1>val2) {
                return 1;
            }

            /* equal numbers: step over both digit runs */
            d1++;
            while( isdigit((unsigned char)s1[d1]) )
                d1++;
            d2++;
            while( isdigit((unsigned char)s2[d2]) )
                d2++;

            /* continue with the characters that follow the numbers */
            c1 = tolower((unsigned char)s1[d1]);
            c2 = tolower((unsigned char)s2[d2]);
        }

        if (c1<c2) {
            return -1;
        } else if (c1>c2) {
            return 1;
        } else if (c1==0) {
            break;
        }

        d1++;
        d2++;
    }

    return 0;
}
/* Format a time in seconds as text.
 *
 * - _time: time in seconds
 * - str: destination buffer, has to be 12 bytes long
 *
 * Output is "hr:min:sec.hundredths" when the hour part is non-zero and
 * "min:sec.hundredths" otherwise, each field printed with at least 2 digits.
 *
 * Formatting is bounded to the 12 bytes of the buffer: text that would not
 * fit (e.g. 100 hours or more) is cut to 11 characters plus the terminator
 * instead of being written past the end of 'str' first.
 */
void BLI_timestr(double _time, char *str)
{
    /* format 00:00:00.00 (hr:min:sec) string has to be 12 long */
    const size_t maxlen= 12;
    int hr= ( (int) _time) / (60*60);
    int min= (((int) _time) / 60 ) % 60;
    int sec= ( (int) (_time)) % 60;
    int hun= ( (int) (_time * 100.0)) % 100;

    if (hr) {
        snprintf(str, maxlen, "%.2d:%.2d:%.2d.%.2d",hr,min,sec,hun);
    } else {
        snprintf(str, maxlen, "%.2d:%.2d.%.2d",min,sec,hun);
    }
}
/* Length of a string held in a fixed-size buffer.
 *
 * Counts the characters before the first '\0' but never looks at more than
 * 'maxlen' bytes; returns 'maxlen' when no terminator is found within them.
 */
size_t BLI_strnlen(const char *str, size_t maxlen)
{
    size_t len = 0;

    while (len < maxlen && str[len] != '\0')
        len++;

    return len;
}
/* from libswish3, originally called u8_isvalid(),
 * modified to return the index of the bad character (byte index not utf).
 * http://svn.swish-e.org/libswish3/trunk/src/libswish3/utf8.c r3044 - campbell */

/* based on the valid_utf8 routine from the PCRE library by Philip Hazel

   length is in bytes, since without knowing whether the string is valid
   it's hard to know how many characters there are! */

/* Number of bytes that follow a given lead byte, indexed by the lead byte:
 * 0 for 0x00-0xBF, 1 for 0xC0-0xDF, 2 for 0xE0-0xEF, 3 for 0xF0-0xF7,
 * 4 for 0xF8-0xFB and 5 for 0xFC-0xFF. */
static const char trailingBytesForUTF8[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};

/* Find the first byte of 'str' that is not part of a well-formed sequence.
 *
 * - str: bytes to check (does not have to be NUL-terminated)
 * - length: number of bytes to check
 *
 * Returns the byte index of the first offending byte, or -1 when all 'length'
 * bytes pass. The index is always that of the byte that starts the bad
 * sequence: a stray continuation byte, or the lead byte of a sequence that is
 * truncated, overlong, or followed by something that is not a continuation
 * byte. No byte at or beyond str[length] is ever read.
 *
 * The accepted set is unchanged: sequences of up to 6 bytes pass, and only
 * the overlong forms tested below are rejected.
 */
int BLI_utf8_invalid_byte(const char *str, int length)
{
    const unsigned char *p, *perr, *pend;
    unsigned char c;
    int ab;

    if (length <= 0)
        return -1;

    perr = (const unsigned char *)str;
    pend = (const unsigned char *)str + length;

    for (p = (const unsigned char *)str; p < pend; p++) {
        c = *p;
        /* whatever goes wrong from here on is reported at this byte */
        perr = p;

        if (c < 128)
            continue;

        /* a continuation byte (10xx xxxx) cannot start a sequence */
        if ((c & 0xc0) != 0xc0)
            goto utf8_error;

        ab = trailingBytesForUTF8[c];

        /* the 'ab' trailing bytes must all lie before 'pend' */
        if ((pend - p) <= ab)
            goto utf8_error;

        p++;
        /* Check top bits in the second byte */
        if ((*p & 0xc0) != 0x80)
            goto utf8_error;

        /* Check for overlong sequences for each different length */
        switch (ab) {
            /* Check for xx00 000x */
        case 1:
            if ((c & 0x3e) == 0) goto utf8_error;
            continue; /* We know there aren't any more bytes to check */

            /* Check for 1110 0000, xx0x xxxx */
        case 2:
            if (c == 0xe0 && (*p & 0x20) == 0) goto utf8_error;
            break;

            /* Check for 1111 0000, xx00 xxxx */
        case 3:
            if (c == 0xf0 && (*p & 0x30) == 0) goto utf8_error;
            break;

            /* Check for 1111 1000, xx00 0xxx */
        case 4:
            if (c == 0xf8 && (*p & 0x38) == 0) goto utf8_error;
            break;

            /* Check for leading 0xfe or 0xff,
               and then for 1111 1100, xx00 00xx */
        case 5:
            if (c == 0xfe || c == 0xff ||
                (c == 0xfc && (*p & 0x3c) == 0)) goto utf8_error;
            break;
        }

        /* Check for valid bytes after the 2nd, if any; all must start 10 */
        while (--ab > 0) {
            p++;
            if ((*p & 0xc0) != 0x80) goto utf8_error;
        }
    }

    return -1;

utf8_error:

    return (int)(perr - (const unsigned char *)str);
}
/* Remove, in place, every byte that BLI_utf8_invalid_byte() reports as bad.
 *
 * - str: string to clean up; it is modified in place
 * - length: number of bytes in 'str', not counting the terminator
 *
 * Every removal shifts the rest of the string one byte down, including the
 * byte at str[length]. That byte is read and becomes the terminator of the
 * result, so the buffer must hold at least length+1 bytes.
 *
 * Returns the number of bytes that were removed.
 */
int BLI_utf8_invalid_strip(char *str, int length)
{
    int bad_char, tot= 0;

    while(length > 0 && (bad_char= BLI_utf8_invalid_byte(str, length)) != -1) {
        /* everything before the bad byte has been checked, continue from there */
        str += bad_char;
        length -= bad_char; /* bytes left, the bad one included */

        /* strip: move the tail and the terminator over the bad byte */
        memmove(str, str + 1, length);

        /* one byte less remains to be checked; without this every later
         * scan would reach one byte further past the end of the string */
        length--;
        tot++;
    }

    return tot;
}