udf: Add support for decoding UTF-16 characters

Add support for decoding characters outside the Basic Multilingual Plane
that are UTF-16 encoded in the CS0 charset of UDF.

Signed-off-by: Jan Kara <jack@suse.cz>
This commit is contained in:
Jan Kara 2018-04-16 18:46:26 +02:00
parent ef2e18f1fa
commit 8a0cdef161

View File

@ -36,25 +36,6 @@
#define SURROGATE_CHAR_BITS 10 #define SURROGATE_CHAR_BITS 10
#define SURROGATE_CHAR_MASK ((1 << SURROGATE_CHAR_BITS) - 1) #define SURROGATE_CHAR_MASK ((1 << SURROGATE_CHAR_BITS) - 1)
/*
 * Convert one character to UTF-8 by way of utf32_to_utf8().
 *
 * @uni:      character to convert
 * @out:      destination buffer
 * @boundlen: space available in @out; values <= 0 are rejected
 *
 * Returns the non-negative result of utf32_to_utf8() on success.  On failure
 * returns -EINVAL when @uni lies above UNICODE_MAX or falls in the surrogate
 * range, and -ENAMETOOLONG in every other case (including @boundlen <= 0).
 */
static int udf_uni2char_utf8(wchar_t uni,
			     unsigned char *out,
			     int boundlen)
{
	int written;

	if (boundlen <= 0)
		return -ENAMETOOLONG;

	written = utf32_to_utf8(uni, out, boundlen);
	if (written >= 0)
		return written;

	/* Tell an unencodable character apart from any other failure. */
	if (uni > UNICODE_MAX || (uni & SURROGATE_MASK) == SURROGATE_PAIR)
		return -EINVAL;

	return -ENAMETOOLONG;
}
#define ILLEGAL_CHAR_MARK '_' #define ILLEGAL_CHAR_MARK '_'
#define EXT_MARK '.' #define EXT_MARK '.'
#define CRC_MARK '#' #define CRC_MARK '#'
@ -62,6 +43,50 @@ static int udf_uni2char_utf8(wchar_t uni,
/* Number of chars we need to store generated CRC to make filename unique */ /* Number of chars we need to store generated CRC to make filename unique */
#define CRC_LEN 5 #define CRC_LEN 5
/*
 * Decode a single character from an OSTA compressed Unicode string.
 *
 * @str_i:         input bytes
 * @str_i_max_len: number of bytes available in @str_i
 * @str_i_idx:     byte offset at which decoding starts
 * @u_ch:          bytes per code unit; a value above 1 selects two-byte
 *                 units stored most significant byte first
 * @ret:           receives the decoded code point, or UNICODE_MAX + 1 when
 *                 a surrogate sequence is malformed or cut short
 *
 * Returns the number of input bytes consumed.
 *
 * NOTE(review): the return type is unicode_t although the value is a byte
 * count; it is left as is so the signature stays unchanged.
 */
static unicode_t get_utf16_char(const uint8_t *str_i, int str_i_max_len,
				int str_i_idx, int u_ch, unicode_t *ret)
{
	unicode_t c;
	int start_idx = str_i_idx;

	/* Expand OSTA compressed Unicode to Unicode */
	c = str_i[str_i_idx++];
	if (u_ch > 1)
		c = (c << 8) | str_i[str_i_idx++];

	if ((c & SURROGATE_MASK) == SURROGATE_PAIR) {
		unicode_t next;

		/*
		 * Trailing surrogate char. The second half of the pair is
		 * read as two bytes below, so both of them must lie inside
		 * the input; a single leftover byte is not enough.
		 */
		if (str_i_max_len - str_i_idx < 2) {
			c = UNICODE_MAX + 1;
			goto out;
		}

		/* Low surrogate must follow the high one... */
		if (c & SURROGATE_LOW) {
			c = UNICODE_MAX + 1;
			goto out;
		}

		/* Surrogates are only expected with two-byte code units. */
		WARN_ON_ONCE(u_ch != 2);
		next = str_i[str_i_idx++] << 8;
		next |= str_i[str_i_idx++];
		if ((next & SURROGATE_MASK) != SURROGATE_PAIR ||
		    !(next & SURROGATE_LOW)) {
			c = UNICODE_MAX + 1;
			goto out;
		}

		/* Combine both halves into a code point above the first plane. */
		c = PLANE_SIZE +
		    ((c & SURROGATE_CHAR_MASK) << SURROGATE_CHAR_BITS) +
		    (next & SURROGATE_CHAR_MASK);
	}
out:
	*ret = c;
	return str_i_idx - start_idx;
}
static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len, static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
int *str_o_idx, int *str_o_idx,
const uint8_t *str_i, int str_i_max_len, const uint8_t *str_i, int str_i_max_len,
@ -70,27 +95,29 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
int (*conv_f)(wchar_t, unsigned char *, int), int (*conv_f)(wchar_t, unsigned char *, int),
int translate) int translate)
{ {
uint32_t c; unicode_t c;
int illChar = 0; int illChar = 0;
int len, gotch = 0; int len, gotch = 0;
for (; (!gotch) && (*str_i_idx < str_i_max_len); *str_i_idx += u_ch) { while (!gotch && *str_i_idx < str_i_max_len) {
if (*str_o_idx >= str_o_max_len) { if (*str_o_idx >= str_o_max_len) {
*needsCRC = 1; *needsCRC = 1;
return gotch; return gotch;
} }
/* Expand OSTA compressed Unicode to Unicode */ len = get_utf16_char(str_i, str_i_max_len, *str_i_idx, u_ch,
c = str_i[*str_i_idx]; &c);
if (u_ch > 1) /* These chars cannot be converted. Replace them. */
c = (c << 8) | str_i[*str_i_idx + 1]; if (c == 0 || c > UNICODE_MAX || (conv_f && c > MAX_WCHAR_T) ||
(translate && c == '/')) {
if (translate && (c == '/' || c == 0))
illChar = 1; illChar = 1;
else if (illChar) if (!translate)
gotch = 1;
} else if (illChar)
break; break;
else else
gotch = 1; gotch = 1;
*str_i_idx += len;
} }
if (illChar) { if (illChar) {
*needsCRC = 1; *needsCRC = 1;
@ -98,7 +125,15 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
gotch = 1; gotch = 1;
} }
if (gotch) { if (gotch) {
len = conv_f(c, &str_o[*str_o_idx], str_o_max_len - *str_o_idx); if (conv_f) {
len = conv_f(c, &str_o[*str_o_idx],
str_o_max_len - *str_o_idx);
} else {
len = utf32_to_utf8(c, &str_o[*str_o_idx],
str_o_max_len - *str_o_idx);
if (len < 0)
len = -ENAMETOOLONG;
}
/* Valid character? */ /* Valid character? */
if (len >= 0) if (len >= 0)
*str_o_idx += len; *str_o_idx += len;
@ -106,7 +141,7 @@ static int udf_name_conv_char(uint8_t *str_o, int str_o_max_len,
*needsCRC = 1; *needsCRC = 1;
gotch = 0; gotch = 0;
} else { } else {
str_o[(*str_o_idx)++] = '?'; str_o[(*str_o_idx)++] = ILLEGAL_CHAR_MARK;
*needsCRC = 1; *needsCRC = 1;
} }
} }
@ -142,12 +177,10 @@ static int udf_name_from_CS0(struct super_block *sb,
return 0; return 0;
} }
if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP))
conv_f = udf_uni2char_utf8;
} else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) {
conv_f = UDF_SB(sb)->s_nls_map->uni2char; conv_f = UDF_SB(sb)->s_nls_map->uni2char;
} else else
BUG(); conv_f = NULL;
cmp_id = ocu[0]; cmp_id = ocu[0];
if (cmp_id != 8 && cmp_id != 16) { if (cmp_id != 8 && cmp_id != 16) {