Skip to content

Commit 6d75431

Browse files
author
Charlie Gordon
committed
Improve charset detection and handling
* add charset_raw for binary files
* improve charset detection for ambiguous cases
* add do_show_coding_system()
* add do_set_auto_coding() to (re)select the best coding system
* handle BOM mark: display as \ufeff and ignore for syntax coloring
1 parent f765f85 commit 6d75431

File tree

6 files changed

+113
-11
lines changed

6 files changed

+113
-11
lines changed

buffer.c

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,9 @@ EditBuffer *eb_new(const char *name, int flags)
498498

499499
if (flags & BF_UTF8) {
500500
eb_set_charset(b, &charset_utf8, b->eol_type);
501+
} else
502+
if (flags & BF_RAW) {
503+
eb_set_charset(b, &charset_raw, EOL_UNIX);
501504
} else {
502505
/* CG: default charset should be selectable */
503506
eb_set_charset(b, &charset_8859_1, b->eol_type);

charset.c

Lines changed: 67 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,37 @@ static const unsigned char utf8_first_code_mask[7] = {
9898
0, 0, 0x1f, 0xf, 0x7, 0x3, 0x1,
9999
};
100100

101+
/********************************************************/
102+
/* raw */
103+
104+
/* Decoder setup for the raw charset: installs table_idem as the decode
 * table of the state `s`.
 * NOTE(review): table_idem is presumably the identity byte-to-code-point
 * mapping -- confirm against its definition.
 */
static void decode_raw_init(CharsetDecodeState *s)
{
    s->table = table_idem;
}
108+
109+
/* Encode the code `c` as a single raw byte.
 *
 * charset: unused.
 * p:       output position; receives one byte on success.
 * c:       value to encode.
 *
 * Returns the output pointer advanced by one byte when `c` is at most
 * 0xff.  Returns NULL, writing nothing, when `c` is larger.
 */
static u8 *encode_raw(__unused__ QECharset *charset, u8 *p, int c)
{
    if (c > 0xff)
        return NULL;

    *p = c;
    return p + 1;
}
118+
119+
/* Charset descriptor for raw (binary) buffers.
 *
 * The initializer is positional: entries must stay in the order of the
 * members of struct QECharset.
 */
QECharset charset_raw = {
    "raw",                      /* presumably the charset name -- confirm */
    "binary|none",              /* presumably '|'-separated aliases -- confirm */
    decode_raw_init,            /* decoder setup */
    decode_8bit,                /* shared 8-bit decoder */
    encode_raw,                 /* encoder: values up to 0xff only */
    charset_get_pos_8bit,
    charset_get_chars_8bit,
    charset_goto_char_8bit,
    charset_goto_line_8bit,
    /* NOTE(review): the meaning of these trailing positional values is
     * defined by struct QECharset, which is not shown here -- confirm
     * before changing any of them.
     */
    1, 0, 0, 10, 0, 0, NULL, NULL,
};
131+
101132
/********************************************************/
102133
/* 8859-1 */
103134

@@ -1072,7 +1103,7 @@ void detect_eol_type_32bit(const u8 *buf, int size,
10721103
/* detect the charset. Actually only UTF8 is detected */
10731104
QECharset *detect_charset(const u8 *buf, int size, EOLType *eol_typep)
10741105
{
1075-
int i, l, c, has_utf8;
1106+
int i, l, c, has_utf8, has_binary;
10761107

10771108
has_utf8 = 0;
10781109
for (i = 0; i < size;) {
@@ -1145,10 +1176,41 @@ QECharset *detect_charset(const u8 *buf, int size, EOLType *eol_typep)
11451176
#endif
11461177
/* Should detect iso-2022-jp upon \033$@ and \033$B, but jis
11471178
* support is not selected in tiny build
1179+
* XXX: should use charset probe functions.
11481180
*/
1149-
/* CG: should use a state variable for default charset */
1150-
detect_eol_type_8bit(buf, size, &charset_8859_1, eol_typep);
1151-
return &charset_8859_1;
1181+
1182+
has_binary = 0;
1183+
{
1184+
static const uint32_t magic = (1 << '\b') | (1 << '\t') | (1 << '\f') |
1185+
(1 << '\n') | (1 << '\r') | (1 << '\033') |
1186+
(1 << 0x0e) | (1 << 0x0f) | (1 << 0x1f);
1187+
1188+
for (i = 0; i < size; i++) {
1189+
c = buf[i];
1190+
if (c < 32 && !(magic & (1 << c)))
1191+
has_binary += 1;
1192+
}
1193+
}
1194+
if (has_binary) {
1195+
*eol_typep = EOL_UNIX;
1196+
return &charset_raw;
1197+
}
1198+
1199+
detect_eol_type_8bit(buf, size, &charset_raw, eol_typep);
1200+
1201+
if (*eol_typep == EOL_DOS) {
1202+
/* XXX: default DOS files to Latin1, should be selectable */
1203+
return &charset_8859_1;
1204+
}
1205+
#ifndef CONFIG_TINY
1206+
if (*eol_typep == EOL_MAC) {
1207+
/* XXX: default MAC files to Mac_roman, should be selectable */
1208+
/* XXX: should use probe functions */
1209+
return &charset_mac_roman;
1210+
}
1211+
#endif
1212+
/* XXX: should use a state variable for default charset */
1213+
return &charset_utf8;
11521214
}
11531215

11541216
/********************************************************/
@@ -1347,6 +1409,7 @@ void charset_init(void)
13471409
for (i = 0xc0; i < 0xfe; i++)
13481410
table_utf8[i] = ESCAPE_CHAR;
13491411

1412+
qe_register_charset(&charset_raw);
13501413
qe_register_charset(&charset_8859_1);
13511414
qe_register_charset(&charset_vt100);
13521415
qe_register_charset(&charset_7bit);

charsetmore.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1159,7 +1159,7 @@ static const unsigned short table_mac_roman[128] = {
11591159
0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7,
11601160
};
11611161

1162-
static QECharset charset_mac_roman = {
1162+
QECharset charset_mac_roman = {
11631163
"mac-roman",
11641164
"x-mac|mac",
11651165
decode_8bit_init,

qe.c

Lines changed: 33 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1875,6 +1875,31 @@ QECharset *read_charset(EditState *s, const char *charset_str,
18751875
return charset;
18761876
}
18771877

1878+
/* Report the coding system of the buffer shown in window `s` on the
 * status line: the charset name followed by an end-of-line suffix
 * ("-dos", "-mac", or "-unix" for any other EOL type).
 */
void do_show_coding_system(EditState *s)
{
    EditBuffer *b = s->b;
    const char *eol_suffix;

    if (b->eol_type == EOL_DOS)
        eol_suffix = "-dos";
    else if (b->eol_type == EOL_MAC)
        eol_suffix = "-mac";
    else
        eol_suffix = "-unix";

    put_status(s, "Buffer charset is now %s%s", b->charset->name, eol_suffix);
}
1884+
1885+
/* Re-select the coding system of the current buffer from its contents.
 *
 * Reads a sample from offset 0 of the buffer, passes it to
 * detect_charset() with the buffer's current EOL type as the starting
 * value, and applies the resulting charset and EOL type with
 * eb_set_charset().
 *
 * s:       window whose buffer is re-examined.
 * verbose: when non-zero, report the result via do_show_coding_system().
 */
void do_set_auto_coding(EditState *s, int verbose)
{
    EditBuffer *b = s->b;
    u8 sample[4097];
    int sample_len;
    EOLType eol_type;
    QECharset *detected;

    sample_len = eb_read(b, 0, sample, sizeof(sample));

    /* start from the buffer's current EOL type; detect_charset() may
     * overwrite it through the pointer */
    eol_type = b->eol_type;

    /* XXX: detect_charset returns a default charset */
    detected = detect_charset(sample, sample_len, &eol_type);
    eb_set_charset(b, detected, eol_type);

    if (verbose)
        do_show_coding_system(s);
}
1902+
18781903
void do_set_buffer_file_coding_system(EditState *s, const char *charset_str)
18791904
{
18801905
QECharset *charset;
@@ -1885,7 +1910,7 @@ void do_set_buffer_file_coding_system(EditState *s, const char *charset_str)
18851910
if (!charset)
18861911
return;
18871912
eb_set_charset(s->b, charset, eol_type);
1888-
put_status(s, "Charset is now %s for this buffer", s->b->charset->name);
1913+
do_show_coding_system(s);
18891914
}
18901915

18911916
/* convert the charset of a buffer to another charset */
@@ -3242,7 +3267,7 @@ static int bidir_compute_attributes(TypeLink *list_tab, int max_size,
32423267
int generic_get_colorized_line(EditState *s, unsigned int *buf, int buf_size,
32433268
int *offsetp, int line_num)
32443269
{
3245-
int len, l, line, col, offset;
3270+
int len, l, line, col, offset, bom;
32463271
int colorize_state;
32473272

32483273
/* invalidate cache if needed */
@@ -3272,15 +3297,17 @@ int generic_get_colorized_line(EditState *s, unsigned int *buf, int buf_size,
32723297

32733298
for (l = s->colorize_nb_valid_lines; l <= line_num; l++) {
32743299
len = eb_get_line(s->b, buf, buf_size, &offset);
3275-
s->colorize_func(buf, len, &colorize_state, 1);
3300+
bom = (len > 0 && buf[0] == 0xFEFF);
3301+
s->colorize_func(buf + bom, len - bom, &colorize_state, 1);
32763302
s->colorize_states[l] = colorize_state;
32773303
}
32783304
}
32793305

32803306
/* compute line color */
32813307
colorize_state = s->colorize_states[line_num];
32823308
len = eb_get_line(s->b, buf, buf_size, offsetp);
3283-
s->colorize_func(buf, len, &colorize_state, 0);
3309+
bom = (len > 0 && buf[0] == 0xFEFF);
3310+
s->colorize_func(buf + bom, len - bom, &colorize_state, 0);
32843311

32853312
/* XXX: if state is same as previous, minimize invalid region? */
32863313
s->colorize_states[line_num + 1] = colorize_state;
@@ -3504,7 +3531,8 @@ int text_display(EditState *s, DisplayState *ds, int offset)
35043531
/* currently, we cannot display these chars */
35053532
display_printf(ds, offset0, offset, "\\U%08x", c);
35063533
} else
3507-
if (c >= 256 && s->qe_state->show_unicode == 1) {
3534+
if (c >= 256 && (s->qe_state->show_unicode == 1 || c == 0xfeff)) {
3535+
/* Display BOM as \uFEFF to make it explicit */
35083536
display_printf(ds, offset0, offset, "\\u%04x", c);
35093537
} else {
35103538
display_char_bidir(ds, offset0, offset, embedding_level, c);

qe.h

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -506,10 +506,14 @@ struct QECharset {
506506
};
507507

508508
extern QECharset *first_charset;
509-
extern QECharset charset_utf8, charset_8859_1; /* predefined charsets */
509+
/* predefined charsets */
510+
extern QECharset charset_raw;
511+
extern QECharset charset_8859_1;
512+
extern QECharset charset_utf8;
510513
extern QECharset charset_vt100; /* used for the tty output */
511514
extern QECharset charset_ucs2le, charset_ucs2be;
512515
extern QECharset charset_ucs4le, charset_ucs4be;
516+
extern QECharset charset_mac_roman;
513517

514518
typedef enum EOLType {
515519
EOL_UNIX = 0,
@@ -1760,6 +1764,8 @@ void do_yank_pop(EditState *s);
17601764
void do_exchange_point_and_mark(EditState *s);
17611765
QECharset *read_charset(EditState *s, const char *charset_str,
17621766
EOLType *eol_typep);
1767+
void do_show_coding_system(EditState *s);
1768+
void do_set_auto_coding(EditState *s, int verbose);
17631769
void do_set_buffer_file_coding_system(EditState *s, const char *charset_str);
17641770
void do_convert_buffer_file_coding_system(EditState *s,
17651771
const char *charset_str);

qeconfig.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -381,6 +381,8 @@ static CmdDef basic_commands[] = {
381381
CMD2( KEY_NONE, KEY_NONE,
382382
"set-mode", do_set_mode, ESs,
383383
"s{Set mode: }[mode]")
384+
CMD1( KEY_NONE, KEY_NONE,
385+
"set-auto-coding", do_set_auto_coding, 1)
384386

385387
/* tab & indent */
386388
CMD2( KEY_NONE, KEY_NONE,

0 commit comments

Comments
 (0)