'default_charset' configuration variable
Hello List,
Attached are two patches. The first one introduces a new configuration
variable 'default_charset', and the second one contains two trivial
fixes I needed to compile and link Mutt properly with ncursesw but
without NLS.
Background for the 'default_charset' patch:
Some of the mails I receive are encoded in 8-bit ISO-8859-1 without
any character set information. There are even 8-bit characters in the
headers. I know this violates the RFCs, but I do receive such mails.
As I run Mutt in a UTF-8 environment, these mails were not rendered
as the sender intended.
The new 'default_charset' configuration variable specifies a
character set used to decode the header and body of mails if 8-bit
characters are found and no character set information is given in the
mail. If I set this to e.g. ISO-8859-1 or cp1252, the broken mails
become readable even on non-Latin-1 terminals.
If 'default_charset' is not set, this patch does not change Mutt's
original behavior.
The patches are made for Mutt 1.5.4, but they also work for 1.5.6.
Ludolf
PS:
Does anyone have a satisfactory solution for Mutt on Windoze?
The only reason I don't use Mutt at work is that I can't find a UTF-8
(or at least latin1) 'DOS box' to run Mutt in.
--
---------------------------------------------------------------
Ludolf Holzheid Tel: +49 621 339960
Bihl+Wiedemann GmbH Fax: +49 621 3392239
Flosswoerthstrasse 41 e-mail: lholzheid@xxxxxxxxxxxxxxxxx
D-68199 Mannheim, Germany
---------------------------------------------------------------
diff -aur mutt-1.5.4-org/globals.h mutt-1.5.4/globals.h
--- mutt-1.5.4-org/globals.h 2003-03-07 09:19:41.000000000 +0100
+++ mutt-1.5.4/globals.h 2004-02-29 23:51:55.000000000 +0100
@@ -38,6 +38,7 @@
WHERE char *Charset;
WHERE char *ComposeFormat;
WHERE char *ContentType;
+WHERE char *DefaultCharset;
WHERE char *DefaultHook;
WHERE char *DateFmt;
WHERE char *DisplayFilter;
diff -aur mutt-1.5.4-org/init.h mutt-1.5.4/init.h
--- mutt-1.5.4-org/init.h 2003-03-04 10:28:12.000000000 +0100
+++ mutt-1.5.4/init.h 2004-07-11 13:53:14.000000000 +0200
@@ -374,6 +374,31 @@
** rest of the string are expanded in the \fIC\fP locale (that is in US
** English).
*/
+ { "default_charset", DT_STR, R_NONE, UL &DefaultCharset, UL 0 },
+ /*
+ ** .pp
+ ** This variable specifies a default character set used to decode text
+ ** in mails (header and body) in case 8-bit characters are
+ ** encountered and no character set information is given in the mail.
+ ** .pp
+ ** According to the RFCs, text without a character set specification
+ ** is to be regarded as 7-bit US-ASCII. This is how Mutt behaves if
+ ** \fIdefault_charset\fP is unset.
+ ** .pp
+ ** There are, however, mail user agents (MUAs) that do \fInot\fP
+ ** properly encode text in their native character set even if 8-bit
+ ** characters are used. If you receive mails created by such broken
+ ** MUAs, and ``$$charset'' does not match the MUA's character set,
+ ** you may want to set \fIdefault_charset\fP to a guess of the MUA's
+ ** native character set. In most cases, this guess would be
+ ** ``ISO-8859-1'', ``ISO-8859-16'', or even ``cp1252''.
+ ** .pp
+ ** Only character sets that are true 8-bit extensions of US-ASCII
+ ** are reasonable values for this variable.
+ ** .pp
+ ** When this variable is set interactively, the change does not take
+ ** effect until you have changed folders.
+ */
{ "default_hook", DT_STR, R_NONE, UL &DefaultHook, UL "~f %s !~P | (~P
~C %s)" },
/*
** .pp
diff -aur mutt-1.5.4-org/parse.c mutt-1.5.4/parse.c
--- mutt-1.5.4-org/parse.c 2003-03-19 22:19:45.000000000 +0100
+++ mutt-1.5.4/parse.c 2004-07-10 17:37:51.000000000 +0200
@@ -379,7 +379,12 @@
if (ct->type == TYPETEXT)
{
if (!(pc = mutt_get_parameter ("charset", ct->parameter)))
- mutt_set_parameter ("charset", "us-ascii", &ct->parameter);
+ {
+ if (NULL == DefaultCharset)
+ mutt_set_parameter ("charset", "us-ascii", &ct->parameter);
+ else
+ mutt_set_parameter ("charset", DefaultCharset, &ct->parameter);
+ }
}
}
@@ -421,13 +426,19 @@
char *c;
char *line = safe_malloc (LONG_STRING);
size_t linelen = LONG_STRING;
-
+
p->hdr_offset = ftell(fp);
p->encoding = ENC7BIT; /* default from RFC1521 */
p->type = digest ? TYPEMESSAGE : TYPETEXT;
p->disposition = DISPINLINE;
+ if (NULL != DefaultCharset)
+ {
+ p->encoding = ENC8BIT;
+ mutt_set_parameter ("charset", DefaultCharset, &p->parameter);
+ }
+
while (*(line = read_rfc822_line (fp, line, &linelen)) != 0)
{
/* Find the value of the current header */
@@ -1282,6 +1293,13 @@
/* RFC 2183 says this is arbitrary */
hdr->content->disposition = DISPINLINE;
+
+ if (NULL != DefaultCharset)
+ {
+ hdr->content->encoding = ENC8BIT;
+ mutt_set_parameter ("charset", DefaultCharset,
+ &hdr->content->parameter);
+ }
}
}
diff -aur mutt-1.5.4-org/rfc2047.c mutt-1.5.4/rfc2047.c
--- mutt-1.5.4-org/rfc2047.c 2003-01-21 13:25:22.000000000 +0100
+++ mutt-1.5.4/rfc2047.c 2004-07-10 18:51:32.000000000 +0200
@@ -717,6 +717,8 @@
char *d0, *d;
const char *s = *pd;
size_t dlen;
+ register size_t i;
+ char *s0;
if (!s || !*s)
return;
@@ -729,8 +731,43 @@
if (!(p = find_encoded_word (s, &q)))
{
/* no encoded words */
- strncpy (d, s, dlen);
- d += dlen;
+
+ if (NULL != DefaultCharset)
+ {
+ /* search for unencoded 8-bit characters */
+
+ for (i = 0; i < dlen; i++)
+ if (('\0' == s [i]) || (0x00 != (0x80 & s [i])))
+ break;
+
+ if ('\0' != s [i])
+ {
+ /* There are unencoded 8 bit characters. Assume them to
+ be encoded in DefaultCharset and convert them to
+ Charset. */
+
+ s0 = safe_malloc (dlen +1);
+ strncpy (s0, s, dlen);
+ s0 [dlen] = '\0';
+ mutt_convert_string (&s0, DefaultCharset, Charset, 0);
+ strncpy (d, s0, dlen);
+ d += strlen (s0);
+ FREE (&s0);
+ }
+ else
+ {
+ /* No unencoded 8-bit characters found. Assume the text
+ to be US-ASCII. */
+
+ strncpy (d, s, dlen);
+ d += dlen;
+ }
+ }
+ else
+ {
+ strncpy (d, s, dlen);
+ d += dlen;
+ }
break;
}
@@ -742,7 +779,41 @@
{
if (n > dlen)
n = dlen;
- memcpy (d, s, n);
+
+ if (NULL != DefaultCharset)
+ {
+ /* search for unencoded 8-bit characters */
+
+ for (i = 0; i < dlen; i++)
+ if (('\0' == s [i]) || (0x00 != (0x80 & s [i])))
+ break;
+
+ if ('\0' != s [i])
+ {
+ /* There are unencoded 8 bit characters. Assume them
+ to be encoded in DefaultCharset and convert them to
+ Charset. */
+
+ s0 = safe_malloc (n +1);
+ strncpy (s0, s, n);
+ s0 [n] = '\0';
+ mutt_convert_string (&s0, DefaultCharset, Charset, 0);
+ n = strlen (s0);
+ memcpy (d, s0, n);
+ FREE (&s0);
+ }
+ else
+ {
+ /* No unencoded 8-bit characters found. Assume the
+ text to be US-ASCII. */
+
+ memcpy (d, s, n);
+ }
+ }
+ else
+ {
+ memcpy (d, s, n);
+ }
d += n;
dlen -= n;
}
@@ -766,10 +837,14 @@
{
while (a)
{
- if (a->personal && strstr (a->personal, "=?") != NULL)
+ if (a->personal
+ && ((NULL != DefaultCharset)
+ || (strstr (a->personal, "=?") != NULL)))
rfc2047_decode (&a->personal);
#ifdef EXACT_ADDRESS
- if (a->val && strstr (a->val, "=?") != NULL)
+ if (a->val
+ && ((NULL != DefaultCharset)
+ || (strstr (a->val, "=?") != NULL)))
rfc2047_decode (&a->val);
#endif
a = a->next;
diff -aur mutt-1.5.4-org/mbyte.c mutt-1.5.4/mbyte.c
--- mutt-1.5.4-org/mbyte.c 2003-01-21 13:25:21.000000000 +0100
+++ mutt-1.5.4/mbyte.c 2004-07-11 02:30:32.000000000 +0200
@@ -72,9 +72,11 @@
}
#endif
+#ifdef ENABLE_NLS
#ifdef HAVE_BIND_TEXTDOMAIN_CODESET
bind_textdomain_codeset(PACKAGE, buffer);
#endif
+#endif
}
#ifndef HAVE_WC_FUNCS
diff -aur mutt-1.5.4-org/wcwidth.c mutt-1.5.4/wcwidth.c
--- mutt-1.5.4-org/wcwidth.c 2003-01-21 13:25:22.000000000 +0100
+++ mutt-1.5.4/wcwidth.c 2004-07-11 01:18:55.000000000 +0200
@@ -9,9 +9,10 @@
/* Adapted for Mutt by Edmund Grimley Evans.
*/
+#include "mutt.h"
+
#ifndef HAVE_WC_FUNCS
-#include "mutt.h"
#include "mbyte.h"
#include <ctype.h>