Given the ongoing discussion, I'm not taking this into 1.5.8, or
into the CVS at this point.
On 2005-02-10 18:54:00 +0100, Matthias Andree wrote:
> From: Matthias Andree <matthias.andree@xxxxxx>
> To: mutt-dev@xxxxxxxx
> Date: Thu, 10 Feb 2005 18:54:00 +0100
> Subject: For 1.5.8: PGP fallback to body_charset for non-UTF-8 input
> X-Spam-Level:
>
> The attached patch fixes the bug I reported earlier today that spams
> Traditional Inline PGP 2.X with U+FFFD (as replacement for an invalid
> UTF-8 sequence) where Latin 1 national characters had been.
>
> It works like this:
>
> Rather than assuming the PGP output file is in UTF-8 format,
> 1. check if it really is valid UTF-8
> 2a. if it is, behavior is unchanged
> 2b. if it is NOT UTF-8, assume PGP output is in body_charset,
> as suggested by Tamotsu Takahashi.
>
> TEST STATUS: works for me.
>
> --
> Matthias Andree
Content-Description: PGP fallback to body_charset for non-UTF-8 input
> Index: Makefile.am
> ===================================================================
> RCS file: /home/roessler/cvs/mutt/Makefile.am,v
> retrieving revision 3.29
> diff -u -r3.29 Makefile.am
> --- Makefile.am 4 Feb 2005 16:54:13 -0000 3.29
> +++ Makefile.am 10 Feb 2005 17:37:52 -0000
> @@ -64,7 +64,8 @@
> browser.h mbyte.h remailer.h url.h mutt_ssl_nss.c \
> crypt-mod-pgp-classic.c crypt-mod-smime-classic.c \
> pgppacket.c mutt_idna.h hcache.c mutt_ssl_gnutls.c \
> - crypt-gpgme.c crypt-mod-pgp-gpgme.c crypt-mod-smime-gpgme.c
> + crypt-gpgme.c crypt-mod-pgp-gpgme.c crypt-mod-smime-gpgme.c \
> + utf8chk.h utf8chk.c
>
> EXTRA_DIST = COPYRIGHT GPL OPS OPS.PGP OPS.CRYPT OPS.SMIME TODO \
> configure acconfig.h account.h \
> Index: configure.in
> ===================================================================
> RCS file: /home/roessler/cvs/mutt/configure.in,v
> retrieving revision 3.21
> diff -u -r3.21 configure.in
> --- configure.in 31 Jan 2005 02:40:14 -0000 3.21
> +++ configure.in 10 Feb 2005 17:37:53 -0000
> @@ -130,7 +130,7 @@
> AC_DEFINE(CRYPT_BACKEND_CLASSIC_PGP,1,
> [ Define if you want classic PGP support. ])
> PGPAUX_TARGET="pgpring pgpewrap"
> - MUTT_LIB_OBJECTS="$MUTT_LIB_OBJECTS pgp.o pgpinvoke.o pgpkey.o pgplib.o gnupgparse.o pgpmicalg.o pgppacket.o crypt-mod-pgp-classic.o"
> + MUTT_LIB_OBJECTS="$MUTT_LIB_OBJECTS pgp.o utf8chk.o pgpinvoke.o pgpkey.o pgplib.o gnupgparse.o pgpmicalg.o pgppacket.o crypt-mod-pgp-classic.o"
> fi
>
> AC_ARG_ENABLE(smime, [ --disable-smime Disable SMIME support],
> Index: pgp.c
> ===================================================================
> RCS file: /home/roessler/cvs/mutt/pgp.c,v
> retrieving revision 3.39
> diff -u -r3.39 pgp.c
> --- pgp.c 3 Feb 2005 18:44:27 -0000 3.39
> +++ pgp.c 10 Feb 2005 17:37:53 -0000
> @@ -2,6 +2,7 @@
> * Copyright (C) 1996,1997 Michael R. Elkins <me@xxxxxxxx>
> * Copyright (C) 1998,1999 Thomas Roessler <roessler@xxxxxxxxxxxxxxxxxx>
> * Copyright (C) 2004 g10 Code GmbH
> + * Copyright (C) 2005 Matthias Andree <matthias.andree@xxxxxx>
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> @@ -35,6 +36,7 @@
> #include "pgp.h"
> #include "mime.h"
> #include "copy.h"
> +#include "utf8chk.h"
>
> #include <sys/wait.h>
> #include <string.h>
> @@ -407,10 +409,29 @@
> else if (pgpout)
> {
> FGETCONV *fc;
> - int c;
> - rewind (pgpout);
> + int c, valid_utf8 = 1;
> +
> + {
> + struct is_valid_utf8_state u8s;
> +
> + /* check if file is UTF-8, if it isn't, assume body character set */
> + is_valid_utf8_init(&u8s);
> + rewind (pgpout);
> + do {
> + c = fgetc(pgpout);
> + if (!is_valid_utf8(&u8s, c)) {
> + valid_utf8 = 0;
> + break;
> + }
> + } while (c != EOF);
> + }
> +
> + dprint (1, (debugfile, "pgp.c:%d: pgpout is %s UTF-8, body_charset=\"%s\"\n", __LINE__, valid_utf8 ? "conformant" : "not", body_charset));
> +
> + /* now decode */
> + rewind(pgpout);
> state_set_prefix (s);
> - fc = fgetconv_open (pgpout, "utf-8", Charset, 0);
> + fc = fgetconv_open (pgpout, valid_utf8 ? "utf-8" : body_charset, Charset, 0);
> while ((c = fgetconv (fc)) != EOF)
> state_prefix_putc (c, s);
> fgetconv_close (&fc);
> --- /dev/null 2004-10-02 10:38:03.000000000 +0200
> +++ utf8chk.h 2005-02-10 18:11:28.000000000 +0100
> @@ -0,0 +1,31 @@
> +/* utf8chk.h -- fast ANSI-C UTF-8 validator
> + * Copyright (C) 2005 Matthias Andree <matthias.andree@xxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or (at
> + * your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
> + */
> +
> +#ifndef UTF8CHK_H
> +#define UTF8CHK_H 1
> +
> +struct is_valid_utf8_state {
> + unsigned long out;
> + int count;
> + int icount;
> +};
> +
> +extern void is_valid_utf8_init(struct is_valid_utf8_state *s);
> +extern int is_valid_utf8(struct is_valid_utf8_state *s, int in);
> +
> +#endif
> --- /dev/null 2004-10-02 10:38:03.000000000 +0200
> +++ utf8chk.c 2005-02-10 18:11:47.000000000 +0100
> @@ -0,0 +1,99 @@
> +/* utf8chk.c -- fast ANSI-C UTF-8 validator
> + * Copyright (C) 2005 Matthias Andree <matthias.andree@xxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or (at
> + * your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
> + */
> +
> +#include <stdio.h>
> +#include "utf8chk.h"
> +
> +void is_valid_utf8_init(struct is_valid_utf8_state *s) {
> + s->out = 0;
> + s->count = s->icount = 0;
> +}
> +
> +int is_valid_utf8(struct is_valid_utf8_state *s, int in) {
> + unsigned char c = (unsigned char)in;
> +
> + if (in == EOF)
> + return s->icount == 0;
> +
> + if (s->icount == 0) {
> + if (c < 0x80)
> + return 1;
> + if (c >= 0xfe)
> + return 0;
> + /* perhaps add s->count = 0; but for now assume
> + * that _init is called first */
> + if (c >= 0xfc)
> + s->count = 5, s->out = c & 0x1;
> + else if (c >= 0xf8)
> + s->count = 4, s->out = c & 0x3;
> + else if (c >= 0xf0)
> + s->count = 3, s->out = c & 0x7;
> + else if (c >= 0xe0)
> + s->count = 2, s->out = c & 0x0f;
> + /* c2 rather than c0 catches overlong sequences right away */
> + else if (c >= 0xc2)
> + s->count = 1, s->out = c & 0x1f;
> + s->icount = s->count;
> + return s->count != 0;
> + } else {
> + s->out <<= 6;
> + if (c < 0x80 || c >= 0xc0)
> + return 0;
> + }
> + s->out |= (c & 0x3f);
> + if (-- s->icount)
> + return 1;
> +
> + if (s->out == 0xfffe || s->out == 0xffff)
> + return 0;
> + if (s->out >= 0xd800 && s->out < 0xe000)
> + return 0;
> + /* note this "overlong sequence" check does not detect
> + * 0x40...0x7f for 2-byte sequences, hence check that separately
> + * - luckily these invalid 2-byte sequences have 0xc0 or 0xc1 as
> + * their first byte */
> + if (s->out < (1u << (5 * s->count + 1)))
> + return 0;
> + return 1;
> +}
> +
> +#ifdef TEST
> +#include <stdlib.h>
> +int main(int argc, char **argv) {
> + int x;
> + struct is_valid_utf8_state s;
> + unsigned long count = 0;
> +
> + is_valid_utf8_init(&s);
> +
> + while (1) {
> + x = fgetc(stdin);
> + if (!is_valid_utf8(&s, x)) {
> + printf("BAD character at position %lu\n", count);
> + exit(EXIT_FAILURE);
> + }
> +
> + if (x == EOF) {
> + printf("OK\n");
> + break;
> + }
> + count ++;
> + }
> + return 0;
> +}
> +#endif
--
Thomas Roessler · Personal soap box at <http://log.does-not-exist.org/>.
Attachment:
pgpVUFyHX8SBC.pgp
Description: PGP signature