The attached patch fixes the bug I reported earlier today, in which Traditional Inline PGP 2.x messages get littered with U+FFFD (the replacement character for an invalid UTF-8 sequence) wherever Latin-1 national characters had been.

It works like this: rather than assuming the PGP output file is in UTF-8 format,

1. check whether it really is valid UTF-8;
2a. if it is, behavior is unchanged;
2b. if it is NOT valid UTF-8, assume the PGP output is in body_charset, as suggested by Tamotsu Takahashi.

TEST STATUS: works for me.

-- 
Matthias Andree
Index: Makefile.am =================================================================== RCS file: /home/roessler/cvs/mutt/Makefile.am,v retrieving revision 3.29 diff -u -r3.29 Makefile.am --- Makefile.am 4 Feb 2005 16:54:13 -0000 3.29 +++ Makefile.am 10 Feb 2005 17:37:52 -0000 @@ -64,7 +64,8 @@ browser.h mbyte.h remailer.h url.h mutt_ssl_nss.c \ crypt-mod-pgp-classic.c crypt-mod-smime-classic.c \ pgppacket.c mutt_idna.h hcache.c mutt_ssl_gnutls.c \ - crypt-gpgme.c crypt-mod-pgp-gpgme.c crypt-mod-smime-gpgme.c + crypt-gpgme.c crypt-mod-pgp-gpgme.c crypt-mod-smime-gpgme.c \ + utf8chk.h utf8chk.c EXTRA_DIST = COPYRIGHT GPL OPS OPS.PGP OPS.CRYPT OPS.SMIME TODO \ configure acconfig.h account.h \ Index: configure.in =================================================================== RCS file: /home/roessler/cvs/mutt/configure.in,v retrieving revision 3.21 diff -u -r3.21 configure.in --- configure.in 31 Jan 2005 02:40:14 -0000 3.21 +++ configure.in 10 Feb 2005 17:37:53 -0000 @@ -130,7 +130,7 @@ AC_DEFINE(CRYPT_BACKEND_CLASSIC_PGP,1, [ Define if you want classic PGP support. ]) PGPAUX_TARGET="pgpring pgpewrap" - MUTT_LIB_OBJECTS="$MUTT_LIB_OBJECTS pgp.o pgpinvoke.o pgpkey.o pgplib.o gnupgparse.o pgpmicalg.o pgppacket.o crypt-mod-pgp-classic.o" + MUTT_LIB_OBJECTS="$MUTT_LIB_OBJECTS pgp.o utf8chk.o pgpinvoke.o pgpkey.o pgplib.o gnupgparse.o pgpmicalg.o pgppacket.o crypt-mod-pgp-classic.o" fi AC_ARG_ENABLE(smime, [ --disable-smime Disable SMIME support], Index: pgp.c =================================================================== RCS file: /home/roessler/cvs/mutt/pgp.c,v retrieving revision 3.39 diff -u -r3.39 pgp.c --- pgp.c 3 Feb 2005 18:44:27 -0000 3.39 +++ pgp.c 10 Feb 2005 17:37:53 -0000 @@ -2,6 +2,7 @@ * Copyright (C) 1996,1997 Michael R. 
Elkins <me@xxxxxxxx> * Copyright (C) 1998,1999 Thomas Roessler <roessler@xxxxxxxxxxxxxxxxxx> * Copyright (C) 2004 g10 Code GmbH + * Copyright (C) 2005 Matthias Andree <matthias.andree@xxxxxx> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -35,6 +36,7 @@ #include "pgp.h" #include "mime.h" #include "copy.h" +#include "utf8chk.h" #include <sys/wait.h> #include <string.h> @@ -407,10 +409,29 @@ else if (pgpout) { FGETCONV *fc; - int c; - rewind (pgpout); + int c, valid_utf8 = 1; + + { + struct is_valid_utf8_state u8s; + + /* check if file is UTF-8, if it isn't, assume body character set */ + is_valid_utf8_init(&u8s); + rewind (pgpout); + do { + c = fgetc(pgpout); + if (!is_valid_utf8(&u8s, c)) { + valid_utf8 = 0; + break; + } + } while (c != EOF); + } + + dprint (1, (debugfile, "pgp.c:%d: pgpout is %s UTF-8, body_charset=\"%s\"\n", __LINE__, valid_utf8 ? "conformant" : "not", body_charset)); + + /* now decode */ + rewind(pgpout); state_set_prefix (s); - fc = fgetconv_open (pgpout, "utf-8", Charset, 0); + fc = fgetconv_open (pgpout, valid_utf8 ? "utf-8" : body_charset, Charset, 0); while ((c = fgetconv (fc)) != EOF) state_prefix_putc (c, s); fgetconv_close (&fc); --- /dev/null 2004-10-02 10:38:03.000000000 +0200 +++ utf8chk.h 2005-02-10 18:11:28.000000000 +0100 @@ -0,0 +1,31 @@ +/* utf8chk.h -- fast ANSI-C UTF-8 validator + * Copyright (C) 2005 Matthias Andree <matthias.andree@xxxxxx> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef UTF8CHK_H +#define UTF8CHK_H 1 + +/* State carried from one is_valid_utf8() call to the next. */ +struct is_valid_utf8_state { + unsigned long out; /* code point bits accumulated for the sequence in progress */ + int count; /* continuation bytes announced by the most recent lead byte */ + int icount; /* continuation bytes still outstanding; 0 means a lead byte is expected */ +}; + +extern void is_valid_utf8_init(struct is_valid_utf8_state *s); +extern int is_valid_utf8(struct is_valid_utf8_state *s, int in); + +#endif --- /dev/null 2004-10-02 10:38:03.000000000 +0200 +++ utf8chk.c 2005-02-10 18:11:47.000000000 +0100 @@ -0,0 +1,99 @@ +/* utf8chk.c -- fast ANSI-C UTF-8 validator + * Copyright (C) 2005 Matthias Andree <matthias.andree@xxxxxx> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
+ */
+
+#include <stdio.h>
+#include "utf8chk.h"
+
+/*
+ * Put the validator into its initial state: nothing accumulated and no
+ * multi-byte sequence in progress.  Call this before the first
+ * is_valid_utf8() on a stream.
+ */
+void is_valid_utf8_init(struct is_valid_utf8_state *s) {
+    s->out = 0;
+    s->count = s->icount = 0;
+}
+
+/*
+ * Feed the next input unit to the incremental UTF-8 validator.
+ *
+ *   s   validator state, prepared with is_valid_utf8_init()
+ *   in  next byte of the stream as delivered by fgetc(), or EOF once the
+ *       input is exhausted
+ *
+ * Returns 1 while everything seen so far is acceptable and 0 as soon as it
+ * is not.  For EOF the result is 1 only if no multi-byte sequence is left
+ * unfinished.  After a 0 result the state is left as it was at the point of
+ * failure; it is not resynchronized.
+ *
+ * Rejected: stray continuation bytes, the lead bytes 0xc0/0xc1/0xfe/0xff,
+ * overlong encodings, the surrogate range U+D800..U+DFFF, U+FFFE and U+FFFF.
+ * NOTE(review): lead bytes up to 0xfd (5- and 6-byte forms) and values above
+ * U+10FFFF are still accepted, which is laxer than RFC 3629.
+ */
+int is_valid_utf8(struct is_valid_utf8_state *s, int in) {
+    unsigned char c = (unsigned char)in;
+
+    if (in == EOF)
+        return s->icount == 0;
+
+    if (s->icount == 0) {
+        /* a new character starts here */
+        if (c < 0x80)
+            return 1;
+        if (c >= 0xfe)
+            return 0;
+        /* Forget the length of the previous sequence.  Bytes 0x80..0xc1
+         * match none of the branches below; without this reset they
+         * would inherit the stale length and be accepted as lead bytes. */
+        s->count = 0;
+        if (c >= 0xfc)
+            s->count = 5, s->out = c & 0x1;
+        else if (c >= 0xf8)
+            s->count = 4, s->out = c & 0x3;
+        else if (c >= 0xf0)
+            s->count = 3, s->out = c & 0x7;
+        else if (c >= 0xe0)
+            s->count = 2, s->out = c & 0x0f;
+        /* c2 rather than c0 catches overlong sequences right away */
+        else if (c >= 0xc2)
+            s->count = 1, s->out = c & 0x1f;
+        s->icount = s->count;
+        return s->count != 0;
+    } else {
+        /* inside a sequence: only 0x80..0xbf may follow */
+        s->out <<= 6;
+        if (c < 0x80 || c >= 0xc0)
+            return 0;
+    }
+    s->out |= (c & 0x3f);
+    if (-- s->icount)
+        return 1;
+
+    /* sequence complete: vet the decoded value */
+    if (s->out == 0xfffe || s->out == 0xffff)
+        return 0;
+    if (s->out >= 0xd800 && s->out < 0xe000)
+        return 0;
+    /* note this "overlong sequence" check does not detect
+     * 0x40...0x7f for 2-byte sequences, hence check that separately
+     * - luckily these invalid 2-byte sequences have 0xc0 or 0xc1 as
+     * their first byte */
+    if (s->out < (1u << (5 * s->count + 1)))
+        return 0;
+    return 1;
+}
+
+#ifdef TEST
+#include <stdlib.h>
+/* Stand-alone driver: validates stdin and reports the offset of the first
+ * offending byte, or "OK" if the whole input passes. */
+int main(int argc, char **argv) {
+    int x;
+    struct is_valid_utf8_state s;
+    unsigned long count = 0;
+
+    is_valid_utf8_init(&s);
+
+    while (1) {
+        x = fgetc(stdin);
+        if (!is_valid_utf8(&s, x)) {
+            printf("BAD character at position %lu\n", count);
+            exit(EXIT_FAILURE);
+        }
+
+        if (x == EOF) {
+            printf("OK\n");
+            break;
+        }
+        count ++;
+    }
+    return 0;
+}
+#endif
Attachment:
pgpgb01OYljNi.pgp
Description: PGP signature