Given the ongoing discussion, I'm not taking this into 1.5.8, or into the CVS at this point. On 2005-02-10 18:54:00 +0100, Matthias Andree wrote: > From: Matthias Andree <matthias.andree@xxxxxx> > To: mutt-dev@xxxxxxxx > Date: Thu, 10 Feb 2005 18:54:00 +0100 > Subject: For 1.5.8: PGP fallback to body_charset for non-UTF-8 input > X-Spam-Level: > > The attached patch fixes the bug I reported earlier today that spams > Traditional Inline PGP 2.X with U+FFFD (as replacement for an invalid > UTF-8 sequence) where Latin 1 national characters had been. > > It works like this: > > Rather than assuming the PGP output file is in UTF-8 format, > 1. check if it really is valid UTF-8 > 2a. if it is, behavior is unchanged > 2b. if it is NOT UTF-8, assume PGP output is in body_charset, > as suggested by Tamotsu Takahashi. > > TEST STATUS: works for me. > > -- > Matthias Andree Content-Description: PGP fallback to body_charset for non-UTF-8 input > Index: Makefile.am > =================================================================== > RCS file: /home/roessler/cvs/mutt/Makefile.am,v > retrieving revision 3.29 > diff -u -r3.29 Makefile.am > --- Makefile.am 4 Feb 2005 16:54:13 -0000 3.29 > +++ Makefile.am 10 Feb 2005 17:37:52 -0000 > @@ -64,7 +64,8 @@ > browser.h mbyte.h remailer.h url.h mutt_ssl_nss.c \ > crypt-mod-pgp-classic.c crypt-mod-smime-classic.c \ > pgppacket.c mutt_idna.h hcache.c mutt_ssl_gnutls.c \ > - crypt-gpgme.c crypt-mod-pgp-gpgme.c crypt-mod-smime-gpgme.c > + crypt-gpgme.c crypt-mod-pgp-gpgme.c crypt-mod-smime-gpgme.c \ > + utf8chk.h utf8chk.c > > EXTRA_DIST = COPYRIGHT GPL OPS OPS.PGP OPS.CRYPT OPS.SMIME TODO \ > configure acconfig.h account.h \ > Index: configure.in > =================================================================== > RCS file: /home/roessler/cvs/mutt/configure.in,v > retrieving revision 3.21 > diff -u -r3.21 configure.in > --- configure.in 31 Jan 2005 02:40:14 -0000 3.21 > +++ configure.in 10 Feb 2005 17:37:53 -0000 > @@ -130,7 +130,7 @@ > AC_DEFINE(CRYPT_BACKEND_CLASSIC_PGP,1, > [ Define if you want classic PGP support. ]) > PGPAUX_TARGET="pgpring pgpewrap" > - MUTT_LIB_OBJECTS="$MUTT_LIB_OBJECTS pgp.o pgpinvoke.o > pgpkey.o pgplib.o gnupgparse.o pgpmicalg.o pgppacket.o > crypt-mod-pgp-classic.o" > + MUTT_LIB_OBJECTS="$MUTT_LIB_OBJECTS pgp.o utf8chk.o > pgpinvoke.o pgpkey.o pgplib.o gnupgparse.o pgpmicalg.o pgppacket.o > crypt-mod-pgp-classic.o" > fi > > AC_ARG_ENABLE(smime, [ --disable-smime Disable SMIME > support], > Index: pgp.c > =================================================================== > RCS file: /home/roessler/cvs/mutt/pgp.c,v > retrieving revision 3.39 > diff -u -r3.39 pgp.c > --- pgp.c 3 Feb 2005 18:44:27 -0000 3.39 > +++ pgp.c 10 Feb 2005 17:37:53 -0000 > @@ -2,6 +2,7 @@ > * Copyright (C) 1996,1997 Michael R. Elkins <me@xxxxxxxx> > * Copyright (C) 1998,1999 Thomas Roessler <roessler@xxxxxxxxxxxxxxxxxx> > * Copyright (C) 2004 g10 Code GmbH > + * Copyright (C) 2005 Matthias Andree <matthias.andree@xxxxxx> > * > * This program is free software; you can redistribute it and/or modify > * it under the terms of the GNU General Public License as published by > @@ -35,6 +36,7 @@ > #include "pgp.h" > #include "mime.h" > #include "copy.h" > +#include "utf8chk.h" > > #include <sys/wait.h> > #include <string.h> > @@ -407,10 +409,29 @@ > else if (pgpout) > { > FGETCONV *fc; > - int c; > - rewind (pgpout); > + int c, valid_utf8 = 1; > + > + { > + struct is_valid_utf8_state u8s; > + > + /* check if file is UTF-8, if it isn't, assume body character set */ > + is_valid_utf8_init(&u8s); > + rewind (pgpout); > + do { > + c = fgetc(pgpout); > + if (!is_valid_utf8(&u8s, c)) { > + valid_utf8 = 0; > + break; > + } > + } while (c != EOF); > + } > + > + dprint (1, (debugfile, "pgp.c:%d: pgpout is %s UTF-8, > body_charset=\"%s\"\n", __LINE__, valid_utf8 ? "conformant" : "not", > body_charset)); > + > + /* now decode */ > + rewind(pgpout); > state_set_prefix (s); > - fc = fgetconv_open (pgpout, "utf-8", Charset, 0); > + fc = fgetconv_open (pgpout, valid_utf8 ? "utf-8" : body_charset, > Charset, 0); > while ((c = fgetconv (fc)) != EOF) > state_prefix_putc (c, s); > fgetconv_close (&fc); > --- /dev/null 2004-10-02 10:38:03.000000000 +0200 > +++ utf8chk.h 2005-02-10 18:11:28.000000000 +0100 > @@ -0,0 +1,31 @@ > +/* utf8chk.h -- fast ANSI-C UTF-8 validator > + * Copyright (C) 2005 Matthias Andree <matthias.andree@xxxxxx> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or (at > + * your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, but > + * WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. > + */ > + > +#ifndef UTF8CHK_H > +#define UTF8CHK_H 1 > + > +struct is_valid_utf8_state { > + unsigned long out; > + int count; > + int icount; > +}; > + > +extern void is_valid_utf8_init(struct is_valid_utf8_state *s); > +extern int is_valid_utf8(struct is_valid_utf8_state *s, int in); > + > +#endif > --- /dev/null 2004-10-02 10:38:03.000000000 +0200 > +++ utf8chk.c 2005-02-10 18:11:47.000000000 +0100 > @@ -0,0 +1,99 @@ > +/* utf8chk.c -- fast ANSI-C UTF-8 validator > + * Copyright (C) 2005 Matthias Andree <matthias.andree@xxxxxx> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License as published by > + * the Free Software Foundation; either version 2 of the License, or (at > + * your option) any later version. > + * > + * This program is distributed in the hope that it will be useful, but > + * WITHOUT ANY WARRANTY; without even the implied warranty of > + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + * General Public License for more details. > + * > + * You should have received a copy of the GNU General Public License > + * along with this program; if not, write to the Free Software > + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. > + */ > + > +#include <stdio.h> > +#include "utf8chk.h" > + > +void is_valid_utf8_init(struct is_valid_utf8_state *s) { > + s->out = 0; > + s->count = s->icount = 0; > +} > + > +int is_valid_utf8(struct is_valid_utf8_state *s, int in) { > + unsigned char c = (unsigned char)in; > + > + if (in == EOF) > + return s->icount == 0; > + > + if (s->icount == 0) { > + if (c < 0x80) > + return 1; > + if (c >= 0xfe) > + return 0; > + /* perhaps add s->count = 0; but for now assume > + * that _init is called first */ > + if (c >= 0xfc) > + s->count = 5, s->out = c & 0x1; > + else if (c >= 0xf8) > + s->count = 4, s->out = c & 0x3; > + else if (c >= 0xf0) > + s->count = 3, s->out = c & 0x7; > + else if (c >= 0xe0) > + s->count = 2, s->out = c & 0x0f; > + /* c2 rather than c0 catches overlong sequences right away */ > + else if (c >= 0xc2) > + s->count = 1, s->out = c & 0x1f; > + s->icount = s->count; > + return s->count != 0; > + } else { > + s->out <<= 6; > + if (c < 0x80 || c >= 0xc0) > + return 0; > + } > + s->out |= (c & 0x3f); > + if (-- s->icount) > + return 1; > + > + if (s->out == 0xfffe || s->out == 0xffff) > + return 0; > + if (s->out >= 0xd800 && s->out < 0xe000) > + return 0; > + /* note this "overlong sequence" check does not detect > + * 0x40...0x7f for 2-byte sequences, hence check that separately > + * - luckily these invalid 2-byte sequences have 0xc0 or 0xc1 as > + * their first byte */ > + if (s->out < (1u << (5 * s->count + 1))) > + return 0; > + return 1; > +} > + > +#ifdef TEST > +#include <stdlib.h> > +int main(int argc, char **argv) { > + int x; > + struct is_valid_utf8_state s; > + unsigned long count = 0; > + > + is_valid_utf8_init(&s); > + > + while (1) { > + x = fgetc(stdin); > + if (!is_valid_utf8(&s, x)) { > + printf("BAD character at position %lu\n", count); > + exit(EXIT_FAILURE); > + } > + > + if (x == EOF) { > + printf("OK\n"); > + break; > + } > + count ++; > + } > + return 0; > +} > +#endif -- Thomas Roessler · Personal soap box at <http://log.does-not-exist.org/>.
Attachment:
pgpVUFyHX8SBC.pgp
Description: PGP signature