For 1.5.8: PGP fallback to body_charset for non-UTF-8 input

To: mutt-dev@xxxxxxxx
Subject: For 1.5.8: PGP fallback to body_charset for non-UTF-8 input
From: Matthias Andree <matthias.andree@xxxxxx>
Date: Thu, 10 Feb 2005 18:54:00 +0100
List-unsubscribe: <mailto:mutt-dev-request@mutt.org?body=unsubscribe>
Sender: owner-mutt-dev@xxxxxxxx
User-agent: Gnus/5.110003 (No Gnus v0.3) Emacs/21.3 (gnu/linux)

The attached patch fixes the bug I reported earlier today, in which
traditional inline PGP 2.x output is littered with U+FFFD (the replacement
for an invalid UTF-8 sequence) where Latin-1 national characters had been.

It works like this:

Rather than assuming the PGP output file is in UTF-8 format,
1. check if it really is valid UTF-8
2a. if it is, behavior is unchanged
2b. if it is NOT UTF-8, assume PGP output is in body_charset,
    as suggested by Tamotsu Takahashi.

TEST STATUS: works for me.

-- 
Matthias Andree

Index: Makefile.am
===================================================================
RCS file: /home/roessler/cvs/mutt/Makefile.am,v
retrieving revision 3.29
diff -u -r3.29 Makefile.am
--- Makefile.am 4 Feb 2005 16:54:13 -0000       3.29
+++ Makefile.am 10 Feb 2005 17:37:52 -0000
@@ -64,7 +64,8 @@
        browser.h mbyte.h remailer.h url.h mutt_ssl_nss.c \
        crypt-mod-pgp-classic.c crypt-mod-smime-classic.c \
        pgppacket.c mutt_idna.h hcache.c mutt_ssl_gnutls.c \
-       crypt-gpgme.c crypt-mod-pgp-gpgme.c crypt-mod-smime-gpgme.c
+       crypt-gpgme.c crypt-mod-pgp-gpgme.c crypt-mod-smime-gpgme.c \
+       utf8chk.h utf8chk.c
 
 EXTRA_DIST = COPYRIGHT GPL OPS OPS.PGP OPS.CRYPT OPS.SMIME TODO \
        configure acconfig.h account.h \
Index: configure.in
===================================================================
RCS file: /home/roessler/cvs/mutt/configure.in,v
retrieving revision 3.21
diff -u -r3.21 configure.in
--- configure.in        31 Jan 2005 02:40:14 -0000      3.21
+++ configure.in        10 Feb 2005 17:37:53 -0000
@@ -130,7 +130,7 @@
                 AC_DEFINE(CRYPT_BACKEND_CLASSIC_PGP,1,
                     [ Define if you want classic PGP support. ])
                 PGPAUX_TARGET="pgpring pgpewrap"
-                MUTT_LIB_OBJECTS="$MUTT_LIB_OBJECTS pgp.o pgpinvoke.o pgpkey.o pgplib.o gnupgparse.o pgpmicalg.o pgppacket.o crypt-mod-pgp-classic.o"
+                MUTT_LIB_OBJECTS="$MUTT_LIB_OBJECTS pgp.o utf8chk.o pgpinvoke.o pgpkey.o pgplib.o gnupgparse.o pgpmicalg.o pgppacket.o crypt-mod-pgp-classic.o"
         fi
 
        AC_ARG_ENABLE(smime, [  --disable-smime            Disable SMIME support],
Index: pgp.c
===================================================================
RCS file: /home/roessler/cvs/mutt/pgp.c,v
retrieving revision 3.39
diff -u -r3.39 pgp.c
--- pgp.c       3 Feb 2005 18:44:27 -0000       3.39
+++ pgp.c       10 Feb 2005 17:37:53 -0000
@@ -2,6 +2,7 @@
  * Copyright (C) 1996,1997 Michael R. Elkins <me@xxxxxxxx>
  * Copyright (C) 1998,1999 Thomas Roessler <roessler@xxxxxxxxxxxxxxxxxx>
  * Copyright (C) 2004 g10 Code GmbH
+ * Copyright (C) 2005 Matthias Andree <matthias.andree@xxxxxx>
  *
  *     This program is free software; you can redistribute it and/or modify
  *     it under the terms of the GNU General Public License as published by
@@ -35,6 +36,7 @@
 #include "pgp.h"
 #include "mime.h"
 #include "copy.h"
+#include "utf8chk.h"
 
 #include <sys/wait.h>
 #include <string.h>
@@ -407,10 +409,29 @@
       else if (pgpout)
       {
        FGETCONV *fc;
-       int c;
-       rewind (pgpout);
+       int c, valid_utf8 = 1;
+
+       {
+         struct is_valid_utf8_state u8s;
+
+         /* check if file is UTF-8, if it isn't, assume body character set */
+         is_valid_utf8_init(&u8s);
+         rewind (pgpout);
+         do {
+           c = fgetc(pgpout);
+           if (!is_valid_utf8(&u8s, c)) {
+             valid_utf8 = 0;
+             break;
+           }
+         } while (c != EOF);
+       }
+
+       dprint (1, (debugfile, "pgp.c:%d: pgpout is %s UTF-8, body_charset=\"%s\"\n", __LINE__, valid_utf8 ? "conformant" : "not", body_charset));
+
+       /* now decode */
+       rewind(pgpout);
        state_set_prefix (s);
-       fc = fgetconv_open (pgpout, "utf-8", Charset, 0);
+       fc = fgetconv_open (pgpout, valid_utf8 ? "utf-8" : body_charset, Charset, 0);
        while ((c = fgetconv (fc)) != EOF)
          state_prefix_putc (c, s);
        fgetconv_close (&fc);
--- /dev/null   2004-10-02 10:38:03.000000000 +0200
+++ utf8chk.h   2005-02-10 18:11:28.000000000 +0100
@@ -0,0 +1,31 @@
+/* utf8chk.h -- fast ANSI-C UTF-8 validator
+ * Copyright (C) 2005  Matthias Andree <matthias.andree@xxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef UTF8CHK_H
+#define UTF8CHK_H 1
+
+struct is_valid_utf8_state {
+    unsigned long out;
+    int count;
+    int icount;
+};
+
+extern void is_valid_utf8_init(struct is_valid_utf8_state *s);
+extern int is_valid_utf8(struct is_valid_utf8_state *s, int in);
+
+#endif
--- /dev/null   2004-10-02 10:38:03.000000000 +0200
+++ utf8chk.c   2005-02-10 18:11:47.000000000 +0100
@@ -0,0 +1,99 @@
+/* utf8chk.c -- fast ANSI-C UTF-8 validator
+ * Copyright (C) 2005  Matthias Andree <matthias.andree@xxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <stdio.h>
+#include "utf8chk.h"
+
+void is_valid_utf8_init(struct is_valid_utf8_state *s) {
+    s->out = 0;
+    s->count = s->icount = 0;
+}
+
+int is_valid_utf8(struct is_valid_utf8_state *s, int in) {
+    unsigned char c = (unsigned char)in;
+
+    if (in == EOF)
+       return s->icount == 0;
+
+    if (s->icount == 0) {
+       if (c < 0x80)
+           return 1;
+       if (c >= 0xfe)
+           return 0;
+       /* NOTE: s->count should be reset to 0 here; _init alone is not enough,
+        * since a stray 0x80..0xc1 after a full sequence reuses the old count */
+       if (c >= 0xfc)
+           s->count = 5, s->out = c & 0x1;
+       else if (c >= 0xf8)
+           s->count = 4, s->out = c & 0x3;
+       else if (c >= 0xf0)
+           s->count = 3, s->out = c & 0x7;
+       else if (c >= 0xe0)
+           s->count = 2, s->out = c & 0x0f;
+       /* c2 rather than c0 catches overlong sequences right away */
+       else if (c >= 0xc2)
+           s->count = 1, s->out = c & 0x1f;
+       s->icount = s->count;
+       return s->count != 0;
+    } else {
+       s->out <<= 6;
+       if (c < 0x80 || c >= 0xc0)
+               return 0;
+       }
+       s->out |= (c & 0x3f);
+       if (-- s->icount)
+           return 1;
+
+       if (s->out == 0xfffe || s->out == 0xffff)
+           return 0;
+       if (s->out >= 0xd800 && s->out < 0xe000)
+           return 0;
+       /* note this "overlong sequence" check does not detect
+        * 0x40...0x7f for 2-byte sequences, hence check that separately
+        * - luckily these invalid 2-byte sequences have 0xc0 or 0xc1 as
+        * their first byte */
+       if (s->out < (1u << (5 * s->count + 1)))
+           return 0;
+       return 1;
+}
+
+#ifdef TEST
+#include <stdlib.h>
+int main(int argc, char **argv) {
+    int x;
+    struct is_valid_utf8_state s;
+    unsigned long count = 0;
+
+    is_valid_utf8_init(&s);
+
+    while (1) {
+       x = fgetc(stdin);
+       if (!is_valid_utf8(&s, x)) {
+           printf("BAD character at position %lu\n", count);
+           exit(EXIT_FAILURE);
+       }
+
+       if (x == EOF) {
+           printf("OK\n");
+           break;
+       }
+       count ++;
+    }
+    return 0;
+}
+#endif

Attachment: pgpgb01OYljNi.pgp
Description: PGP signature

Follow-Ups:
- Re: For 1.5.8: PGP fallback to body_charset for non-UTF-8 input
  - From: Thomas Roessler

Prev by Date: Re: For 1.5.8: notalternates functionality
Next by Date: Is the mutt "raw CVS repository" available as snapshot or similar?
Previous by thread: Re: 1.5.7 BUG: character set in traditional PGP
Next by thread: Re: For 1.5.8: PGP fallback to body_charset for non-UTF-8 input
Index(es):
- Date
- Thread