Re: For 1.5.8: PGP fallback to body_charset for non-UTF-8 input

To: Matthias Andree <matthias.andree@xxxxxx>
Subject: Re: For 1.5.8: PGP fallback to body_charset for non-UTF-8 input
From: Thomas Roessler <roessler@xxxxxxxxxxxxxxxxxx>
Date: Sat, 12 Feb 2005 21:35:23 +0100
Cc: mutt-dev@xxxxxxxx
In-reply-to: <m3vf90cnd3.fsf@xxxxxxxxxxxxxxxxxxxx>
Mail-followup-to: Matthias Andree <matthias.andree@xxxxxx>, mutt-dev@xxxxxxxx
References: <m3vf90cnd3.fsf@xxxxxxxxxxxxxxxxxxxx>
Sender: Thomas Roessler <roessler@xxxxxxxxxxxxxxxxxxxxxxxxxxxx>
User-agent: Mutt/1.5.7i

Given the ongoing discussion, I'm not taking this into 1.5.8, or
into the CVS at this point.

On 2005-02-10 18:54:00 +0100, Matthias Andree wrote:
> From: Matthias Andree <matthias.andree@xxxxxx>
> To: mutt-dev@xxxxxxxx
> Date: Thu, 10 Feb 2005 18:54:00 +0100
> Subject: For 1.5.8: PGP fallback to body_charset for non-UTF-8 input
> X-Spam-Level: 
> 
> The attached patch fixes the bug I reported earlier today that spams
> Traditional Inline PGP 2.X with U+FFFD (as replacement for an invalid
> UTF-8 sequence) where Latin 1 national characters had been.
> 
> It works like this:
> 
> Rather than assuming the PGP output file is in UTF-8 format,
> 1. check if it really is valid UTF-8
> 2a. if it is, behavior is unchanged
> 2b. if it is NOT UTF-8, assume PGP output is in body_charset,
>     as suggested by Tamotsu Takahashi.
> 
> TEST STATUS: works for me.
> 
> -- 
> Matthias Andree

Content-Description: PGP fallback to body_charset for non-UTF-8 input
> Index: Makefile.am
> ===================================================================
> RCS file: /home/roessler/cvs/mutt/Makefile.am,v
> retrieving revision 3.29
> diff -u -r3.29 Makefile.am
> --- Makefile.am       4 Feb 2005 16:54:13 -0000       3.29
> +++ Makefile.am       10 Feb 2005 17:37:52 -0000
> @@ -64,7 +64,8 @@
>       browser.h mbyte.h remailer.h url.h mutt_ssl_nss.c \
>       crypt-mod-pgp-classic.c crypt-mod-smime-classic.c \
>       pgppacket.c mutt_idna.h hcache.c mutt_ssl_gnutls.c \
> -     crypt-gpgme.c crypt-mod-pgp-gpgme.c crypt-mod-smime-gpgme.c
> +     crypt-gpgme.c crypt-mod-pgp-gpgme.c crypt-mod-smime-gpgme.c \
> +     utf8chk.h utf8chk.c
>  
>  EXTRA_DIST = COPYRIGHT GPL OPS OPS.PGP OPS.CRYPT OPS.SMIME TODO \
>       configure acconfig.h account.h \
> Index: configure.in
> ===================================================================
> RCS file: /home/roessler/cvs/mutt/configure.in,v
> retrieving revision 3.21
> diff -u -r3.21 configure.in
> --- configure.in      31 Jan 2005 02:40:14 -0000      3.21
> +++ configure.in      10 Feb 2005 17:37:53 -0000
> @@ -130,7 +130,7 @@
>                  AC_DEFINE(CRYPT_BACKEND_CLASSIC_PGP,1,
>                      [ Define if you want classic PGP support. ])
>                  PGPAUX_TARGET="pgpring pgpewrap"
> -                MUTT_LIB_OBJECTS="$MUTT_LIB_OBJECTS pgp.o pgpinvoke.o pgpkey.o pgplib.o gnupgparse.o pgpmicalg.o pgppacket.o crypt-mod-pgp-classic.o"
> +                MUTT_LIB_OBJECTS="$MUTT_LIB_OBJECTS pgp.o utf8chk.o pgpinvoke.o pgpkey.o pgplib.o gnupgparse.o pgpmicalg.o pgppacket.o crypt-mod-pgp-classic.o"
>          fi
>  
>       AC_ARG_ENABLE(smime, [  --disable-smime            Disable SMIME support],
> Index: pgp.c
> ===================================================================
> RCS file: /home/roessler/cvs/mutt/pgp.c,v
> retrieving revision 3.39
> diff -u -r3.39 pgp.c
> --- pgp.c     3 Feb 2005 18:44:27 -0000       3.39
> +++ pgp.c     10 Feb 2005 17:37:53 -0000
> @@ -2,6 +2,7 @@
>   * Copyright (C) 1996,1997 Michael R. Elkins <me@xxxxxxxx>
>   * Copyright (C) 1998,1999 Thomas Roessler <roessler@xxxxxxxxxxxxxxxxxx>
>   * Copyright (C) 2004 g10 Code GmbH
> + * Copyright (C) 2005 Matthias Andree <matthias.andree@xxxxxx>
>   *
>   *     This program is free software; you can redistribute it and/or modify
>   *     it under the terms of the GNU General Public License as published by
> @@ -35,6 +36,7 @@
>  #include "pgp.h"
>  #include "mime.h"
>  #include "copy.h"
> +#include "utf8chk.h"
>  
>  #include <sys/wait.h>
>  #include <string.h>
> @@ -407,10 +409,29 @@
>        else if (pgpout)
>        {
>       FGETCONV *fc;
> -     int c;
> -     rewind (pgpout);
> +     int c, valid_utf8 = 1;
> +
> +     {
> +       struct is_valid_utf8_state u8s;
> +
> +       /* check if file is UTF-8, if it isn't, assume body character set */
> +       is_valid_utf8_init(&u8s);
> +       rewind (pgpout);
> +       do {
> +         c = fgetc(pgpout);
> +         if (!is_valid_utf8(&u8s, c)) {
> +           valid_utf8 = 0;
> +           break;
> +         }
> +       } while (c != EOF);
> +     }
> +
> +     dprint (1, (debugfile, "pgp.c:%d: pgpout is %s UTF-8, body_charset=\"%s\"\n", __LINE__, valid_utf8 ? "conformant" : "not", body_charset));
> +
> +     /* now decode */
> +     rewind(pgpout);
>       state_set_prefix (s);
> -     fc = fgetconv_open (pgpout, "utf-8", Charset, 0);
> +     fc = fgetconv_open (pgpout, valid_utf8 ? "utf-8" : body_charset, Charset, 0);
>       while ((c = fgetconv (fc)) != EOF)
>         state_prefix_putc (c, s);
>       fgetconv_close (&fc);
> --- /dev/null 2004-10-02 10:38:03.000000000 +0200
> +++ utf8chk.h 2005-02-10 18:11:28.000000000 +0100
> @@ -0,0 +1,31 @@
> +/* utf8chk.h -- fast ANSI-C UTF-8 validator
> + * Copyright (C) 2005  Matthias Andree <matthias.andree@xxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or (at
> + * your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
> + */
> +
> +#ifndef UTF8CHK_H
> +#define UTF8CHK_H 1
> +
> +struct is_valid_utf8_state {
> +    unsigned long out;
> +    int count;
> +    int icount;
> +};
> +
> +extern void is_valid_utf8_init(struct is_valid_utf8_state *s);
> +extern int is_valid_utf8(struct is_valid_utf8_state *s, int in);
> +
> +#endif
> --- /dev/null 2004-10-02 10:38:03.000000000 +0200
> +++ utf8chk.c 2005-02-10 18:11:47.000000000 +0100
> @@ -0,0 +1,99 @@
> +/* utf8chk.c -- fast ANSI-C UTF-8 validator
> + * Copyright (C) 2005  Matthias Andree <matthias.andree@xxxxxx>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or (at
> + * your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
> + */
> +
> +#include <stdio.h>
> +#include "utf8chk.h"
> +
> +void is_valid_utf8_init(struct is_valid_utf8_state *s) {
> +    s->out = 0;
> +    s->count = s->icount = 0;
> +}
> +
> +int is_valid_utf8(struct is_valid_utf8_state *s, int in) {
> +    unsigned char c = (unsigned char)in;
> +
> +    if (in == EOF)
> +     return s->icount == 0;
> +
> +    if (s->icount == 0) {
> +     if (c < 0x80)
> +         return 1;
> +     if (c >= 0xfe)
> +         return 0;
> +     /* perhaps add s->count = 0; but for now assume
> +      * that _init is called first */
> +     if (c >= 0xfc)
> +         s->count = 5, s->out = c & 0x1;
> +     else if (c >= 0xf8)
> +         s->count = 4, s->out = c & 0x3;
> +     else if (c >= 0xf0)
> +         s->count = 3, s->out = c & 0x7;
> +     else if (c >= 0xe0)
> +         s->count = 2, s->out = c & 0x0f;
> +     /* c2 rather than c0 catches overlong sequences right away */
> +     else if (c >= 0xc2)
> +         s->count = 1, s->out = c & 0x1f;
> +     s->icount = s->count;
> +     return s->count != 0;
> +    } else {
> +     s->out <<= 6;
> +     if (c < 0x80 || c >= 0xc0)
> +             return 0;
> +     }
> +     s->out |= (c & 0x3f);
> +     if (-- s->icount)
> +         return 1;
> +
> +     if (s->out == 0xfffe || s->out == 0xffff)
> +         return 0;
> +     if (s->out >= 0xd800 && s->out < 0xe000)
> +         return 0;
> +     /* note this "overlong sequence" check does not detect
> +      * 0x40...0x7f for 2-byte sequences, hence check that separately
> +      * - luckily these invalid 2-byte sequences have 0xc0 or 0xc1 as
> +      * their first byte */
> +     if (s->out < (1u << (5 * s->count + 1)))
> +         return 0;
> +     return 1;
> +}
> +
> +#ifdef TEST
> +#include <stdlib.h>
> +int main(int argc, char **argv) {
> +    int x;
> +    struct is_valid_utf8_state s;
> +    unsigned long count = 0;
> +
> +    is_valid_utf8_init(&s);
> +
> +    while (1) {
> +     x = fgetc(stdin);
> +     if (!is_valid_utf8(&s, x)) {
> +         printf("BAD character at position %lu\n", count);
> +         exit(EXIT_FAILURE);
> +     }
> +
> +     if (x == EOF) {
> +         printf("OK\n");
> +         break;
> +     }
> +     count ++;
> +    }
> +    return 0;
> +}
> +#endif




-- 
Thomas Roessler · Personal soap box at <http://log.does-not-exist.org/>.

Attachment: pgpVUFyHX8SBC.pgp
Description: PGP signature

Follow-Ups:
- Re: For 1.5.8: PGP fallback to body_charset for non-UTF-8 input
  - From: Matthias Andree

References:
- For 1.5.8: PGP fallback to body_charset for non-UTF-8 input
  - From: Matthias Andree

Prev by Date: Re: 1.5.7 BUG: character set in traditional PGP
Next by Date: Re: For 1.5.8: fmtstring, assumed, filecharset, iconvhook, flags and manual
Previous by thread: For 1.5.8: PGP fallback to body_charset for non-UTF-8 input
Next by thread: Re: For 1.5.8: PGP fallback to body_charset for non-UTF-8 input
Index(es):
- Date
- Thread