<<< Date Index >>>     <<< Thread Index >>>

Re: IMAP server side search integration



On Monday, 05 September 2005 at 08:06, Brendan Cully wrote:
> On Monday, 05 September 2005 at 11:14, James Raftery wrote:
> > Hi,
> > 
> > On Sat, Sep 03, 2005 at 10:53:09AM -0700, Brendan Cully wrote:
> > > 1. Silently treat the arguments to ~(b|h|H) as simple strings and pass
> > >    them to the server. You'd lose the ability to do full-text regular
> > >    expression searches on IMAP folders. On the other hand, client-side
> > >    searches are currently painful enough that probably no one does
> > >    them anyway.
> > 
> > Please don't remove full-text regexp searches. Yes, they're very slow
> > and very inefficient but in some cases they're the only way to find what
> > you might be looking for. If a user is prepared to wait I think they
> > should be permitted to.
> > 
> > > 3. A modifier for ~b..., eg $~b or $~h, indicating that the parameters
> > >    are substrings rather than regular expressions. Would people
> > >    actually remember to use it or is it just a nuisance?
> > 
> > This would be my preference.
> 
> I'm working on this now.

This patch adds a '$' modifier to the pattern language that turns
regexp searches into simple string matches. Server-side search only
happens when $ is in effect.

Examples:
~b mutt: client regexp match
$~b mutt: server string match
$(~b mutt ~f brendan): server string match, client string match
$(~b mutt $~f brendan): server string match, client regexp match.

I'm going to wait to add docs until it seems we have a consensus that
this is the way to go...
diff -r 0946eb46a255 imap/command.c
--- a/imap/command.c    Mon Sep  5 11:23:32 2005
+++ b/imap/command.c    Mon Sep  5 09:20:30 2005
@@ -44,6 +44,7 @@
 static void cmd_parse_lsub (IMAP_DATA* idata, char* s);
 static void cmd_parse_fetch (IMAP_DATA* idata, char* s);
 static void cmd_parse_myrights (IMAP_DATA* idata, char* s);
+static void cmd_parse_search (IMAP_DATA* idata, char* s);
 
 static char *Capabilities[] = {
   "IMAP4",
@@ -116,6 +117,9 @@
                  cmd->blen));
     }
 
+    /* back up over '\0' */
+    if (len)
+      len--;
     c = mutt_socket_readln (cmd->buf + len, cmd->blen - len, idata->conn);
     if (c <= 0)
     {
@@ -367,6 +371,8 @@
     cmd_parse_lsub (idata, s);
   else if (ascii_strncasecmp ("MYRIGHTS", s, 8) == 0)
     cmd_parse_myrights (idata, s);
+  else if (ascii_strncasecmp ("SEARCH", s, 6) == 0)
+    cmd_parse_search (idata, s);
   else if (ascii_strncasecmp ("BYE", s, 3) == 0)
   {
     dprint (2, (debugfile, "Handling BYE\n"));
@@ -624,3 +630,36 @@
     s++;
   }
 }
+
+/* Find the index in idata->ctx->hdrs of the message whose UID is uid, or -1.
+ * This linear scan should be optimised (eg with a tree or hash) */
+static int uid2msgno (IMAP_DATA* idata, unsigned int uid)
+{
+  int msgno = 0;
+
+  while (msgno < idata->ctx->msgcount)
+  {
+    if (HEADER_DATA(idata->ctx->hdrs[msgno])->uid == uid)
+      return msgno;
+    msgno++;
+  }
+  return -1;
+}
+
+/* cmd_parse_search: set hdr->matched for each message whose UID is listed in
+ *   the SEARCH response at s.  UIDs can exceed INT_MAX, hence strtoul. */
+static void cmd_parse_search (IMAP_DATA* idata, char* s)
+{
+  unsigned int uid;
+  int msgno;
+
+  dprint (2, (debugfile, "Handling SEARCH\n"));
+
+  while ((s = imap_next_word (s)) && *s != '\0')
+  {
+    uid = (unsigned int) strtoul (s, NULL, 10);
+    msgno = uid2msgno (idata, uid);
+    if (msgno >= 0)
+      idata->ctx->hdrs[msgno]->matched = 1; /* reuse the lookup result */
+  }
+}
diff -r 0946eb46a255 imap/imap.c
--- a/imap/imap.c       Mon Sep  5 11:23:32 2005
+++ b/imap/imap.c       Mon Sep  5 09:20:30 2005
@@ -1298,6 +1298,151 @@
   return msgcount;
 }
 
+/* Count the patterns in the list starting at search that should be searched
+ * server-side, ie full-text ops compiled as string matches.  If allpats is
+ * zero only the first pattern is examined. */
+static int do_search (const pattern_t* search, int allpats)
+{
+  const pattern_t* cur = search;
+  int count = 0;
+
+  while (cur)
+  {
+    if (cur->op == M_BODY || cur->op == M_HEADER || cur->op == M_WHOLE_MSG)
+    {
+      /* full-text ops qualify only when string matching is in effect */
+      if (cur->stringmatch)
+        count++;
+    }
+    /* any other pattern counts once if something beneath it qualifies */
+    else if (cur->child && do_search (cur->child, 1))
+      count++;
+
+    if (!allpats)
+      break;
+
+    cur = cur->next;
+  }
+
+  return count;
+}
+
+/* Append to buf the IMAP SEARCH keys for those parts of pat that need a
+ * server-side full-text search (mutt already has what it needs for most match
+ * types, and does a better job, eg the server doesn't support regexps).
+ * Returns 0 on success (possibly having appended nothing), -1 on error. */
+static int imap_compile_search (const pattern_t* pat, BUFFER* buf)
+{
+  char term[STRING];
+
+  if (! do_search (pat, 0))
+    return 0;
+
+  if (pat->not)
+    mutt_buffer_addstr (buf, "NOT ");
+
+  if (pat->child)
+  {
+    int clauses;
+
+    if ((clauses = do_search (pat->child, 1)) > 0)
+    {
+      const pattern_t* clause = pat->child;
+
+      mutt_buffer_addch (buf, '(');
+
+      while (clauses)
+      {
+        if (do_search (clause, 0))
+        {
+          if (pat->op == M_OR && clauses > 1)
+            mutt_buffer_addstr (buf, "OR ");
+          clauses--;
+
+          if (imap_compile_search (clause, buf) < 0)
+            return -1;
+
+          if (clauses)
+            mutt_buffer_addch (buf, ' ');
+        }
+        /* advance even past client-only clauses, else this loops forever */
+        clause = clause->next;
+      }
+
+      mutt_buffer_addch (buf, ')');
+    }
+  }
+  else
+  {
+    char *delim;
+
+    switch (pat->op)
+    {
+      case M_HEADER:
+        mutt_buffer_addstr (buf, "HEADER ");
+        /* extract header name (pat->str is cut at ':' and restored below) */
+        if (! (delim = strchr (pat->str, ':')))
+        {
+          mutt_error (_("Header search without header name: %s"), pat->str);
+          return -1;
+        }
+        *delim = '\0';
+        imap_quote_string (term, sizeof (term), pat->str);
+        mutt_buffer_addstr (buf, term);
+        mutt_buffer_addch (buf, ' ');
+
+        /* and field */
+        *delim = ':';
+        delim++;
+        SKIPWS(delim);
+        imap_quote_string (term, sizeof (term), delim);
+        mutt_buffer_addstr (buf, term);
+        break;
+      case M_BODY:
+        mutt_buffer_addstr (buf, "BODY ");
+        imap_quote_string (term, sizeof (term), pat->str);
+        mutt_buffer_addstr (buf, term);
+        break;
+      case M_WHOLE_MSG:
+        mutt_buffer_addstr (buf, "TEXT ");
+        imap_quote_string (term, sizeof (term), pat->str);
+        mutt_buffer_addstr (buf, term);
+        break;
+    }
+  }
+
+  return 0;
+}
+
+/* Clear the matched flag of every message in ctx, then, if pat contains any
+ * server-side terms, compile them into a "UID SEARCH" command and run it with
+ * imap_exec.  Returns 0 on success or when nothing needs sending, else -1. */
+int imap_search (CONTEXT* ctx, const pattern_t* pat)
+{
+  IMAP_DATA* idata = (IMAP_DATA*)ctx->data;
+  BUFFER cmd;
+  int rc = -1;
+  int n = 0;
+
+  /* forget the results of any earlier search */
+  while (n < ctx->msgcount)
+    ctx->hdrs[n++]->matched = 0;
+
+  /* purely client-side pattern: no command to send */
+  if (do_search (pat, 1) == 0)
+    return 0;
+
+  memset (&cmd, 0, sizeof (cmd));
+  mutt_buffer_addstr (&cmd, "UID SEARCH ");
+  if (imap_compile_search (pat, &cmd) >= 0
+      && imap_exec (idata, cmd.data, 0) >= 0)
+    rc = 0;
+
+  /* one exit path frees the command text on success and failure alike */
+  FREE (&cmd.data);
+  return rc;
+}
+
 /* all this listing/browsing is a mess. I don't like that name is a pointer
  *   into idata->buf (used to be a pointer into the passed in buffer, just
  *   as bad), nor do I like the fact that the fetch is done here. This
diff -r 0946eb46a255 imap/imap.h
--- a/imap/imap.h       Mon Sep  5 11:23:32 2005
+++ b/imap/imap.h       Mon Sep  5 09:20:30 2005
@@ -41,6 +41,7 @@
 void imap_close_mailbox (CONTEXT *ctx);
 int imap_buffy_check (char *path);
 int imap_mailbox_check (char *path, int new);
+int imap_search (CONTEXT* ctx, const pattern_t* pat);
 int imap_subscribe (char *path, int subscribe);
 int imap_complete (char* dest, size_t dlen, char* path);
 
diff -r 0946eb46a255 mutt.h
--- a/mutt.h    Mon Sep  5 11:23:32 2005
+++ b/mutt.h    Mon Sep  5 09:20:30 2005
@@ -782,7 +782,8 @@
 
 
 /* flag to mutt_pattern_comp() */
-#define M_FULL_MSG     1       /* enable body and header matching */
+#define M_FULL_MSG     (1<<0)  /* enable body and header matching */
+#define M_STRINGMATCH   (1<<1)  /* use string match instead of regexp */
 
 typedef enum {
   M_MATCH_FULL_ADDRESS = 1
@@ -791,12 +792,14 @@
 typedef struct pattern_t
 {
   short op;
-  short not;
-  short alladdr;
+  unsigned int not : 1;
+  unsigned int alladdr : 1;
+  unsigned int stringmatch : 1;
   int min;
   int max;
   struct pattern_t *next;
   struct pattern_t *child;             /* arguments to logical op */
+  char *str;
   regex_t *rx;
 } pattern_t;
 
diff -r 0946eb46a255 pattern.c
--- a/pattern.c Mon Sep  5 11:23:32 2005
+++ b/pattern.c Mon Sep  5 09:20:30 2005
@@ -35,9 +35,15 @@
 
 #include "mutt_crypt.h"
 
+#ifdef USE_IMAP
+#include "mx.h"
+#include "imap/imap.h"
+#endif
+
 static int eat_regexp (pattern_t *pat, BUFFER *, BUFFER *);
 static int eat_date (pattern_t *pat, BUFFER *, BUFFER *);
 static int eat_range (pattern_t *pat, BUFFER *, BUFFER *);
+static int patmatch (const pattern_t *pat, const char *buf);
 
 struct pattern_flags
 {
@@ -136,7 +142,7 @@
 }
 
 static int
-msg_search (CONTEXT *ctx, regex_t *rx, int op, int msgno)
+msg_search (CONTEXT *ctx, pattern_t* pat, int msgno)
 {
   char tempfile[_POSIX_PATH_MAX];
   MESSAGE *msg = NULL;
@@ -164,10 +170,10 @@
        return (0);
       }
 
-      if (op != M_BODY)
+      if (pat->op != M_BODY)
        mutt_copy_header (msg->fp, h, s.fpout, CH_FROM | CH_DECODE, NULL);
 
-      if (op != M_HEADER)
+      if (pat->op != M_HEADER)
       {
        mutt_parse_mime_message (ctx, h);
 
@@ -197,14 +203,14 @@
     {
       /* raw header / body */
       fp = msg->fp;
-      if (op != M_BODY)
+      if (pat->op != M_BODY)
       {
        fseek (fp, h->offset, 0);
        lng = h->content->offset - h->offset;
       }
-      if (op != M_HEADER)
+      if (pat->op != M_HEADER)
       {
-       if (op == M_BODY)
+       if (pat->op == M_BODY)
          fseek (fp, h->content->offset, 0);
        lng += h->content->length;
       }
@@ -216,14 +222,14 @@
     /* search the file "fp" */
     while (lng > 0)
     {
-      if (op == M_HEADER)
+      if (pat->op == M_HEADER)
       {
        if (*(buf = mutt_read_rfc822_line (fp, buf, &blen)) == '\0')
          break;
       }
       else if (fgets (buf, blen - 1, fp) == NULL)
        break; /* don't loop forever */
-      if (regexec (rx, buf, 0, NULL, 0) == 0)
+      if (patmatch (pat, buf) == 0)
       {
        match = 1;
        break;
@@ -257,16 +263,26 @@
     snprintf (err->data, err->dsize, _("Error in expression: %s"), s->dptr);
     return (-1);
   }
-  pat->rx = safe_malloc (sizeof (regex_t));
-  r = REGCOMP (pat->rx, buf.data, REG_NEWLINE | REG_NOSUB | mutt_which_case 
(buf.data));
-  FREE (&buf.data);
-  if (r)
-  {
-    regerror (r, pat->rx, err->data, err->dsize);
-    regfree (pat->rx);
-    FREE (&pat->rx);
-    return (-1);
-  }
+
+  if (pat->stringmatch)
+  {
+    pat->str = safe_strdup (buf.data);
+    FREE (&buf.data);
+  }
+  else
+  {
+    pat->rx = safe_malloc (sizeof (regex_t));
+    r = REGCOMP (pat->rx, buf.data, REG_NEWLINE | REG_NOSUB | mutt_which_case 
(buf.data));
+    FREE (&buf.data);
+    if (r)
+    {
+      regerror (r, pat->rx, err->data, err->dsize);
+      regfree (pat->rx);
+      FREE (&pat->rx);
+      return (-1);
+    }
+  }
+
   return 0;
 }
 
@@ -666,6 +682,14 @@
   return 0;
 }
 
+/* 0 if buf matches pat (substring when stringmatch, else regexec), else nonzero */
+static int patmatch (const pattern_t* pat, const char* buf)
+{
+  if (!pat->stringmatch)
+    return regexec (pat->rx, buf, 0, NULL, 0);
+  return strstr (buf, pat->str) == NULL;
+}
+
 static struct pattern_flags *lookup_tag (char tag)
 {
   int i;
@@ -708,6 +732,7 @@
       regfree (tmp->rx);
       FREE (&tmp->rx);
     }
+    FREE (&tmp->str);
     if (tmp->child)
       mutt_pattern_free (&tmp->child);
     FREE (&tmp);
@@ -721,6 +746,7 @@
   pattern_t *last = NULL;
   int not = 0;
   int alladdr = 0;
+  int stringmatch = 0;
   int or = 0;
   int implicit = 1;    /* used to detect logical AND operator */
   struct pattern_flags *entry;
@@ -731,6 +757,9 @@
   memset (&ps, 0, sizeof (ps));
   ps.dptr = s;
   ps.dsize = mutt_strlen (s);
+
+  if (flags & M_STRINGMATCH)
+    stringmatch = 1;
 
   while (*ps.dptr)
   {
@@ -745,6 +774,10 @@
        ps.dptr++;
        not = !not;
        break;
+      case '$':
+        ps.dptr++;
+        stringmatch = !stringmatch;
+        break;
       case '|':
        if (!or)
        {
@@ -770,6 +803,7 @@
        implicit = 0;
        not = 0;
        alladdr = 0;
+        stringmatch = flags & M_STRINGMATCH ? 1 : 0;
        break;
       case '~':
        if (implicit && or)
@@ -786,8 +820,10 @@
        tmp = new_pattern ();
        tmp->not = not;
        tmp->alladdr = alladdr;
+        tmp->stringmatch = stringmatch;
        not = 0;
-       alladdr=0;
+       alladdr = 0;
+        stringmatch = flags & M_STRINGMATCH ? 1 : 0;
 
        if (last)
          last->next = tmp;
@@ -839,6 +875,10 @@
        }
        /* compile the sub-expression */
        buf = mutt_substrdup (ps.dptr + 1, p);
+        if (stringmatch)
+          flags |= M_STRINGMATCH;
+        else
+          flags &= ~M_STRINGMATCH;
        if ((tmp = mutt_pattern_comp (buf, flags, err)) == NULL)
        {
          FREE (&buf);
@@ -896,8 +936,7 @@
   return 0;
 }
 
-static int match_adrlist (regex_t *rx, int match_personal, int alladdr,
-                         int n, ...)
+static int match_adrlist (pattern_t *pat, int match_personal, int n, ...)
 {
   va_list ap;
   ADDRESS *a;
@@ -907,24 +946,22 @@
   {
     for (a = va_arg (ap, ADDRESS *) ; a ; a = a->next)
     {
-      if (alladdr^
-         ((a->mailbox && regexec (rx, a->mailbox, 0, NULL, 0) == 0) ||
-          (match_personal && a->personal &&
-           regexec (rx, a->personal, 0, NULL, 0) == 0)))
+      if (pat->alladdr ^ ((a->mailbox && patmatch (pat, a->mailbox) == 0) ||
+          (match_personal && a->personal && patmatch (pat, a->personal) == 0)))
       {
        va_end (ap);
-       return (! alladdr); /* Found match, or non-match if alladdr */
+       return (! pat->alladdr); /* Found match, or non-match if alladdr */
       }
     }
   }
   va_end (ap);
-  return alladdr; /* No matches, or all matches if alladdr */
-}
-
-static int match_reference (regex_t *rx, LIST *refs)
+  return pat->alladdr; /* No matches, or all matches if alladdr */
+}
+
+static int match_reference (pattern_t *pat, LIST *refs)
 {
   for (; refs; refs = refs->next)
-    if (regexec (rx, refs->data, 0, NULL, 0) == 0)
+    if (patmatch (pat, refs->data) == 0)
       return 1;
   return 0;
 }
@@ -1013,47 +1050,50 @@
     case M_BODY:
     case M_HEADER:
     case M_WHOLE_MSG:
-      return (pat->not ^ msg_search (ctx, pat->rx, pat->op, h->msgno));
+#ifdef USE_IMAP
+      /* IMAP search sets h->matched at search compile time */
+      if (Context->magic == M_IMAP && pat->stringmatch)
+       return (h->matched);
+#endif
+      return (pat->not ^ msg_search (ctx, pat, h->msgno));
     case M_SENDER:
-      return (pat->not ^ match_adrlist (pat->rx, flags & M_MATCH_FULL_ADDRESS,
-                                       pat->alladdr, 1, h->env->sender));
+      return (pat->not ^ match_adrlist (pat, flags & M_MATCH_FULL_ADDRESS, 1,
+                                        h->env->sender));
     case M_FROM:
-      return (pat->not ^ match_adrlist (pat->rx, flags & M_MATCH_FULL_ADDRESS,
-                                       pat->alladdr, 1, h->env->from));
+      return (pat->not ^ match_adrlist (pat, flags & M_MATCH_FULL_ADDRESS, 1,
+                                        h->env->from));
     case M_TO:
-      return (pat->not ^ match_adrlist (pat->rx, flags & M_MATCH_FULL_ADDRESS,
-                                       pat->alladdr, 1, h->env->to));
+      return (pat->not ^ match_adrlist (pat, flags & M_MATCH_FULL_ADDRESS, 1,
+                                        h->env->to));
     case M_CC:
-      return (pat->not ^ match_adrlist (pat->rx, flags & M_MATCH_FULL_ADDRESS,
-                                       pat->alladdr, 1, h->env->cc));
+      return (pat->not ^ match_adrlist (pat, flags & M_MATCH_FULL_ADDRESS, 1,
+                                        h->env->cc));
     case M_SUBJECT:
-      return (pat->not ^ (h->env && h->env->subject && regexec (pat->rx, 
h->env->subject, 0, NULL, 0) == 0));
+      return (pat->not ^ (h->env->subject && patmatch (pat, h->env->subject) 
== 0));
     case M_ID:
-      return (pat->not ^ (h->env && h->env->message_id && regexec (pat->rx, 
h->env->message_id, 0, NULL, 0) == 0));
+      return (pat->not ^ (h->env->message_id && patmatch (pat, 
h->env->message_id) == 0));
     case M_SCORE:
       return (pat->not ^ (h->score >= pat->min && (pat->max == M_MAXRANGE ||
                                                   h->score <= pat->max)));
     case M_SIZE:
       return (pat->not ^ (h->content->length >= pat->min && (pat->max == 
M_MAXRANGE || h->content->length <= pat->max)));
     case M_REFERENCE:
-      return (pat->not ^ match_reference (pat->rx, h->env->references));
+      return (pat->not ^ match_reference (pat, h->env->references));
     case M_ADDRESS:
-      return (pat->not ^ (h->env && match_adrlist (pat->rx, flags & 
M_MATCH_FULL_ADDRESS,
-                                       pat->alladdr, 4, h->env->from,
-                                       h->env->sender, h->env->to, 
h->env->cc)));
+      return (pat->not ^ match_adrlist (pat, flags & M_MATCH_FULL_ADDRESS, 4,
+                                        h->env->from, h->env->sender,
+                                        h->env->to, h->env->cc));
     case M_RECIPIENT:
-           return (pat->not ^ (h->env && match_adrlist (pat->rx, flags & 
M_MATCH_FULL_ADDRESS,
-                                       pat->alladdr, 2, h->env->to, 
h->env->cc)));
+           return (pat->not ^ match_adrlist (pat, flags & M_MATCH_FULL_ADDRESS,
+                                             2, h->env->to, h->env->cc));
     case M_LIST:       /* known list, subscribed or not */
-      return (pat->not ^ (h->env
-       && mutt_is_list_cc (pat->alladdr, h->env->to, h->env->cc)));
+      return (pat->not ^ mutt_is_list_cc (pat->alladdr, h->env->to, 
h->env->cc));
     case M_SUBSCRIBED_LIST:
-      return (pat->not ^ (h->env
-       && mutt_is_list_recipient (pat->alladdr, h->env->to, h->env->cc)));
+      return (pat->not ^ mutt_is_list_recipient (pat->alladdr, h->env->to, 
h->env->cc));
     case M_PERSONAL_RECIP:
-      return (pat->not ^ (h->env && match_user (pat->alladdr, h->env->to, 
h->env->cc)));
+      return (pat->not ^ match_user (pat->alladdr, h->env->to, h->env->cc));
     case M_PERSONAL_FROM:
-      return (pat->not ^ (h->env && match_user (pat->alladdr, h->env->from, 
NULL)));
+      return (pat->not ^ match_user (pat->alladdr, h->env->from, NULL));
     case M_COLLAPSED:
       return (pat->not ^ (h->collapsed && h->num_hidden > 1));
    case M_CRYPT_SIGN:
@@ -1073,9 +1113,9 @@
        break;
      return (pat->not ^ ((h->security & APPLICATION_PGP) && (h->security & 
PGPKEY)));
     case M_XLABEL:
-      return (pat->not ^ (h->env->x_label && regexec (pat->rx, 
h->env->x_label, 0, NULL, 0) == 0));
+      return (pat->not ^ (h->env->x_label && patmatch (pat, h->env->x_label) 
== 0));
     case M_HORMEL:
-      return (pat->not ^ (h->env->spam && h->env->spam->data && regexec 
(pat->rx, h->env->spam->data, 0, NULL, 0) == 0));
+      return (pat->not ^ (h->env->spam && h->env->spam->data && patmatch (pat, 
h->env->spam->data) == 0));
     case M_DUPLICATED:
       return (pat->not ^ (h->thread && h->thread->duplicate_thread));
     case M_UNREFERENCED:
@@ -1171,6 +1211,11 @@
     return (-1);
   }
 
+#ifdef USE_IMAP
+  if (Context->magic == M_IMAP && imap_search (Context, pat) < 0)
+    return -1;
+#endif
+  
   mutt_message _("Executing command on matching messages...");
 
 #define THIS_BODY Context->hdrs[i]->content
@@ -1303,6 +1348,10 @@
   {
     for (i = 0; i < Context->msgcount; i++)
       Context->hdrs[i]->searched = 0;
+#ifdef USE_IMAP
+    if (Context->magic == M_IMAP && imap_search (Context, SearchPattern) < 0)
+      return -1;
+#endif
     unset_option (OPTSEARCHINVALID);
   }
 

Attachment: pgpE1tAEpjkOV.pgp
Description: PGP signature