<<< Date Index >>>     <<< Thread Index >>>

Re: IMAP server side search integration



On Saturday, 03 September 2005 at 23:54, Derek Martin wrote:
> On Sat, Sep 03, 2005 at 10:53:09AM -0700, Brendan Cully wrote:
> > 1. Silently treat the arguments to ~(b|h|H) as simple strings and pass
> >    them to the server. You'd lose the ability to do full-text regular
> >    expression searches on IMAP folders. On the other hand, client-side
> >    searches are currently painful enough that probably no one does
> >    them anyway.
> 
> This is probably the easiest to code... but the down side is it
> introduces yet another folder-format-specific inconsistency in mutt's
> UI, which is something that I personally don't like.  I strongly
> believe that the data format should have no impact on the behavior of
> the UI, ever.

I would like to agree, but in practice some formats make some
operations much cheaper or more expensive than others. To really
preserve consistency we'd just keep the status quo - super slow
client-side full text searches. When performance gets bad enough I
think it's probably worth sacrificing some consistency.

> > 2. Three more pattern commands. Since there aren't many letters left,
> >    these would end up being pretty obscure, and normal body searches
> >    would still have to fetch every message. I don't really like this
> >    option.
> 
> But, from the standpoint of UI consistency, this is the best option.
> I also think it would be useful to have a substing search feature for
> other formats... It is generally less expensive than regex, and I
> think more often than not substrings are what people want to search
> for.  Thus it is an all-around win, IMO.

I'm surprised this one got a vote. It was far and away my least
favourite.

> Note that in truth, I never use this option whatever, so my advocacy
> is based purely on what I think The Right Way is.  Of course,
> different opinions can be valid too.
> 
> > 3. A modifier for ~b..., eg $~b or $~h, indicating that the parameters
> >    are substrings rather than regular expressions. Would people
> >    actually remember to use it or is it just a nuisance?
> 
> To my eye, this seems the least desireable of the options you present.
> It seems needlessly complex for the user.

This option got a couple votes, and I prefer it to three new letters,
so I'll probably hack it up. In the mean time, if anyone wants to play
with it, here's a patch that implements option 1 (silently using
strings instead of regexps), just because, as you guessed, it was the
simplest to code.

I haven't done a whole lot of testing with it yet, and I have my
doubts about whether it interacts properly with complex server/client
mixed clauses. But it seems to do the trick for the few simple cases I
tried.
# HG changeset patch
# User Brendan Cully <brendan@xxxxxxxxxx>
# Node ID 9e7f6e2fd959d28e0f84b925fb4d831f5f52a037
# Parent  796908c4398ba505e4d0c2c8d48257c506aeabeb
First draft of server-side searching

diff -r 796908c4398b -r 9e7f6e2fd959 imap/command.c
--- a/imap/command.c    Sun Sep  4 21:18:50 2005
+++ b/imap/command.c    Mon Sep  5 03:16:32 2005
@@ -44,6 +44,7 @@
 static void cmd_parse_lsub (IMAP_DATA* idata, char* s);
 static void cmd_parse_fetch (IMAP_DATA* idata, char* s);
 static void cmd_parse_myrights (IMAP_DATA* idata, char* s);
+static void cmd_parse_search (IMAP_DATA* idata, char* s);
 
 static char *Capabilities[] = {
   "IMAP4",
@@ -367,6 +368,8 @@
     cmd_parse_lsub (idata, s);
   else if (ascii_strncasecmp ("MYRIGHTS", s, 8) == 0)
     cmd_parse_myrights (idata, s);
+  else if (ascii_strncasecmp ("SEARCH", s, 6) == 0)
+    cmd_parse_search (idata, s);
   else if (ascii_strncasecmp ("BYE", s, 3) == 0)
   {
     dprint (2, (debugfile, "Handling BYE\n"));
@@ -624,3 +627,33 @@
     s++;
   }
 }
+
+/* This should be optimised (eg with a tree or hash) */
+static int uid2msgno (IMAP_DATA* idata, unsigned int uid)
+{
+  int i;
+  
+  for (i = 0; i < idata->ctx->msgcount; i++)
+  {
+    HEADER* h = idata->ctx->hdrs[i];
+    if (HEADER_DATA(h)->uid == uid)
+      break;
+  }
+  
+  return i;
+}
+
+/* cmd_parse_search: store SEARCH response for later use */
+static void cmd_parse_search (IMAP_DATA* idata, char* s)
+{
+  unsigned int uid;
+
+  dprint (2, (debugfile, "Handling SEARCH\n"));
+
+  while ((s = imap_next_word (s)) && *s != '\0')
+  {
+    uid = atoi (s);
+    
+    idata->ctx->hdrs[uid2msgno (idata, uid)]->matched = 1;
+  }
+}
diff -r 796908c4398b -r 9e7f6e2fd959 imap/imap.c
--- a/imap/imap.c       Sun Sep  4 21:18:50 2005
+++ b/imap/imap.c       Mon Sep  5 03:16:32 2005
@@ -1298,6 +1298,149 @@
   return msgcount;
 }
 
+/* returns number of patterns in the search that should be done server-side
+ * (eg are full-text) */
+static int do_search (const pattern_t* search, int allpats)
+{
+  int rc = 0;
+  const pattern_t* pat;
+
+  for (pat = search; pat; pat = pat->next)
+  {
+    switch (pat->op)
+    {
+      case M_BODY:
+      case M_HEADER:
+      case M_WHOLE_MSG:
+        rc++;
+        break;
+      default:
+        if (pat->child && do_search (pat->child, 1))
+          rc++;
+    }
+    
+    if (!allpats)
+      break;
+  }
+  
+  return rc;
+}
+
+/* convert mutt pattern_t to IMAP SEARCH command containing only elements
+ * that require full-text search (mutt already has what it needs for most
+ * match types, and does a better job (eg server doesn't support regexps). */
+static int imap_compile_search (const pattern_t* search, BUFFER* buf)
+{
+  char term[STRING];
+
+  if (do_search (search, 0) && search->not)
+    mutt_buffer_addstr (buf, "NOT ");
+
+  if (search->child)
+  {
+    int clauses;
+
+    if ((clauses = do_search (search->child, 1)) > 0)
+    {
+      const pattern_t* clause = search->child;
+
+      mutt_buffer_addch (buf, '(');
+
+      while (clauses)
+      {
+        if (do_search (clause, 0))
+        {
+          if (search->op == M_OR && clauses > 1)
+            mutt_buffer_addstr (buf, "OR ");
+          clauses--;
+          
+          if (imap_compile_search (clause, buf) < 0)
+            return -1;
+
+          if (clauses)
+            mutt_buffer_addch (buf, ' ');
+          
+          clause = clause->next;
+        }
+      }
+
+      mutt_buffer_addch (buf, ')');
+    }
+  }
+  else
+  {
+    char *delim;
+
+    switch (search->op)
+    {
+      case M_HEADER:
+        mutt_buffer_addstr (buf, "HEADER ");
+
+        /* extract header name */
+        if (! (delim = strchr (search->substr, ':')))
+        {
+          mutt_error (_("Header search without header name: %s"), 
search->substr);
+          return -1;
+        }
+        *delim = '\0';
+        imap_quote_string (term, sizeof (term), search->substr);
+        mutt_buffer_addstr (buf, term);
+        mutt_buffer_addch (buf, ' ');
+        
+        /* and field */
+        *delim = ':';
+        delim++;
+        SKIPWS(delim);
+        imap_quote_string (term, sizeof (term), delim);
+        mutt_buffer_addstr (buf, term);
+        break;
+      case M_BODY:
+        mutt_buffer_addstr (buf, "BODY ");
+        imap_quote_string (term, sizeof (term), search->substr);
+        mutt_buffer_addstr (buf, term);
+        break;
+      case M_WHOLE_MSG:
+        mutt_buffer_addstr (buf, "TEXT ");
+        imap_quote_string (term, sizeof (term), search->substr);
+        mutt_buffer_addstr (buf, term);
+        break;
+      default:
+        return 0;
+    }
+  }
+
+  return 0;
+}
+
+int imap_search (CONTEXT* ctx, const pattern_t* search)
+{
+  BUFFER buf;
+  IMAP_DATA* idata = (IMAP_DATA*)ctx->data;
+  int i;
+
+  for (i = 0; i < ctx->msgcount; i++)
+    ctx->hdrs[i]->matched = 0;
+
+  if (!do_search (search, 1))
+    return 0;
+
+  memset (&buf, 0, sizeof (buf));
+  mutt_buffer_addstr (&buf, "UID SEARCH ");
+  if (imap_compile_search (search, &buf) < 0)
+  {
+    FREE (&buf.data);
+    return -1;
+  }
+  if (imap_exec (idata, buf.data, 0) < 0)
+  {
+    FREE (&buf.data);
+    return -1;
+  }
+
+  FREE (&buf.data);
+  return 0;
+}
+
 /* all this listing/browsing is a mess. I don't like that name is a pointer
  *   into idata->buf (used to be a pointer into the passed in buffer, just
  *   as bad), nor do I like the fact that the fetch is done here. This
diff -r 796908c4398b -r 9e7f6e2fd959 imap/imap.h
--- a/imap/imap.h       Sun Sep  4 21:18:50 2005
+++ b/imap/imap.h       Mon Sep  5 03:16:32 2005
@@ -41,6 +41,7 @@
 void imap_close_mailbox (CONTEXT *ctx);
 int imap_buffy_check (char *path);
 int imap_mailbox_check (char *path, int new);
+int imap_search (CONTEXT* ctx, const pattern_t* search);
 int imap_subscribe (char *path, int subscribe);
 int imap_complete (char* dest, size_t dlen, char* path);
 
diff -r 796908c4398b -r 9e7f6e2fd959 mutt.h
--- a/mutt.h    Sun Sep  4 21:18:50 2005
+++ b/mutt.h    Mon Sep  5 03:16:32 2005
@@ -798,6 +798,10 @@
   struct pattern_t *next;
   struct pattern_t *child;             /* arguments to logical op */
   regex_t *rx;
+#ifdef USE_IMAP
+  /* server-side search only understands substring searches, not regexp */
+  char *substr;
+#endif
 } pattern_t;
 
 typedef struct
diff -r 796908c4398b -r 9e7f6e2fd959 pattern.c
--- a/pattern.c Sun Sep  4 21:18:50 2005
+++ b/pattern.c Mon Sep  5 03:16:32 2005
@@ -34,6 +34,13 @@
 #include <stdarg.h>
 
 #include "mutt_crypt.h"
+
+#ifdef USE_IMAP
+#include "mx.h"
+#include "imap/imap.h"
+
+static int eat_substr (pattern_t *pat, BUFFER *, BUFFER *);
+#endif
 
 static int eat_regexp (pattern_t *pat, BUFFER *, BUFFER *);
 static int eat_date (pattern_t *pat, BUFFER *, BUFFER *);
@@ -245,6 +252,23 @@
   return match;
 }
 
+static int eat_substr (pattern_t* pat, BUFFER *s, BUFFER *err)
+{
+  BUFFER buf;
+  
+  memset (&buf, 0, sizeof (buf));
+  if (mutt_extract_token (&buf, s, M_TOKEN_PATTERN) != 0 || !buf.data)
+  {
+    snprintf (err->data, err->dsize, _("Error in expression: %s"), s->dptr);
+    return -1;
+  }
+  
+  pat->substr = safe_strdup (buf.data);
+  FREE (&buf.data);
+  
+  return 0;
+}
+
 int eat_regexp (pattern_t *pat, BUFFER *s, BUFFER *err)
 {
   BUFFER buf;
@@ -708,6 +732,9 @@
       regfree (tmp->rx);
       FREE (&tmp->rx);
     }
+#ifdef USE_IMAP
+    FREE (&tmp->substr);
+#endif
     if (tmp->child)
       mutt_pattern_free (&tmp->child);
     FREE (&tmp);
@@ -821,6 +848,16 @@
            mutt_pattern_free (&curlist);
            return NULL;
          }
+#ifdef USE_IMAP
+          if (Context && Context->magic == M_IMAP && entry->class == 
M_FULL_MSG)
+          {
+            if (eat_substr (tmp, &ps, err) == -1)
+            {
+              mutt_pattern_free (&curlist);
+              return NULL;
+            }
+          } else
+#endif
          if (entry->eat_arg (tmp, &ps, err) == -1)
          {
            mutt_pattern_free (&curlist);
@@ -1013,6 +1050,10 @@
     case M_BODY:
     case M_HEADER:
     case M_WHOLE_MSG:
+#ifdef USE_IMAP
+      /* IMAP search sets h->matched at search compile time */
+      return (h->matched);
+#endif
       return (pat->not ^ msg_search (ctx, pat->rx, pat->op, h->msgno));
     case M_SENDER:
       return (pat->not ^ match_adrlist (pat->rx, flags & M_MATCH_FULL_ADDRESS,
@@ -1171,6 +1212,11 @@
     return (-1);
   }
 
+#ifdef USE_IMAP
+  if (Context->magic == M_IMAP && imap_search (Context, pat) < 0)
+    return -1;
+#endif
+  
   mutt_message _("Executing command on matching messages...");
 
 #define THIS_BODY Context->hdrs[i]->content
@@ -1303,6 +1349,10 @@
   {
     for (i = 0; i < Context->msgcount; i++)
       Context->hdrs[i]->searched = 0;
+#ifdef USE_IMAP
+    if (Context->magic == M_IMAP && imap_search (Context, SearchPattern) < 0)
+      return -1;
+#endif
     unset_option (OPTSEARCHINVALID);
   }
 

Attachment: pgpdV0LGk2Pke.pgp
Description: PGP signature