
Re: Length of regex/pattern for "color"



On Thu, Oct 23, 2003 at 03:48:20PM -0400, parv wrote:
> Yes, it is quite possible to shorten the regex as above.  That was
> what i was also thinking of yesterday night.
> That would help some, but not quite.  Above is an example only for
> 62 perl ports; i have 306 in total.

Yeah.. sounds like you really do need the HUGE_STRING or whatever, no
matter how much the regex is reduced in size.

> Your effort is welcomed too.

Ok.. I did write a small Python script that uses a relatively simple
method to reduce the regex length.  I have the feeling that its
effectiveness could be improved, perhaps by switching to a
character-by-character tree or something.  Anyway, the problem is harder
than I first thought, so I'll probably just leave it as it is. :-)
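
In case it helps, here's roughly what I mean by a character-by-character
tree (only a sketch, not what the attached script does; build_trie and
trie_to_regex are names I'm making up on the spot, and metacharacters
like '.' are left unescaped, same as in the script):

def build_trie(terms):
    "Insert each term into a nested dict, one character per level."
    root = {}
    for t in terms:
        node = root
        for ch in t:
            node = node.setdefault(ch, {})
        node[''] = {}               # empty key marks the end of a term
    return root

def trie_to_regex(node):
    "Turn the nested dict back into an alternation, sharing every prefix."
    if not node:
        return ""
    parts = []
    for ch in sorted(node):
        if ch == '':
            parts.append("")        # empty branch: a term may end here
            # (strict POSIX regexps may prefer an optional group instead)
        else:
            parts.append(ch + trie_to_regex(node[ch]))
    if len(parts) == 1:
        return parts[0]
    return "(%s)" % '|'.join(parts)

# e.g. trie_to_regex(build_trie(["www/p5-CGI.pm", "www/p5-CGI-Kwiki"]))
# gives "www/p5-CGI(-Kwiki|.pm)"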

Cheerio,
 Allister

-- 
Allister MacLeod <amacleod@xxxxxxxx>
 Elen síla lúmenn'omentielvo.
#!/usr/bin/python

import string

ExampleRegex="(archivers/p5-Archive-Tar|archivers/p5-Compress-Zlib|devel/p5-AppConfig|devel/p5-Class-Factory-Util|devel/p5-Class-Singleton|devel/p5-Config-General|devel/p5-Config-Ini|devel/p5-Date-Calc|devel/p5-Date-ICal|devel/p5-Date-ISO|devel/p5-Date-Leapyear|devel/p5-Date-Manip|devel/p5-Date-Pcalc|devel/p5-DateConvert|devel/p5-DateTime|devel/p5-DateTime-Locale|devel/p5-DateTime-TimeZone|devel/p5-ExtUtils-ParseXS|devel/p5-File-Temp|devel/p5-Inline|devel/p5-Inline-CPP|devel/p5-Locale-Maketext|devel/p5-Memoize|devel/p5-Module-Build|devel/p5-Params-Validate|devel/p5-Parse-RecDescent|devel/p5-Storable|devel/p5-Test-Harness|devel/p5-Test-Inline|devel/p5-Test-Simple|devel/p5-Tie-IxHash|devel/p5-Time-HiRes|devel/p5-Time-Local|devel/p5-Time-modules|devel/p5-TimeDate|dns/p5-Net-DNS|mail/p5-Mail-SpamAssassin|mail/p5-Mail-Tools|math/p5-Bit-Vector|misc/p5-I18N-LangTags|net/p5-URI|security/p5-Digest-HMAC|security/p5-Digest-MD5|security/p5-Digest-Nilsimsa|security/p5-Digest-SHA1|textproc/p5-FreeBSD-Ports|textproc/p5-Text-Balanced|textproc/p5-Text-Template|textproc/p5-YAML|www/p5-CGI-Application|www/p5-CGI-Kwiki|www/p5-CGI-modules|www/p5-CGI-Session|www/p5-CGI.pm|www/p5-HTML-Parser|www/p5-HTML-Tagset|www/p5-HTML-Template|www/p5-HTML-Tree|www/p5-libwww|www/p5-Template-Toolkit|x11-toolkits/p5-Tk|x11/p5-X11-Protocol)"

MaxTries=10  # maximum number of prefix-factoring passes
Verbose=0    # 1: print lengths after each pass, 2: also print the expression

def main():
    regex=ExampleRegex
    terms=regex.strip('()').split('|')
    exp=reduce_brutish(terms)
    print exp

def reduce_brutish(terms):
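    """
    Repeatedly factor out common prefixes, rebuilding the alternation
    each pass, until the expression would grow again or MaxTries passes
    have run; return the shortest expression seen.
    """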
    last="(%s)"%('|'.join(terms))
    for i in range(MaxTries):
        commons=find_common_prefixes(terms)
        terms=prefix_terms(commons)
        exp="(%s)"%('|'.join(terms))
        l=len(exp)
        ll=len(last)
        if Verbose:
            print "len:%d (prev:%d)"%(l,ll)
            if Verbose>1:
                print exp
        if l>ll:
            exp=last
            break
        last=exp
    return exp

def find_common_prefixes(terms):
    """
    return a dict containing {prefix:[remainder,...]} mappings
    singletons will always show up as {term:[""]}
    """
    commons={}
    for t in terms:
        i=terms.index(t)
        l=0
        cc=""
        for ot in terms[:i]+terms[i+1:]:
            c=common(t,ot)
            if len(c)>l:
                l=len(c)
                cc=c
        rest=t[l:]
        if cc=="":
            commons[t]=[""]
        elif commons.has_key(cc):
            commons[cc].append(rest)
        else:
            commons[cc]=[rest]
    # collapse groups that ended up with a single member back into plain
    # terms; iterate over a copy of the keys since entries are deleted
    # and added below
    for k in commons.keys():
        if len(commons[k])==1:
            term=k+commons[k][0]
            del(commons[k])
            commons[term]=[""]
    return commons

def prefix_terms(commons):
    """
    turn a common prefixes dict into a list of terms
    with the form prefix(rest1|rest2|...)
    """
    terms_out=[]
    for k in commons:
        rests=commons[k]
        if len(rests)==1: #singleton
            term="%s%s"%(k,rests[0])
        else:
            term="%s(%s)"%(k,'|'.join(commons[k]))
        terms_out.append(term)
    terms_out.sort()
    return terms_out

def diverge(a,b):
    "find index of divergence of 2 similar strings"
    l=min(len(a),len(b))
    for i in range(l):
        if a[i]!=b[i]:
            return i
    return l

def common(a,b):
    """
    return the common substring of 2 strings with similar beginnings
    end also at '('
    """
    if len(a)>len(b):
        shorter=b
    else:
        shorter=a
    for i in range(len(shorter)):
        if a[i]!=b[i] or a[i]=='(':
            return a[:i]
    return shorter

def before_and_after(terms):
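    # currently unused: split each term around the most common substring,
    # returning (terms that lack it, [(before, after), ...] for the rest)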
    mcs=most_common_substr(terms)
    single=[]
    ba=[]
    for t in terms:
        i=t.find(mcs)
        if i==-1:
            single.append(t)
        else:
            ba.append((t[:i],t[i+len(mcs):]))
    return (single,ba)

def most_common_substr(terms):
    "this function should be far more complex, not an ugly heuristic like this"
    return "p5"

if __name__=='__main__':
    main()