Re: Length of regex/pattern for "color"
On Thu, Oct 23, 2003 at 03:48:20PM -0400, parv wrote:
> Yes, it is quite possible to shorten the regex as above. That was
> what i was also thinking of yesterday night.
> That would help some, but not quite. Above is an example only for
> 62 perl ports; i have 306 in total.
Yeah.. sounds like you really do need the HUGE_STRING or whatever, no
matter how much the regex is reduced in size.
> Your effort is welcomed too.
Ok.. I did write a small python script that uses a relatively simple
method to reduce the regex length. I have the feeling that its
effectiveness could be improved, perhaps by changing to a character by
character tree or something. Anyway, the problem is harder than I first
thought, so I'll probably just leave it as it is. :-)
Cheerio,
Allister
--
Allister MacLeod <amacleod@xxxxxxxx>
Elen síla lúmenn'omentielvo.
#!/usr/bin/python
import string
ExampleRegex="(archivers/p5-Archive-Tar|archivers/p5-Compress-Zlib|devel/p5-AppConfig|devel/p5-Class-Factory-Util|devel/p5-Class-Singleton|devel/p5-Config-General|devel/p5-Config-Ini|devel/p5-Date-Calc|devel/p5-Date-ICal|devel/p5-Date-ISO|devel/p5-Date-Leapyear|devel/p5-Date-Manip|devel/p5-Date-Pcalc|devel/p5-DateConvert|devel/p5-DateTime|devel/p5-DateTime-Locale|devel/p5-DateTime-TimeZone|devel/p5-ExtUtils-ParseXS|devel/p5-File-Temp|devel/p5-Inline|devel/p5-Inline-CPP|devel/p5-Locale-Maketext|devel/p5-Memoize|devel/p5-Module-Build|devel/p5-Params-Validate|devel/p5-Parse-RecDescent|devel/p5-Storable|devel/p5-Test-Harness|devel/p5-Test-Inline|devel/p5-Test-Simple|devel/p5-Tie-IxHash|devel/p5-Time-HiRes|devel/p5-Time-Local|devel/p5-Time-modules|devel/p5-TimeDate|dns/p5-Net-DNS|mail/p5-Mail-SpamAssassin|mail/p5-Mail-Tools|math/p5-Bit-Vector|misc/p5-I18N-LangTags|net/p5-URI|security/p5-Digest-HMAC|security/p5-Digest-MD5|security/p5-Digest-Nilsimsa|security/p5-Digest-SHA1|textproc/p5-FreeBSD-Ports|textproc/p5-Text-Balanced|textproc/p5-Text-Template|textproc/p5-YAML|www/p5-CGI-Application|www/p5-CGI-Kwiki|www/p5-CGI-modules|www/p5-CGI-Session|www/p5-CGI.pm|www/p5-HTML-Parser|www/p5-HTML-Tagset|www/p5-HTML-Template|www/p5-HTML-Tree|www/p5-libwww|www/p5-Template-Toolkit|x11-toolkits/p5-Tk|x11/p5-X11-Protocol)"
MaxTries=10
Verbose=0
def main():
regex=ExampleRegex
terms=regex.strip('()').split('|')
exp=reduce_brutish(terms)
print exp
def reduce_brutish(terms):
last="(%s)"%('|'.join(terms))
for i in range(MaxTries):
commons=find_common_prefixes(terms)
terms=prefix_terms(commons)
exp="(%s)"%('|'.join(terms))
l=len(exp)
ll=len(last)
if Verbose:
print "len:%d (prev:%d)"%(l,ll)
if Verbose>1:
print exp
if l>ll:
exp=last
break
last=exp
return exp
def find_common_prefixes(terms):
"""
return a dict containing {prefix:[remainder,...]} mappings
singletons will always show up as {term:[""]}
"""
commons={}
for t in terms:
i=terms.index(t)
l=0
cc=""
for ot in terms[:i]+terms[i+1:]:
c=common(t,ot)
if len(c)>l:
l=len(c)
cc=c
rest=t[l:]
if cc=="":
commons[t]=[""]
elif commons.has_key(cc):
commons[cc].append(rest)
else:
commons[cc]=[rest]
for k in commons:
if len(commons[k])==1:
term=k+commons[k][0]
del(commons[k])
commons[term]=[""]
return commons
def prefix_terms(commons):
"""
turn a common prefixes dict into a list of terms
with the form prefix(rest1|rest2|...)
"""
terms_out=[]
for k in commons:
rests=commons[k]
if len(rests)==1: #singleton
term="%s%s"%(k,rests[0])
else:
term="%s(%s)"%(k,'|'.join(commons[k]))
terms_out.append(term)
terms_out.sort()
return terms_out
def diverge(a,b):
"find index of divergence of 2 similar strings"
l=min(len(a),len(b))
for i in range(l):
if a[i]!=b[i]:
return i
return l
def common(a,b):
"""
return the common substring of 2 strings with similar beginnings
end also at '('
"""
if len(a)>len(b):
shorter=b
else:
shorter=a
for i in range(len(shorter)):
if a[i]!=b[i] or a[i]=='(':
return a[:i]
return shorter
def before_and_after(terms):
mcs=most_common_substr(terms)
single=[]
ba=[]
for t in terms:
i=t.find(mcs)
if i==-1:
single.append(t)
else:
ba.append((t[:i],t[i+len(mcs):]))
return (single,ba)
def most_common_substr(terms):
"this function should be far more complex, not an ugly heuristic like this"
return "p5"
if __name__=='__main__':
main()