Skip to content
Projects
Groups
Snippets
Help
Loading...
Sign in
Toggle navigation
G
golang
Project
Project
Details
Activity
Cycle Analytics
Repository
Repository
Files
Commits
Branches
Tags
Contributors
Graph
Compare
Charts
Issues
0
Issues
0
List
Board
Labels
Milestones
Merge Requests
0
Merge Requests
0
Wiki
Wiki
Snippets
Snippets
Members
Members
Collapse sidebar
Close sidebar
Activity
Graph
Charts
Create a new issue
Commits
Issue Boards
Open sidebar
go
golang
Commits
46e74660
Commit
46e74660
authored
Jun 27, 2011
by
Russ Cox
Browse files
Options
Browse Files
Download
Email Patches
Plain Diff
exp/regexp/syntax: compiled form
R=r, sam.thorogood, kevlar CC=golang-dev, rsc
https://golang.org/cl/4636046
parent
ebb1566a
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
539 additions
and
0 deletions
+539
-0
Makefile
src/pkg/exp/regexp/syntax/Makefile
+2
-0
compile.go
src/pkg/exp/regexp/syntax/compile.go
+264
-0
prog.go
src/pkg/exp/regexp/syntax/prog.go
+182
-0
prog_test.go
src/pkg/exp/regexp/syntax/prog_test.go
+91
-0
No files found.
src/pkg/exp/regexp/syntax/Makefile
View file @
46e74660
...
...
@@ -6,8 +6,10 @@ include ../../../../Make.inc
TARG
=
exp/regexp/syntax
GOFILES
=
\
compile.go
\
parse.go
\
perl_groups.go
\
prog.go
\
regexp.go
\
include
../../../../Make.pkg
src/pkg/exp/regexp/syntax/compile.go
0 → 100644
View file @
46e74660
package
syntax
import
(
"os"
"unicode"
)
// A patchList is a list of instruction pointers that need to be filled in (patched).
// Because the pointers haven't been filled in yet, we can reuse their storage
// to hold the list. It's kind of sleazy, but works well in practice.
// See http://swtch.com/~rsc/regexp/regexp1.html for inspiration.
//
// These aren't really pointers: they're integers, so we can reinterpret them
// this way without using package unsafe. A value l denotes
// p.inst[l>>1].Out (l&1==0) or .Arg (l&1==1).
// l == 0 denotes the empty list, okay because we start every program
// with a fail instruction, so we'll never want to point at its output link.
type
patchList
uint32
func
(
l
patchList
)
next
(
p
*
Prog
)
patchList
{
i
:=
&
p
.
Inst
[
l
>>
1
]
if
l
&
1
==
0
{
return
patchList
(
i
.
Out
)
}
return
patchList
(
i
.
Arg
)
}
func
(
l
patchList
)
patch
(
p
*
Prog
,
val
uint32
)
{
for
l
!=
0
{
i
:=
&
p
.
Inst
[
l
>>
1
]
if
l
&
1
==
0
{
l
=
patchList
(
i
.
Out
)
i
.
Out
=
val
}
else
{
l
=
patchList
(
i
.
Arg
)
i
.
Arg
=
val
}
}
}
func
(
l1
patchList
)
append
(
p
*
Prog
,
l2
patchList
)
patchList
{
if
l1
==
0
{
return
l2
}
if
l2
==
0
{
return
l1
}
last
:=
l1
for
{
next
:=
last
.
next
(
p
)
if
next
==
0
{
break
}
last
=
next
}
i
:=
&
p
.
Inst
[
last
>>
1
]
if
last
&
1
==
0
{
i
.
Out
=
uint32
(
l2
)
}
else
{
i
.
Arg
=
uint32
(
l2
)
}
return
l1
}
// A frag represents a compiled program fragment.
type
frag
struct
{
i
uint32
// index of first instruction
out
patchList
// where to record end instruction
}
type
compiler
struct
{
p
*
Prog
}
// Compile compiles the regexp into a program to be executed.
func
Compile
(
re
*
Regexp
)
(
*
Prog
,
os
.
Error
)
{
var
c
compiler
c
.
init
()
f
:=
c
.
compile
(
re
)
f
.
out
.
patch
(
c
.
p
,
c
.
inst
(
InstMatch
)
.
i
)
c
.
p
.
Start
=
int
(
f
.
i
)
return
c
.
p
,
nil
}
func
(
c
*
compiler
)
init
()
{
c
.
p
=
new
(
Prog
)
c
.
inst
(
InstFail
)
}
var
anyRuneNotNL
=
[]
int
{
0
,
'\n'
-
1
,
'\n'
-
1
,
unicode
.
MaxRune
}
var
anyRune
=
[]
int
{
0
,
unicode
.
MaxRune
}
func
(
c
*
compiler
)
compile
(
re
*
Regexp
)
frag
{
switch
re
.
Op
{
case
OpNoMatch
:
return
c
.
fail
()
case
OpEmptyMatch
:
return
c
.
nop
()
case
OpLiteral
:
if
len
(
re
.
Rune
)
==
0
{
return
c
.
nop
()
}
var
f
frag
for
j
:=
range
re
.
Rune
{
f1
:=
c
.
rune
(
re
.
Rune
[
j
:
j
+
1
])
if
j
==
0
{
f
=
f1
}
else
{
f
=
c
.
cat
(
f
,
f1
)
}
}
return
f
case
OpCharClass
:
return
c
.
rune
(
re
.
Rune
)
case
OpAnyCharNotNL
:
return
c
.
rune
(
anyRuneNotNL
)
case
OpAnyChar
:
return
c
.
rune
(
anyRune
)
case
OpBeginLine
:
return
c
.
empty
(
EmptyBeginLine
)
case
OpEndLine
:
return
c
.
empty
(
EmptyEndLine
)
case
OpBeginText
:
return
c
.
empty
(
EmptyBeginText
)
case
OpEndText
:
return
c
.
empty
(
EmptyEndText
)
case
OpWordBoundary
:
return
c
.
empty
(
EmptyWordBoundary
)
case
OpNoWordBoundary
:
return
c
.
empty
(
EmptyNoWordBoundary
)
case
OpCapture
:
bra
:=
c
.
cap
(
uint32
(
re
.
Cap
<<
1
))
sub
:=
c
.
compile
(
re
.
Sub
[
0
])
ket
:=
c
.
cap
(
uint32
(
re
.
Cap
<<
1
|
1
))
return
c
.
cat
(
c
.
cat
(
bra
,
sub
),
ket
)
case
OpStar
:
return
c
.
star
(
c
.
compile
(
re
.
Sub
[
0
]),
re
.
Flags
&
NonGreedy
!=
0
)
case
OpPlus
:
return
c
.
plus
(
c
.
compile
(
re
.
Sub
[
0
]),
re
.
Flags
&
NonGreedy
!=
0
)
case
OpQuest
:
return
c
.
quest
(
c
.
compile
(
re
.
Sub
[
0
]),
re
.
Flags
&
NonGreedy
!=
0
)
case
OpConcat
:
if
len
(
re
.
Sub
)
==
0
{
return
c
.
nop
()
}
var
f
frag
for
i
,
sub
:=
range
re
.
Sub
{
if
i
==
0
{
f
=
c
.
compile
(
sub
)
}
else
{
f
=
c
.
cat
(
f
,
c
.
compile
(
sub
))
}
}
return
f
case
OpAlternate
:
var
f
frag
for
_
,
sub
:=
range
re
.
Sub
{
f
=
c
.
alt
(
f
,
c
.
compile
(
sub
))
}
return
f
}
panic
(
"regexp: unhandled case in compile"
)
}
func
(
c
*
compiler
)
inst
(
op
InstOp
)
frag
{
// TODO: impose length limit
f
:=
frag
{
i
:
uint32
(
len
(
c
.
p
.
Inst
))}
c
.
p
.
Inst
=
append
(
c
.
p
.
Inst
,
Inst
{
Op
:
op
})
return
f
}
func
(
c
*
compiler
)
nop
()
frag
{
f
:=
c
.
inst
(
InstNop
)
f
.
out
=
patchList
(
f
.
i
<<
1
)
return
f
}
func
(
c
*
compiler
)
fail
()
frag
{
return
frag
{}
}
func
(
c
*
compiler
)
cap
(
arg
uint32
)
frag
{
f
:=
c
.
inst
(
InstCapture
)
f
.
out
=
patchList
(
f
.
i
<<
1
)
c
.
p
.
Inst
[
f
.
i
]
.
Arg
=
arg
return
f
}
func
(
c
*
compiler
)
cat
(
f1
,
f2
frag
)
frag
{
// concat of failure is failure
if
f1
.
i
==
0
||
f2
.
i
==
0
{
return
frag
{}
}
// TODO: elide nop
f1
.
out
.
patch
(
c
.
p
,
f2
.
i
)
return
frag
{
f1
.
i
,
f2
.
out
}
}
func
(
c
*
compiler
)
alt
(
f1
,
f2
frag
)
frag
{
// alt of failure is other
if
f1
.
i
==
0
{
return
f2
}
if
f2
.
i
==
0
{
return
f1
}
f
:=
c
.
inst
(
InstAlt
)
i
:=
&
c
.
p
.
Inst
[
f
.
i
]
i
.
Out
=
f1
.
i
i
.
Arg
=
f2
.
i
f
.
out
=
f1
.
out
.
append
(
c
.
p
,
f2
.
out
)
return
f
}
func
(
c
*
compiler
)
quest
(
f1
frag
,
nongreedy
bool
)
frag
{
f
:=
c
.
inst
(
InstAlt
)
i
:=
&
c
.
p
.
Inst
[
f
.
i
]
if
nongreedy
{
i
.
Arg
=
f1
.
i
f
.
out
=
patchList
(
f
.
i
<<
1
)
}
else
{
i
.
Out
=
f1
.
i
f
.
out
=
patchList
(
f
.
i
<<
1
|
1
)
}
f
.
out
=
f
.
out
.
append
(
c
.
p
,
f1
.
out
)
return
f
}
func
(
c
*
compiler
)
star
(
f1
frag
,
nongreedy
bool
)
frag
{
f
:=
c
.
inst
(
InstAlt
)
i
:=
&
c
.
p
.
Inst
[
f
.
i
]
if
nongreedy
{
i
.
Arg
=
f1
.
i
f
.
out
=
patchList
(
f
.
i
<<
1
)
}
else
{
i
.
Out
=
f1
.
i
f
.
out
=
patchList
(
f
.
i
<<
1
|
1
)
}
f1
.
out
.
patch
(
c
.
p
,
f
.
i
)
return
f
}
func
(
c
*
compiler
)
plus
(
f1
frag
,
nongreedy
bool
)
frag
{
return
frag
{
f1
.
i
,
c
.
star
(
f1
,
nongreedy
)
.
out
}
}
func
(
c
*
compiler
)
empty
(
op
EmptyOp
)
frag
{
f
:=
c
.
inst
(
InstEmptyWidth
)
c
.
p
.
Inst
[
f
.
i
]
.
Arg
=
uint32
(
op
)
f
.
out
=
patchList
(
f
.
i
<<
1
)
return
f
}
func
(
c
*
compiler
)
rune
(
rune
[]
int
)
frag
{
f
:=
c
.
inst
(
InstRune
)
c
.
p
.
Inst
[
f
.
i
]
.
Rune
=
rune
f
.
out
=
patchList
(
f
.
i
<<
1
)
return
f
}
src/pkg/exp/regexp/syntax/prog.go
0 → 100644
View file @
46e74660
package
syntax
import
(
"bytes"
"strconv"
)
// Compiled program.
// May not belong in this package, but convenient for now.
// A Prog is a compiled regular expression program.
type
Prog
struct
{
Inst
[]
Inst
Start
int
// index of start instruction
}
// An InstOp is an instruction opcode.
type
InstOp
uint8
const
(
InstAlt
InstOp
=
iota
InstAltMatch
InstCapture
InstEmptyWidth
InstMatch
InstFail
InstNop
InstRune
)
// An EmptyOp specifies a kind or mixture of zero-width assertions.
type
EmptyOp
uint8
const
(
EmptyBeginLine
EmptyOp
=
1
<<
iota
EmptyEndLine
EmptyBeginText
EmptyEndText
EmptyWordBoundary
EmptyNoWordBoundary
)
// An Inst is a single instruction in a regular expression program.
type
Inst
struct
{
Op
InstOp
Out
uint32
// all but InstMatch, InstFail
Arg
uint32
// InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
Rune
[]
int
}
func
(
p
*
Prog
)
String
()
string
{
var
b
bytes
.
Buffer
dumpProg
(
&
b
,
p
)
return
b
.
String
()
}
// MatchRune returns true if the instruction matches (and consumes) r.
// It should only be called when i.Op == InstRune.
func
(
i
*
Inst
)
MatchRune
(
r
int
)
bool
{
rune
:=
i
.
Rune
// Special case: single-rune slice is from literal string, not char class.
// TODO: Case folding.
if
len
(
rune
)
==
1
{
return
r
==
rune
[
0
]
}
// Peek at the first few pairs.
// Should handle ASCII well.
for
j
:=
0
;
j
<
len
(
rune
)
&&
j
<=
8
;
j
+=
2
{
if
r
<
rune
[
j
]
{
return
false
}
if
r
<=
rune
[
j
+
1
]
{
return
true
}
}
// Otherwise binary search.
lo
:=
0
hi
:=
len
(
rune
)
/
2
for
lo
<
hi
{
m
:=
lo
+
(
hi
-
lo
)
/
2
if
c
:=
rune
[
2
*
m
];
c
<=
r
{
if
r
<=
rune
[
2
*
m
+
1
]
{
return
true
}
lo
=
m
+
1
}
else
{
hi
=
m
}
}
return
false
}
// As per re2's Prog::IsWordChar. Determines whether rune is an ASCII word char.
// Since we act on runes, it would be easy to support Unicode here.
func
wordRune
(
rune
int
)
bool
{
return
rune
==
'_'
||
(
'A'
<=
rune
&&
rune
<=
'Z'
)
||
(
'a'
<=
rune
&&
rune
<=
'z'
)
||
(
'0'
<=
rune
&&
rune
<=
'9'
)
}
// MatchEmptyWidth returns true if the instruction matches
// an empty string between the runes before and after.
// It should only be called when i.Op == InstEmptyWidth.
func
(
i
*
Inst
)
MatchEmptyWidth
(
before
int
,
after
int
)
bool
{
switch
EmptyOp
(
i
.
Arg
)
{
case
EmptyBeginLine
:
return
before
==
'\n'
||
before
==
-
1
case
EmptyEndLine
:
return
after
==
'\n'
||
after
==
-
1
case
EmptyBeginText
:
return
before
==
-
1
case
EmptyEndText
:
return
after
==
-
1
case
EmptyWordBoundary
:
return
wordRune
(
before
)
!=
wordRune
(
after
)
case
EmptyNoWordBoundary
:
return
wordRune
(
before
)
==
wordRune
(
after
)
}
panic
(
"unknown empty width arg"
)
}
func
(
i
*
Inst
)
String
()
string
{
var
b
bytes
.
Buffer
dumpInst
(
&
b
,
i
)
return
b
.
String
()
}
func
bw
(
b
*
bytes
.
Buffer
,
args
...
string
)
{
for
_
,
s
:=
range
args
{
b
.
WriteString
(
s
)
}
}
func
dumpProg
(
b
*
bytes
.
Buffer
,
p
*
Prog
)
{
for
j
:=
range
p
.
Inst
{
i
:=
&
p
.
Inst
[
j
]
pc
:=
strconv
.
Itoa
(
j
)
if
len
(
pc
)
<
3
{
b
.
WriteString
(
" "
[
len
(
pc
)
:
])
}
if
j
==
p
.
Start
{
pc
+=
"*"
}
bw
(
b
,
pc
,
"
\t
"
)
dumpInst
(
b
,
i
)
bw
(
b
,
"
\n
"
)
}
}
func
u32
(
i
uint32
)
string
{
return
strconv
.
Uitoa64
(
uint64
(
i
))
}
func
dumpInst
(
b
*
bytes
.
Buffer
,
i
*
Inst
)
{
switch
i
.
Op
{
case
InstAlt
:
bw
(
b
,
"alt -> "
,
u32
(
i
.
Out
),
", "
,
u32
(
i
.
Arg
))
case
InstAltMatch
:
bw
(
b
,
"altmatch -> "
,
u32
(
i
.
Out
),
", "
,
u32
(
i
.
Arg
))
case
InstCapture
:
bw
(
b
,
"cap "
,
u32
(
i
.
Arg
),
" -> "
,
u32
(
i
.
Out
))
case
InstEmptyWidth
:
bw
(
b
,
"empty "
,
u32
(
i
.
Arg
),
" -> "
,
u32
(
i
.
Out
))
case
InstMatch
:
bw
(
b
,
"match"
)
case
InstFail
:
bw
(
b
,
"fail"
)
case
InstNop
:
bw
(
b
,
"nop -> "
,
u32
(
i
.
Out
))
case
InstRune
:
if
i
.
Rune
==
nil
{
// shouldn't happen
bw
(
b
,
"rune <nil>"
)
}
bw
(
b
,
"rune "
,
strconv
.
QuoteToASCII
(
string
(
i
.
Rune
)),
" -> "
,
u32
(
i
.
Out
))
}
}
src/pkg/exp/regexp/syntax/prog_test.go
0 → 100644
View file @
46e74660
package
syntax
import
(
"testing"
)
var
compileTests
=
[]
struct
{
Regexp
string
Prog
string
}{
{
"a"
,
` 0 fail
1* rune "a" -> 2
2 match
`
},
{
"[A-M][n-z]"
,
` 0 fail
1* rune "AM" -> 2
2 rune "nz" -> 3
3 match
`
},
{
""
,
` 0 fail
1* nop -> 2
2 match
`
},
{
"a?"
,
` 0 fail
1 rune "a" -> 3
2* alt -> 1, 3
3 match
`
},
{
"a??"
,
` 0 fail
1 rune "a" -> 3
2* alt -> 3, 1
3 match
`
},
{
"a+"
,
` 0 fail
1* rune "a" -> 2
2 alt -> 1, 3
3 match
`
},
{
"a+?"
,
` 0 fail
1* rune "a" -> 2
2 alt -> 3, 1
3 match
`
},
{
"a*"
,
` 0 fail
1 rune "a" -> 2
2* alt -> 1, 3
3 match
`
},
{
"a*?"
,
` 0 fail
1 rune "a" -> 2
2* alt -> 3, 1
3 match
`
},
{
"a+b+"
,
` 0 fail
1* rune "a" -> 2
2 alt -> 1, 3
3 rune "b" -> 4
4 alt -> 3, 5
5 match
`
},
{
"(a+)(b+)"
,
` 0 fail
1* cap 2 -> 2
2 rune "a" -> 3
3 alt -> 2, 4
4 cap 3 -> 5
5 cap 4 -> 6
6 rune "b" -> 7
7 alt -> 6, 8
8 cap 5 -> 9
9 match
`
},
{
"a+|b+"
,
` 0 fail
1 rune "a" -> 2
2 alt -> 1, 6
3 rune "b" -> 4
4 alt -> 3, 6
5* alt -> 1, 3
6 match
`
},
}
func
TestCompile
(
t
*
testing
.
T
)
{
for
_
,
tt
:=
range
compileTests
{
re
,
_
:=
Parse
(
tt
.
Regexp
,
Perl
)
p
,
_
:=
Compile
(
re
)
s
:=
p
.
String
()
if
s
!=
tt
.
Prog
{
t
.
Errorf
(
"compiled %#q:
\n
--- have
\n
%s---
\n
--- want
\n
%s---"
,
tt
.
Regexp
,
s
,
tt
.
Prog
)
}
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment