Commit 4497960b authored by Robert Griesemer

godoc: full text index for whitelisted non-Go files

R=rsc
CC=golang-dev
https://golang.org/cl/4029046
parent c2ea38ac
...@@ -25,7 +25,6 @@ import ( ...@@ -25,7 +25,6 @@ import (
"strings" "strings"
"template" "template"
"time" "time"
"utf8"
) )
...@@ -56,7 +55,7 @@ var ( ...@@ -56,7 +55,7 @@ var (
// TODO(gri) consider the invariant that goroot always end in '/' // TODO(gri) consider the invariant that goroot always end in '/'
goroot = flag.String("goroot", runtime.GOROOT(), "Go root directory") goroot = flag.String("goroot", runtime.GOROOT(), "Go root directory")
testDir = flag.String("testdir", "", "Go root subdirectory - for testing only (faster startups)") testDir = flag.String("testdir", "", "Go root subdirectory - for testing only (faster startups)")
path = flag.String("path", "", "additional package directories (colon-separated)") pkgPath = flag.String("path", "", "additional package directories (colon-separated)")
filter = flag.String("filter", "", "filter file containing permitted package directory paths") filter = flag.String("filter", "", "filter file containing permitted package directory paths")
filterMin = flag.Int("filter_minutes", 0, "filter file update interval in minutes; disabled if <= 0") filterMin = flag.Int("filter_minutes", 0, "filter file update interval in minutes; disabled if <= 0")
filterDelay delayTime // actual filter update interval in minutes; usually filterDelay == filterMin, but filterDelay may back off exponentially filterDelay delayTime // actual filter update interval in minutes; usually filterDelay == filterMin, but filterDelay may back off exponentially
...@@ -80,7 +79,7 @@ var ( ...@@ -80,7 +79,7 @@ var (
func initHandlers() { func initHandlers() {
fsMap.Init(*path) fsMap.Init(*pkgPath)
fileServer = http.FileServer(*goroot, "") fileServer = http.FileServer(*goroot, "")
cmdHandler = httpHandler{"/cmd/", pathutil.Join(*goroot, "src/cmd"), false} cmdHandler = httpHandler{"/cmd/", pathutil.Join(*goroot, "src/cmd"), false}
pkgHandler = httpHandler{"/pkg/", pathutil.Join(*goroot, "src/pkg"), true} pkgHandler = httpHandler{"/pkg/", pathutil.Join(*goroot, "src/pkg"), true}
...@@ -768,53 +767,6 @@ func redirect(w http.ResponseWriter, r *http.Request) (redirected bool) { ...@@ -768,53 +767,6 @@ func redirect(w http.ResponseWriter, r *http.Request) (redirected bool) {
} }
// TODO(gri): Should have a mapping from extension to handler, eventually.
// textExt[x] is true if the extension x indicates a text file, and false otherwise.
var textExt = map[string]bool{
".css": false, // must be served raw
".js": false, // must be served raw
}
func isTextFile(path string) bool {
// if the extension is known, use it for decision making
if isText, found := textExt[pathutil.Ext(path)]; found {
return isText
}
// the extension is not known; read an initial chunk of
// file and check if it looks like correct UTF-8; if it
// does, it's probably a text file
f, err := os.Open(path, os.O_RDONLY, 0)
if err != nil {
return false
}
defer f.Close()
var buf [1024]byte
n, err := f.Read(buf[0:])
if err != nil {
return false
}
s := string(buf[0:n])
n -= utf8.UTFMax // make sure there's enough bytes for a complete unicode char
for i, c := range s {
if i > n {
break
}
if c == 0xFFFD || c < ' ' && c != '\n' && c != '\t' {
// decoding error or control character - not a text file
return false
}
}
// likely a text file
return true
}
func serveTextFile(w http.ResponseWriter, r *http.Request, abspath, relpath, title string) { func serveTextFile(w http.ResponseWriter, r *http.Request, abspath, relpath, title string) {
src, err := ioutil.ReadFile(abspath) src, err := ioutil.ReadFile(abspath)
if err != nil { if err != nil {
......
...@@ -47,7 +47,7 @@ import ( ...@@ -47,7 +47,7 @@ import (
"index/suffixarray" "index/suffixarray"
"io/ioutil" "io/ioutil"
"os" "os"
pathutil "path" "path"
"regexp" "regexp"
"sort" "sort"
"strings" "strings"
...@@ -430,8 +430,9 @@ func (a *AltWords) filter(s string) *AltWords { ...@@ -430,8 +430,9 @@ func (a *AltWords) filter(s string) *AltWords {
// Indexer // Indexer
// Adjust these flags as seems best. // Adjust these flags as seems best.
const excludeMainPackages = false const includeNonGoFiles = true
const excludeTestFiles = false const includeMainPackages = true
const includeTestFiles = true
type IndexResult struct { type IndexResult struct {
...@@ -619,11 +620,14 @@ func pkgName(filename string) string { ...@@ -619,11 +620,14 @@ func pkgName(filename string) string {
} }
func (x *Indexer) addFile(filename string) *ast.File { // addFile adds a file to the index if possible and returns the file set file
// and the file's AST if it was successfully parsed as a Go file. If addFile
// failed (that is, if the file was not added), it returns file == nil.
func (x *Indexer) addFile(filename string, goFile bool) (file *token.File, ast *ast.File) {
// open file // open file
f, err := os.Open(filename, os.O_RDONLY, 0) f, err := os.Open(filename, os.O_RDONLY, 0)
if err != nil { if err != nil {
return nil return
} }
defer f.Close() defer f.Close()
...@@ -643,59 +647,126 @@ func (x *Indexer) addFile(filename string) *ast.File { ...@@ -643,59 +647,126 @@ func (x *Indexer) addFile(filename string) *ast.File {
panic("internal error - file base incorrect") panic("internal error - file base incorrect")
} }
// append file contents to x.sources // append file contents (src) to x.sources
if _, err := x.sources.ReadFrom(f); err != nil { if _, err := x.sources.ReadFrom(f); err == nil {
x.sources.Truncate(base) // discard possibly added data src := x.sources.Bytes()[base:]
return nil // ignore files with I/O errors
}
if goFile {
// parse the file and in the process add it to the file set // parse the file and in the process add it to the file set
src := x.sources.Bytes()[base:] // no need to reread the file if ast, err = parser.ParseFile(x.fset, filename, src, parser.ParseComments); err == nil {
file, err := parser.ParseFile(x.fset, filename, src, parser.ParseComments) file = x.fset.File(ast.Pos()) // ast.Pos() is inside the file
if err != nil { return
// do not discard the added source code in this case }
// because the file has been added to the file set and // file has parse errors, and the AST may be incorrect -
// the source size must match the file set base // set lines information explicitly and index as ordinary
// TODO(gri): given a FileSet.RemoveFile() one might be // text file (cannot fall through to the text case below
// able to discard the data here (worthwhile?) // because the file has already been added to the file set
return nil // ignore files with (parse) errors // by the parser)
file = x.fset.File(token.Pos(base)) // token.Pos(base) is inside the file
file.SetLinesForContent(src)
ast = nil
return
} }
return file if isText(src) {
// only add the file to the file set (for the full text index)
file = x.fset.AddFile(filename, x.fset.Base(), len(src))
file.SetLinesForContent(src)
return
}
}
// discard possibly added data
x.sources.Truncate(base - 1) // -1 to remove added byte 0 since no file was added
return
}
// Design note: Using an explicit white list of permitted files for indexing
// makes sure that the important files are included and massively reduces the
// number of files to index. The advantage over a blacklist is that unexpected
// (non-blacklisted) files won't suddenly explode the index.
//
// TODO(gri): We may want to make this list customizable, perhaps via a flag.
// Files are whitelisted if they have a file name or extension
// present as key in whitelisted.
var whitelisted = map[string]bool{
	".bash":        true,
	".c":           true,
	".css":         true,
	".go":          true,
	".goc":         true,
	".h":           true,
	".html":        true,
	".js":          true,
	".out":         true,
	".py":          true,
	".s":           true,
	".sh":          true,
	".txt":         true,
	".xml":         true,
	"AUTHORS":      true,
	"CONTRIBUTORS": true,
	"LICENSE":      true,
	"Makefile":     true,
	"PATENTS":      true,
	"README":       true,
}

// isWhitelisted returns true if a file is on the list
// of "permitted" files for indexing. filename may be a
// bare file name or a slash-separated path; only its
// last element is considered.
func isWhitelisted(filename string) bool {
	key := path.Ext(filename)
	if key == "" {
		// file has no extension - use the file name without its
		// directory, so that entries such as "Makefile" also match
		// when the caller passes a full path
		_, key = path.Split(filename)
	}
	return whitelisted[key]
}
func (x *Indexer) visitFile(dirname string, f *os.FileInfo) { func (x *Indexer) visitFile(dirname string, f *os.FileInfo) {
if !isGoFile(f) { if !f.IsRegular() {
return return
} }
path := pathutil.Join(dirname, f.Name) filename := path.Join(dirname, f.Name)
if excludeTestFiles && (!isPkgFile(f) || strings.HasPrefix(path, "test/")) { goFile := false
switch {
case isGoFile(f):
if !includeTestFiles && (!isPkgFile(f) || strings.HasPrefix(filename, "test/")) {
return
}
if !includeMainPackages && pkgName(filename) == "main" {
return return
} }
goFile = true
if excludeMainPackages && pkgName(path) == "main" { case !includeNonGoFiles || !isWhitelisted(filename):
return return
} }
file := x.addFile(path) file, fast := x.addFile(filename, goFile)
if file == nil { if file == nil {
return return // addFile failed
} }
// we've got a file to index if fast != nil {
x.current = x.fset.File(file.Pos()) // file.Pos is in the current file // we've got a Go file to index
dir, _ := pathutil.Split(path) x.current = file
pak := Pak{dir, file.Name.Name} dir, _ := path.Split(filename)
x.file = &File{path, pak} pak := Pak{dir, fast.Name.Name}
ast.Walk(x, file) x.file = &File{filename, pak}
ast.Walk(x, fast)
}
// update statistics // update statistics
// (count real file size as opposed to using the padded x.sources.Len()) x.stats.Bytes += file.Size()
x.stats.Bytes += x.current.Size()
x.stats.Files++ x.stats.Files++
x.stats.Lines += x.current.LineCount() x.stats.Lines += file.LineCount()
} }
......
...@@ -15,11 +15,13 @@ import ( ...@@ -15,11 +15,13 @@ import (
"strings" "strings"
"sync" "sync"
"time" "time"
"utf8"
) )
// An RWValue wraps a value and permits mutually exclusive // An RWValue wraps a value and permits mutually exclusive
// access to it and records the time the value was last set. // access to it and records the time the value was last set.
//
type RWValue struct { type RWValue struct {
mutex sync.RWMutex mutex sync.RWMutex
value interface{} value interface{}
...@@ -107,3 +109,63 @@ func writeFileAtomically(filename string, data []byte) os.Error { ...@@ -107,3 +109,63 @@ func writeFileAtomically(filename string, data []byte) os.Error {
} }
return os.Rename(f.Name(), filename) return os.Rename(f.Name(), filename)
} }
// isText returns true if a significant prefix of s looks like correct UTF-8;
// that is, if it is likely that s is human-readable text. At most the first
// 1024 bytes of s are examined. A prefix is rejected if it contains a
// decoding error or a control character other than newline and tab. A
// multi-byte character that is cut off at the very end of the examined
// bytes is not treated as an error, since s may be a truncated chunk.
//
func isText(s []byte) bool {
	const max = 1024 // at least utf8.UTFMax
	if len(s) > max {
		s = s[0:max]
	}
	for len(s) > 0 {
		c, size := utf8.DecodeRune(s)
		if c == utf8.RuneError && !utf8.FullRune(s) {
			// the remaining bytes are the start of a character whose
			// end is missing - ignore them; everything before has
			// been checked
			break
		}
		if c == utf8.RuneError || c < ' ' && c != '\n' && c != '\t' {
			// decoding error or control character - not a text file
			return false
		}
		s = s[size:]
	}
	return true
}
// TODO(gri): Should have a mapping from extension to handler, eventually.
// textExt[x] is true if the extension x indicates a text file, and false otherwise.
var textExt = map[string]bool{
".css": false, // must be served raw
".js": false, // must be served raw
}
// isTextFile returns true if the file has a known extension indicating
// a text file, or if a significant chunk of the specified file looks like
// correct UTF-8; that is, if it is likely that the file contains human-
// readable text.
//
func isTextFile(filename string) bool {
// if the extension is known, use it for decision making
if isText, found := textExt[pathutil.Ext(filename)]; found {
return isText
}
// the extension is not known; read an initial chunk
// of the file and check if it looks like text
f, err := os.Open(filename, os.O_RDONLY, 0)
if err != nil {
return false
}
defer f.Close()
var buf [1024]byte
n, err := f.Read(buf[0:])
if err != nil {
return false
}
return isText(buf[0:n])
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment