Commit 4497960b authored by Robert Griesemer's avatar Robert Griesemer

godoc: full text index for whitelisted non-Go files

R=rsc
CC=golang-dev
https://golang.org/cl/4029046
parent c2ea38ac
......@@ -25,7 +25,6 @@ import (
"strings"
"template"
"time"
"utf8"
)
......@@ -56,7 +55,7 @@ var (
// TODO(gri) consider the invariant that goroot always end in '/'
goroot = flag.String("goroot", runtime.GOROOT(), "Go root directory")
testDir = flag.String("testdir", "", "Go root subdirectory - for testing only (faster startups)")
path = flag.String("path", "", "additional package directories (colon-separated)")
pkgPath = flag.String("path", "", "additional package directories (colon-separated)")
filter = flag.String("filter", "", "filter file containing permitted package directory paths")
filterMin = flag.Int("filter_minutes", 0, "filter file update interval in minutes; disabled if <= 0")
filterDelay delayTime // actual filter update interval in minutes; usually filterDelay == filterMin, but filterDelay may back off exponentially
......@@ -80,7 +79,7 @@ var (
func initHandlers() {
fsMap.Init(*path)
fsMap.Init(*pkgPath)
fileServer = http.FileServer(*goroot, "")
cmdHandler = httpHandler{"/cmd/", pathutil.Join(*goroot, "src/cmd"), false}
pkgHandler = httpHandler{"/pkg/", pathutil.Join(*goroot, "src/pkg"), true}
......@@ -768,53 +767,6 @@ func redirect(w http.ResponseWriter, r *http.Request) (redirected bool) {
}
// TODO(gri): Should have a mapping from extension to handler, eventually.
// textExt[x] is true if the extension x indicates a text file, and false otherwise.
var textExt = map[string]bool{
".css": false, // must be served raw
".js": false, // must be served raw
}
func isTextFile(path string) bool {
// if the extension is known, use it for decision making
if isText, found := textExt[pathutil.Ext(path)]; found {
return isText
}
// the extension is not known; read an initial chunk of
// file and check if it looks like correct UTF-8; if it
// does, it's probably a text file
f, err := os.Open(path, os.O_RDONLY, 0)
if err != nil {
return false
}
defer f.Close()
var buf [1024]byte
n, err := f.Read(buf[0:])
if err != nil {
return false
}
s := string(buf[0:n])
n -= utf8.UTFMax // make sure there's enough bytes for a complete unicode char
for i, c := range s {
if i > n {
break
}
if c == 0xFFFD || c < ' ' && c != '\n' && c != '\t' {
// decoding error or control character - not a text file
return false
}
}
// likely a text file
return true
}
func serveTextFile(w http.ResponseWriter, r *http.Request, abspath, relpath, title string) {
src, err := ioutil.ReadFile(abspath)
if err != nil {
......
......@@ -47,7 +47,7 @@ import (
"index/suffixarray"
"io/ioutil"
"os"
pathutil "path"
"path"
"regexp"
"sort"
"strings"
......@@ -430,8 +430,9 @@ func (a *AltWords) filter(s string) *AltWords {
// Indexer
// Adjust these flags as seems best.
const excludeMainPackages = false
const excludeTestFiles = false
const includeNonGoFiles = true
const includeMainPackages = true
const includeTestFiles = true
type IndexResult struct {
......@@ -619,11 +620,14 @@ func pkgName(filename string) string {
}
func (x *Indexer) addFile(filename string) *ast.File {
// addFile adds a file to the index if possible and returns the file set file
// and the file's AST if it was successfully parsed as a Go file. If addFile
// failed (that is, if the file was not added), it returns file == nil.
func (x *Indexer) addFile(filename string, goFile bool) (file *token.File, ast *ast.File) {
// open file
f, err := os.Open(filename, os.O_RDONLY, 0)
if err != nil {
return nil
return
}
defer f.Close()
......@@ -643,59 +647,126 @@ func (x *Indexer) addFile(filename string) *ast.File {
panic("internal error - file base incorrect")
}
// append file contents to x.sources
if _, err := x.sources.ReadFrom(f); err != nil {
x.sources.Truncate(base) // discard possibly added data
return nil // ignore files with I/O errors
}
// append file contents (src) to x.sources
if _, err := x.sources.ReadFrom(f); err == nil {
src := x.sources.Bytes()[base:]
if goFile {
// parse the file and in the process add it to the file set
src := x.sources.Bytes()[base:] // no need to reread the file
file, err := parser.ParseFile(x.fset, filename, src, parser.ParseComments)
if err != nil {
// do not discard the added source code in this case
// because the file has been added to the file set and
// the source size must match the file set base
// TODO(gri): given a FileSet.RemoveFile() one might be
// able to discard the data here (worthwhile?)
return nil // ignore files with (parse) errors
if ast, err = parser.ParseFile(x.fset, filename, src, parser.ParseComments); err == nil {
file = x.fset.File(ast.Pos()) // ast.Pos() is inside the file
return
}
// file has parse errors, and the AST may be incorrect -
// set lines information explicitly and index as ordinary
// text file (cannot fall through to the text case below
// because the file has already been added to the file set
// by the parser)
file = x.fset.File(token.Pos(base)) // token.Pos(base) is inside the file
file.SetLinesForContent(src)
ast = nil
return
}
return file
if isText(src) {
// only add the file to the file set (for the full text index)
file = x.fset.AddFile(filename, x.fset.Base(), len(src))
file.SetLinesForContent(src)
return
}
}
// discard possibly added data
x.sources.Truncate(base - 1) // -1 to remove added byte 0 since no file was added
return
}
// Design note: Using an explicit white list of permitted files for indexing
// makes sure that the important files are included and massively reduces the
// number of files to index. The advantage over a blacklist is that unexpected
// (non-blacklisted) files won't suddenly explode the index.
//
// TODO(gri): We may want to make this list customizable, perhaps via a flag.
// Files are whitelisted if they have a file name or extension
// present as key in whitelisted.
var whitelisted = map[string]bool{
".bash": true,
".c": true,
".css": true,
".go": true,
".goc": true,
".h": true,
".html": true,
".js": true,
".out": true,
".py": true,
".s": true,
".sh": true,
".txt": true,
".xml": true,
"AUTHORS": true,
"CONTRIBUTORS": true,
"LICENSE": true,
"Makefile": true,
"PATENTS": true,
"README": true,
}
// isWhitelisted returns true if a file is on the list
// of "permitted" files for indexing.
func isWhitelisted(filename string) bool {
key := path.Ext(filename)
if key == "" {
// file has no extension - use entire filename
key = filename
}
return whitelisted[key]
}
func (x *Indexer) visitFile(dirname string, f *os.FileInfo) {
if !isGoFile(f) {
if !f.IsRegular() {
return
}
path := pathutil.Join(dirname, f.Name)
if excludeTestFiles && (!isPkgFile(f) || strings.HasPrefix(path, "test/")) {
filename := path.Join(dirname, f.Name)
goFile := false
switch {
case isGoFile(f):
if !includeTestFiles && (!isPkgFile(f) || strings.HasPrefix(filename, "test/")) {
return
}
if !includeMainPackages && pkgName(filename) == "main" {
return
}
goFile = true
if excludeMainPackages && pkgName(path) == "main" {
case !includeNonGoFiles || !isWhitelisted(filename):
return
}
file := x.addFile(path)
file, fast := x.addFile(filename, goFile)
if file == nil {
return
return // addFile failed
}
// we've got a file to index
x.current = x.fset.File(file.Pos()) // file.Pos is in the current file
dir, _ := pathutil.Split(path)
pak := Pak{dir, file.Name.Name}
x.file = &File{path, pak}
ast.Walk(x, file)
if fast != nil {
// we've got a Go file to index
x.current = file
dir, _ := path.Split(filename)
pak := Pak{dir, fast.Name.Name}
x.file = &File{filename, pak}
ast.Walk(x, fast)
}
// update statistics
// (count real file size as opposed to using the padded x.sources.Len())
x.stats.Bytes += x.current.Size()
x.stats.Bytes += file.Size()
x.stats.Files++
x.stats.Lines += x.current.LineCount()
x.stats.Lines += file.LineCount()
}
......
......@@ -15,11 +15,13 @@ import (
"strings"
"sync"
"time"
"utf8"
)
// An RWValue wraps a value and permits mutually exclusive
// access to it and records the time the value was last set.
//
type RWValue struct {
mutex sync.RWMutex
value interface{}
......@@ -107,3 +109,63 @@ func writeFileAtomically(filename string, data []byte) os.Error {
}
return os.Rename(f.Name(), filename)
}
// isText returns true if a significant prefix of s looks like correct UTF-8;
// that is, if it is likely that s is human-readable text.
//
func isText(s []byte) bool {
const max = 1024 // at least utf8.UTFMax
if len(s) > max {
s = s[0:max]
}
for i, c := range string(s) {
if i+utf8.UTFMax > len(s) {
// last char may be incomplete - ignore
break
}
if c == 0xFFFD || c < ' ' && c != '\n' && c != '\t' {
// decoding error or control character - not a text file
return false
}
}
return true
}
// TODO(gri): Should have a mapping from extension to handler, eventually.
// textExt[x] is true if the extension x indicates a text file, and false otherwise.
var textExt = map[string]bool{
".css": false, // must be served raw
".js": false, // must be served raw
}
// isTextFile returns true if the file has a known extension indicating
// a text file, or if a significant chunk of the specified file looks like
// correct UTF-8; that is, if it is likely that the file contains human-
// readable text.
//
func isTextFile(filename string) bool {
// if the extension is known, use it for decision making
if isText, found := textExt[pathutil.Ext(filename)]; found {
return isText
}
// the extension is not known; read an initial chunk
// of the file and check if it looks like text
f, err := os.Open(filename, os.O_RDONLY, 0)
if err != nil {
return false
}
defer f.Close()
var buf [1024]byte
n, err := f.Read(buf[0:])
if err != nil {
return false
}
return isText(buf[0:n])
}
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment