mirror of
https://github.com/owncloud/ocis
synced 2026-04-25 17:25:21 +02:00
Merge pull request #12119 from paul43210/fix/search-skipdir-reindex
fix(search): always descend into directories during IndexSpace walk
This commit is contained in:

changelog/unreleased/fix-search-skipdir-reindex.md (new file, 11 lines)
@@ -0,0 +1,11 @@
Bugfix: Always descend into directories during space reindexing

The search indexer's `IndexSpace` walk previously used `filepath.SkipDir`
to skip entire directory subtrees when the directory itself was already
indexed. After a failed or interrupted indexing run (e.g. a Tika crash),
this caused thousands of unindexed files to be permanently skipped,
because the parent directory's mtime had not changed. The indexer now
always descends into directories, relying on the O(1) per-file DocID
lookup to skip already-indexed files efficiently.

https://github.com/owncloud/ocis/pull/12119
@@ -469,8 +469,11 @@ func (s *Service) IndexSpace(spaceID *provider.StorageSpaceId) error {
|
||||
docMtime, parseErr := time.Parse(time.RFC3339Nano, r.Mtime)
|
||||
if parseErr == nil && !docMtime.Before(fileMtime) {
|
||||
if info.Type == provider.ResourceType_RESOURCE_TYPE_CONTAINER {
|
||||
s.logger.Debug().Str("path", ref.Path).Msg("subtree hasn't changed. Skipping.")
|
||||
return filepath.SkipDir
|
||||
// Always descend into directories — a directory's
|
||||
// mtime only reflects direct child add/remove, not
|
||||
// whether all children were successfully indexed.
|
||||
// With O(1) DocID lookups the cost is negligible.
|
||||
return nil
|
||||
}
|
||||
s.logger.Debug().Str("path", ref.Path).Msg("element hasn't changed. Skipping.")
|
||||
return nil
|
||||
|
||||
@@ -3,6 +3,9 @@ package search_test
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
gateway "github.com/cs3org/go-cs3apis/cs3/gateway/v1beta1"
|
||||
userv1beta1 "github.com/cs3org/go-cs3apis/cs3/identity/user/v1beta1"
|
||||
@@ -200,6 +203,139 @@ var _ = Describe("Searchprovider", func() {
|
||||
err := s.IndexSpace(&sprovider.StorageSpaceId{OpaqueId: "storageid$spaceid!spaceid"})
|
||||
Expect(err).ShouldNot(HaveOccurred())
|
||||
})
|
||||
|
||||
It("descends into already-indexed directories to index unvisited children", func() {
|
||||
// Simulate a large tree where all directories are already indexed
|
||||
// (mtime matches) but each contains one unindexed file — e.g. after
|
||||
// a Tika crash interrupted a previous reindex run. The fix ensures
|
||||
// we never SkipDir for directories, so children are always visited.
|
||||
//
|
||||
// 10,000 directories: pessimistic-case benchmark confirming the walker
|
||||
// scales linearly even when every directory is already indexed.
|
||||
const numDirs = 10000
|
||||
|
||||
gatewayClient.On("GetUserByClaim", mock.Anything, mock.Anything).Return(&userv1beta1.GetUserByClaimResponse{
|
||||
Status: status.NewOK(context.Background()),
|
||||
User: user,
|
||||
}, nil)
|
||||
extractor.On("Extract", mock.Anything, mock.Anything, mock.Anything).Return(content.Document{}, nil)
|
||||
|
||||
// The indexed mtime that the engine returns for directories.
|
||||
indexedMtime := time.Unix(5000, 0).UTC().Format(time.RFC3339Nano)
|
||||
|
||||
// Build root: a container that lists numDirs child directories.
|
||||
rootInfo := &sprovider.ResourceInfo{
|
||||
Id: &sprovider.ResourceId{StorageId: "storageid", SpaceId: "spaceid", OpaqueId: "spaceid"},
|
||||
Type: sprovider.ResourceType_RESOURCE_TYPE_CONTAINER,
|
||||
Path: ".",
|
||||
Mtime: &typesv1beta1.Timestamp{Seconds: 5000},
|
||||
}
|
||||
dirInfos := make([]*sprovider.ResourceInfo, numDirs)
|
||||
for i := 0; i < numDirs; i++ {
|
||||
dirInfos[i] = &sprovider.ResourceInfo{
|
||||
Id: &sprovider.ResourceId{StorageId: "storageid", SpaceId: "spaceid", OpaqueId: fmt.Sprintf("dir-%d", i)},
|
||||
Type: sprovider.ResourceType_RESOURCE_TYPE_CONTAINER,
|
||||
Path: fmt.Sprintf("dir-%d", i),
|
||||
Mtime: &typesv1beta1.Timestamp{Seconds: 5000},
|
||||
}
|
||||
}
|
||||
|
||||
// Pre-build file infos for each directory (map for O(1) dispatch).
|
||||
fileInfoByDir := make(map[string]*sprovider.ResourceInfo, numDirs)
|
||||
for i := 0; i < numDirs; i++ {
|
||||
dirID := fmt.Sprintf("dir-%d", i)
|
||||
fileInfoByDir[dirID] = &sprovider.ResourceInfo{
|
||||
Id: &sprovider.ResourceId{StorageId: "storageid", SpaceId: "spaceid", OpaqueId: fmt.Sprintf("file-%d", i)},
|
||||
ParentId: &sprovider.ResourceId{StorageId: "storageid", SpaceId: "spaceid", OpaqueId: dirID},
|
||||
Type: sprovider.ResourceType_RESOURCE_TYPE_FILE,
|
||||
Path: fmt.Sprintf("file-%d.pdf", i),
|
||||
Size: 1024,
|
||||
Mtime: &typesv1beta1.Timestamp{Seconds: 5000},
|
||||
}
|
||||
}
|
||||
|
||||
// Build path-to-fileInfo map for O(1) Stat dispatch.
|
||||
fileInfoByPath := make(map[string]*sprovider.ResourceInfo, numDirs)
|
||||
for dirID, fi := range fileInfoByDir {
|
||||
fileInfoByPath["./"+dirID+"/"+fi.Path] = fi
|
||||
}
|
||||
|
||||
// Single Stat mock dispatching via map — avoids 10k separate matchers.
|
||||
gatewayClient.On("Stat", mock.Anything, mock.Anything).Return(
|
||||
func(_ context.Context, sreq *sprovider.StatRequest, _ ...grpc.CallOption) *sprovider.StatResponse {
|
||||
if sreq.Ref.ResourceId != nil && sreq.Ref.ResourceId.OpaqueId == "spaceid" {
|
||||
return &sprovider.StatResponse{
|
||||
Status: status.NewOK(context.Background()),
|
||||
Info: rootInfo,
|
||||
}
|
||||
}
|
||||
if fi, ok := fileInfoByPath[sreq.Ref.Path]; ok {
|
||||
return &sprovider.StatResponse{
|
||||
Status: status.NewOK(context.Background()),
|
||||
Info: fi,
|
||||
}
|
||||
}
|
||||
return &sprovider.StatResponse{Status: status.NewOK(context.Background()), Info: rootInfo}
|
||||
},
|
||||
func(_ context.Context, _ *sprovider.StatRequest, _ ...grpc.CallOption) error {
|
||||
return nil
|
||||
},
|
||||
)
|
||||
|
||||
// Single ListContainer mock dispatching via map.
|
||||
gatewayClient.On("ListContainer", mock.Anything, mock.Anything).Return(
|
||||
func(_ context.Context, req *sprovider.ListContainerRequest, _ ...grpc.CallOption) *sprovider.ListContainerResponse {
|
||||
opaqueID := req.Ref.ResourceId.OpaqueId
|
||||
if opaqueID == "spaceid" {
|
||||
return &sprovider.ListContainerResponse{
|
||||
Status: status.NewOK(context.Background()),
|
||||
Infos: dirInfos,
|
||||
}
|
||||
}
|
||||
if fi, ok := fileInfoByDir[opaqueID]; ok {
|
||||
return &sprovider.ListContainerResponse{
|
||||
Status: status.NewOK(context.Background()),
|
||||
Infos: []*sprovider.ResourceInfo{fi},
|
||||
}
|
||||
}
|
||||
return &sprovider.ListContainerResponse{
|
||||
Status: status.NewOK(context.Background()),
|
||||
Infos: nil,
|
||||
}
|
||||
},
|
||||
func(_ context.Context, _ *sprovider.ListContainerRequest, _ ...grpc.CallOption) error {
|
||||
return nil
|
||||
},
|
||||
)
|
||||
|
||||
// Lookup: root and directories are already indexed with matching mtime.
|
||||
// Use a map for O(1) lookup instead of linear scan (matters at 10k dirs).
|
||||
indexedIDs := make(map[string]bool, numDirs+1)
|
||||
indexedIDs["storageid$spaceid!spaceid"] = true
|
||||
for i := 0; i < numDirs; i++ {
|
||||
indexedIDs[fmt.Sprintf("storageid$spaceid!dir-%d", i)] = true
|
||||
}
|
||||
indexedResource := &engine.Resource{Document: content.Document{Mtime: indexedMtime}, Extracted: true}
|
||||
indexClient.On("Lookup", mock.MatchedBy(func(id string) bool {
|
||||
return indexedIDs[id] || len(id) > 18 // dir-N or file-N IDs
|
||||
})).Return(func(id string) (*engine.Resource, error) {
|
||||
if indexedIDs[id] {
|
||||
return indexedResource, nil
|
||||
}
|
||||
// Files are not indexed.
|
||||
return nil, engine.ErrResourceNotFound
|
||||
})
|
||||
|
||||
// Track Upsert calls — each unindexed file should trigger one.
|
||||
var upsertCount atomic.Int32
|
||||
indexClient.On("Upsert", mock.Anything, mock.Anything).Run(func(_ mock.Arguments) {
|
||||
upsertCount.Add(1)
|
||||
}).Return(nil)
|
||||
|
||||
err := s.IndexSpace(&sprovider.StorageSpaceId{OpaqueId: "storageid$spaceid!spaceid"})
|
||||
Expect(err).ShouldNot(HaveOccurred())
|
||||
Expect(int(upsertCount.Load())).To(Equal(numDirs), "every unindexed file inside an already-indexed directory must be visited")
|
||||
})
|
||||
})
|
||||
|
||||
Describe("UpdateTags", func() {
|
||||
|
||||
Reference in New Issue
Block a user