Merge pull request #12103 from paul43210/fix/search-indexspace-docid-lookup

perf(search): use O(1) DocID lookup instead of full search in IndexSpace
This commit is contained in:
Martin
2026-03-13 09:57:18 +01:00
committed by GitHub
6 changed files with 98 additions and 11 deletions

View File

@@ -0,0 +1,11 @@
Bugfix: Use O(1) document lookup instead of full search during reindexing
The `IndexSpace` bulk reindexer was using a full KQL search query per file
to check whether re-extraction was needed. On large indexes this query
took 600-950ms each, making a 61,000-file space take ~13.5 hours just to
walk. Replaced the per-file `Search()` call with an O(1) `Lookup()` using
Bleve's `DocIDQuery`, then comparing mtime and extraction status in memory.
This reduces per-file check time from ~800ms to <1ms.
https://github.com/owncloud/ocis/pull/12096
https://github.com/owncloud/ocis/issues/12093

View File

@@ -317,6 +317,18 @@ func (b *Bleve) Update(id string, mutateFn func(*Resource)) error {
return err
}
// Lookup fetches the indexed resource stored under the document ID id.
//
// The index is requested from the index getter with the ReadOnly(true)
// option, and the close function handed back by GetIndex is deferred so it
// runs when Lookup returns. The fetch itself is delegated to getResource;
// an error from opening the index or from getResource is returned unchanged.
//
// NOTE(review): callers expect ErrResourceNotFound for documents that are
// not in the index, and the lookup is meant to be a direct DocID query
// rather than a search. Both are properties of getResource, which is not
// shown here; confirm there.
func (b *Bleve) Lookup(id string) (*Resource, error) {
	idx, release, openErr := b.indexGetter.GetIndex(bleveEngine.ReadOnly(true))
	if openErr != nil {
		return nil, openErr
	}
	defer release()

	return b.getResource(idx, id)
}
// Move updates the resource location and all of its necessary fields.
func (b *Bleve) Move(id string, parentid string, target string) error {
bleveIndex, closeFn, err := b.indexGetter.GetIndex()

View File

@@ -23,6 +23,7 @@ type Engine interface {
Search(ctx context.Context, req *searchService.SearchIndexRequest) (*searchService.SearchIndexResponse, error)
Upsert(id string, r Resource) error
Update(id string, mutateFn func(*Resource)) error
Lookup(id string) (*Resource, error)
Move(id string, parentid string, target string) error
Delete(id string) error
Restore(id string) error

View File

@@ -125,6 +125,67 @@ func (_c *Engine_DocCount_Call) RunAndReturn(run func() (uint64, error)) *Engine
return _c
}
// Lookup provides a mock function with given fields: id
//
// The configured return values are resolved as follows:
//   - if the first value is a func(string) (*engine.Resource, error), it is
//     called with id and its results are returned directly;
//   - otherwise the first value is either a func(string) *engine.Resource
//     (called with id), nil, or a *engine.Resource, and the second value is
//     either a func(string) error (called with id) or an error.
//
// It panics when no return value was configured for the call.
func (_m *Engine) Lookup(id string) (*engine.Resource, error) {
	ret := _m.Called(id)

	if len(ret) == 0 {
		panic("no return value specified for Lookup")
	}

	var r0 *engine.Resource
	switch first := ret.Get(0).(type) {
	case func(string) (*engine.Resource, error):
		// A combined handler supplies both results; the second slot is ignored.
		return first(id)
	case func(string) *engine.Resource:
		r0 = first(id)
	case nil:
		// An untyped nil leaves the resource result nil.
	default:
		// Anything else must be a *engine.Resource; a mismatch panics here.
		r0 = first.(*engine.Resource)
	}

	if errFn, ok := ret.Get(1).(func(string) error); ok {
		return r0, errFn(id)
	}
	return r0, ret.Error(1)
}
// Engine_Lookup_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Lookup'.
// The untyped variants stay reachable through the embedded Call field, which is
// what the typed wrappers delegate to.
type Engine_Lookup_Call struct {
*mock.Call
}
// Lookup registers an expectation for the mocked Lookup method and wraps the
// resulting call in the typed Engine_Lookup_Call helper.
//   - id string (or an argument matcher, since the parameter is untyped)
func (_e *Engine_Expecter) Lookup(id interface{}) *Engine_Lookup_Call {
	call := _e.mock.On("Lookup", id)
	return &Engine_Lookup_Call{Call: call}
}
// Run installs a typed handler that is invoked when the Lookup mock is
// matched. The first recorded argument is asserted to be a string and passed
// to run as id. The receiver is returned so calls can be chained.
func (_c *Engine_Lookup_Call) Run(run func(id string)) *Engine_Lookup_Call {
	adapter := func(args mock.Arguments) {
		id := args[0].(string)
		run(id)
	}
	_c.Call.Run(adapter)
	return _c
}
// Return specifies the return values for the Lookup mock.
//   - _a0: the *engine.Resource the mocked Lookup hands back (may be nil)
//   - _a1: the error the mocked Lookup hands back
//
// Both values are forwarded to the embedded mock.Call; the receiver is
// returned so further calls can be chained.
func (_c *Engine_Lookup_Call) Return(_a0 *engine.Resource, _a1 error) *Engine_Lookup_Call {
_c.Call.Return(_a0, _a1)
return _c
}
// RunAndReturn sets a handler that is called and whose return values are used as the mock's return values.
// The handler is stored as the call's only return value; the mocked Lookup
// method recognises a function of this signature in the first return slot,
// invokes it with the id it received and returns its results.
// The receiver is returned so further calls can be chained.
func (_c *Engine_Lookup_Call) RunAndReturn(run func(string) (*engine.Resource, error)) *Engine_Lookup_Call {
_c.Call.Return(run)
return _c
}
// Move provides a mock function with given fields: id, parentid, target
func (_m *Engine) Move(id string, parentid string, target string) error {
ret := _m.Called(id, parentid, target)

View File

@@ -462,17 +462,19 @@ func (s *Service) IndexSpace(spaceID *provider.StorageSpaceId) error {
}
s.logger.Debug().Str("path", ref.Path).Msg("Walking tree")
searchRes, err := s.engine.Search(ownerCtx, &searchsvc.SearchIndexRequest{
Query: "id:" + storagespace.FormatResourceID(info.Id) + ` mtime>=` + utils.TSToTime(info.Mtime).Format(time.RFC3339Nano) + ` Extracted:true`,
})
if err == nil && len(searchRes.Matches) >= 1 {
if info.Type == provider.ResourceType_RESOURCE_TYPE_CONTAINER {
s.logger.Debug().Str("path", ref.Path).Msg("subtree hasn't changed. Skipping.")
return filepath.SkipDir
resourceID := storagespace.FormatResourceID(info.Id)
r, err := s.engine.Lookup(resourceID)
if err == nil && r.Extracted {
fileMtime := utils.TSToTime(info.Mtime)
docMtime, parseErr := time.Parse(time.RFC3339Nano, r.Mtime)
if parseErr == nil && !docMtime.Before(fileMtime) {
if info.Type == provider.ResourceType_RESOURCE_TYPE_CONTAINER {
s.logger.Debug().Str("path", ref.Path).Msg("subtree hasn't changed. Skipping.")
return filepath.SkipDir
}
s.logger.Debug().Str("path", ref.Path).Msg("element hasn't changed. Skipping.")
return nil
}
s.logger.Debug().Str("path", ref.Path).Msg("element hasn't changed. Skipping.")
return nil
}
s.UpsertItem(ref)

View File

@@ -163,7 +163,7 @@ var _ = Describe("Searchprovider", func() {
indexClient.On("Upsert", mock.Anything, mock.MatchedBy(func(r engine.Resource) bool {
return r.ID == "storageid$spaceid!opaqueid" && r.Path == "./foo.pdf"
})).Return(nil)
indexClient.On("Search", mock.Anything, mock.Anything).Return(&searchsvc.SearchIndexResponse{}, nil)
indexClient.On("Lookup", mock.Anything).Return(nil, engine.ErrResourceNotFound)
gatewayClient.On("Stat", mock.Anything, mock.MatchedBy(func(sreq *sprovider.StatRequest) bool {
return sreq.Ref.ResourceId.StorageId == "storageid" &&
sreq.Ref.ResourceId.OpaqueId == "spaceid" &&