From eb357f820dfb0485543e052c6b26b4c5275d8f40 Mon Sep 17 00:00:00 2001 From: Paul Faure Date: Tue, 10 Mar 2026 22:52:31 -0400 Subject: [PATCH 1/2] perf(search): use O(1) DocID lookup instead of full search in IndexSpace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the per-file Search() call in IndexSpace with a direct Lookup() using Bleve's DocIDQuery. The old approach parsed a KQL query string, compiled it, and ran a full-text search for each file — taking 600-950ms per file on large indexes. The new approach does an O(1) document lookup by ID and compares mtime/Extracted fields in memory. Co-Authored-By: Claude Opus 4.6 --- .../fix-search-indexspace-docid-lookup.md | 11 ++++ services/search/pkg/engine/bleve.go | 12 ++++ services/search/pkg/engine/engine.go | 1 + services/search/pkg/engine/mocks/engine.go | 58 +++++++++++++++++++ services/search/pkg/search/service.go | 22 +++---- services/search/pkg/search/service_test.go | 2 +- 6 files changed, 95 insertions(+), 11 deletions(-) create mode 100644 changelog/unreleased/fix-search-indexspace-docid-lookup.md diff --git a/changelog/unreleased/fix-search-indexspace-docid-lookup.md b/changelog/unreleased/fix-search-indexspace-docid-lookup.md new file mode 100644 index 00000000000..5af524db4b5 --- /dev/null +++ b/changelog/unreleased/fix-search-indexspace-docid-lookup.md @@ -0,0 +1,11 @@ +Bugfix: Use O(1) document lookup instead of full search during reindexing + +The `IndexSpace` bulk reindexer was using a full KQL search query per file +to check whether re-extraction was needed. On large indexes this query +took 600–950ms each, making a 61,000-file space take ~13.5 hours just to +walk. Replaced the per-file `Search()` call with an O(1) `Lookup()` using +Bleve's `DocIDQuery`, then comparing mtime and extraction status in memory. +This reduces per-file check time from ~800ms to <1ms. + +https://github.com/owncloud/ocis/pull/12096 +https://github.com/owncloud/ocis/issues/12093 diff --git a/services/search/pkg/engine/bleve.go b/services/search/pkg/engine/bleve.go index a83b68f8b03..b1730bde815 100644 --- a/services/search/pkg/engine/bleve.go +++ b/services/search/pkg/engine/bleve.go @@ -315,6 +315,18 @@ func (b *Bleve) Update(id string, mutateFn func(*Resource)) error { return err } +// Lookup retrieves a resource by its document ID using an O(1) DocIDQuery. +// Returns ErrResourceNotFound if the resource is not in the index. +func (b *Bleve) Lookup(id string) (*Resource, error) { + bleveIndex, closeFn, err := b.indexGetter.GetIndex(bleveEngine.ReadOnly(true)) + if err != nil { + return nil, err + } + defer closeFn() + + return b.getResource(bleveIndex, id) +} + // Move updates the resource location and all of its necessary fields. func (b *Bleve) Move(id string, parentid string, target string) error { bleveIndex, closeFn, err := b.indexGetter.GetIndex() diff --git a/services/search/pkg/engine/engine.go b/services/search/pkg/engine/engine.go index c9602a27b5d..c336ab74a96 100644 --- a/services/search/pkg/engine/engine.go +++ b/services/search/pkg/engine/engine.go @@ -23,6 +23,7 @@ type Engine interface { Search(ctx context.Context, req *searchService.SearchIndexRequest) (*searchService.SearchIndexResponse, error) Upsert(id string, r Resource) error Update(id string, mutateFn func(*Resource)) error + Lookup(id string) (*Resource, error) Move(id string, parentid string, target string) error Delete(id string) error Restore(id string) error diff --git a/services/search/pkg/engine/mocks/engine.go b/services/search/pkg/engine/mocks/engine.go index 63f34639e5e..238e61f2b78 100644 --- a/services/search/pkg/engine/mocks/engine.go +++ b/services/search/pkg/engine/mocks/engine.go @@ -125,6 +125,64 @@ func (_c *Engine_DocCount_Call) RunAndReturn(run func() (uint64, error)) *Engine return _c } +// Lookup provides a mock function with given fields: id +func (_m *Engine) Lookup(id string) (*engine.Resource, error) { + ret := _m.Called(id) + + if len(ret) == 0 { + panic("no return value specified for Lookup") + } + + var r0 *engine.Resource + var r1 error + if rf, ok := ret.Get(0).(func(string) (*engine.Resource, error)); ok { + return rf(id) + } + if rf, ok := ret.Get(0).(func(string) *engine.Resource); ok { + r0 = rf(id) + } else { + if ret.Get(0) != nil { + r0 = ret.Get(0).(*engine.Resource) + } + } + + if rf, ok := ret.Get(1).(func(string) error); ok { + r1 = rf(id) + } else { + r1 = ret.Error(1) + } + + return r0, r1 +} + +// Engine_Lookup_Call is a *mock.Call that shadows Run/Return methods with type explicit version for method 'Lookup' +type Engine_Lookup_Call struct { + *mock.Call +} + +// Lookup is a helper method to define mock.On call +// - id string +func (_e *Engine_Expecter) Lookup(id interface{}) *Engine_Lookup_Call { + return &Engine_Lookup_Call{Call: _e.mock.On("Lookup", id)} +} + +func (_c *Engine_Lookup_Call) Run(run func(id string)) *Engine_Lookup_Call { + _c.Call.Run(func(args mock.Arguments) { + run(args[0].(string)) + }) + return _c +} + +func (_c *Engine_Lookup_Call) Return(_a0 *engine.Resource, _a1 error) *Engine_Lookup_Call { + _c.Call.Return(_a0, _a1) + return _c +} + +func (_c *Engine_Lookup_Call) RunAndReturn(run func(string) (*engine.Resource, error)) *Engine_Lookup_Call { + _c.Call.Return(run) + return _c +} + // Move provides a mock function with given fields: id, parentid, target func (_m *Engine) Move(id string, parentid string, target string) error { ret := _m.Called(id, parentid, target) diff --git a/services/search/pkg/search/service.go b/services/search/pkg/search/service.go index 48abbc752bc..578b14f5b9a 100644 --- a/services/search/pkg/search/service.go +++ b/services/search/pkg/search/service.go @@ -462,17 +462,19 @@ func (s *Service) IndexSpace(spaceID *provider.StorageSpaceId) error { } s.logger.Debug().Str("path", ref.Path).Msg("Walking tree") - searchRes, err := s.engine.Search(ownerCtx, &searchsvc.SearchIndexRequest{ - Query: "id:" + storagespace.FormatResourceID(info.Id) + ` mtime>=` + utils.TSToTime(info.Mtime).Format(time.RFC3339Nano) + ` Extracted:true`, - }) - - if err == nil && len(searchRes.Matches) >= 1 { - if info.Type == provider.ResourceType_RESOURCE_TYPE_CONTAINER { - s.logger.Debug().Str("path", ref.Path).Msg("subtree hasn't changed. Skipping.") - return filepath.SkipDir + resourceID := storagespace.FormatResourceID(info.Id) + r, err := s.engine.Lookup(resourceID) + if err == nil && r.Extracted { + fileMtime := utils.TSToTime(info.Mtime) + docMtime, parseErr := time.Parse(time.RFC3339Nano, r.Mtime) + if parseErr == nil && !docMtime.Before(fileMtime) { + if info.Type == provider.ResourceType_RESOURCE_TYPE_CONTAINER { + s.logger.Debug().Str("path", ref.Path).Msg("subtree hasn't changed. Skipping.") + return filepath.SkipDir + } + s.logger.Debug().Str("path", ref.Path).Msg("element hasn't changed. Skipping.") + return nil } - s.logger.Debug().Str("path", ref.Path).Msg("element hasn't changed. Skipping.") - return nil } s.UpsertItem(ref) diff --git a/services/search/pkg/search/service_test.go b/services/search/pkg/search/service_test.go index 6e32914fa84..5f46f8e9f72 100644 --- a/services/search/pkg/search/service_test.go +++ b/services/search/pkg/search/service_test.go @@ -163,7 +163,7 @@ var _ = Describe("Searchprovider", func() { indexClient.On("Upsert", mock.Anything, mock.MatchedBy(func(r engine.Resource) bool { return r.ID == "storageid$spaceid!opaqueid" && r.Path == "./foo.pdf" })).Return(nil) - indexClient.On("Search", mock.Anything, mock.Anything).Return(&searchsvc.SearchIndexResponse{}, nil) + indexClient.On("Lookup", mock.Anything).Return(nil, engine.ErrResourceNotFound) gatewayClient.On("Stat", mock.Anything, mock.MatchedBy(func(sreq *sprovider.StatRequest) bool { return sreq.Ref.ResourceId.StorageId == "storageid" && sreq.Ref.ResourceId.OpaqueId == "spaceid" && From 0d95d40f7c44bde95dfbe671cb1670f53c37367b Mon Sep 17 00:00:00 2001 From: Paul Faure Date: Thu, 12 Mar 2026 20:37:41 -0400 Subject: [PATCH 2/2] docs: add doc comments to Lookup mock call methods Co-Authored-By: Claude Opus 4.6 --- services/search/pkg/engine/mocks/engine.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/services/search/pkg/engine/mocks/engine.go b/services/search/pkg/engine/mocks/engine.go index 238e61f2b78..0efee0b217d 100644 --- a/services/search/pkg/engine/mocks/engine.go +++ b/services/search/pkg/engine/mocks/engine.go @@ -166,6 +166,7 @@ func (_e *Engine_Expecter) Lookup(id interface{}) *Engine_Lookup_Call { return &Engine_Lookup_Call{Call: _e.mock.On("Lookup", id)} } +// Run sets a handler to be called when the Lookup mock is matched. func (_c *Engine_Lookup_Call) Run(run func(id string)) *Engine_Lookup_Call { _c.Call.Run(func(args mock.Arguments) { run(args[0].(string)) @@ -173,11 +174,13 @@ func (_c *Engine_Lookup_Call) Run(run func(id string)) *Engine_Lookup_Call { return _c } +// Return specifies the return values for the Lookup mock. func (_c *Engine_Lookup_Call) Return(_a0 *engine.Resource, _a1 error) *Engine_Lookup_Call { _c.Call.Return(_a0, _a1) return _c } +// RunAndReturn sets a handler that is called and whose return values are used as the mock's return values. func (_c *Engine_Lookup_Call) RunAndReturn(run func(string) (*engine.Resource, error)) *Engine_Lookup_Call { _c.Call.Return(run) return _c