Merge pull request #12095 from paul43210/fix/search-incomplete-extraction

fix(search): prevent incomplete Tika extractions from blocking re-index
This commit is contained in:
Martin
2026-03-10 09:18:11 +01:00
committed by GitHub
5 changed files with 50 additions and 17 deletions

View File

@@ -0,0 +1,26 @@
Bugfix: Prevent incomplete Tika extractions from permanently blocking re-index
When Tika returned HTTP 200 but its child processes (OCR, ImageMagick)
failed due to resource limits, the search index received metadata but
no content. The document was written to Bleve with the correct mtime,
and subsequent re-index runs skipped it because the id+mtime check passed.
This left files permanently stuck as "indexed" with no searchable
content.
Two fixes are applied:
1. Validate Tika responses: if `MetaRecursive()` returns an empty
metadata list, it is now treated as an extraction error so the
document is not written to the index.
2. Add an `Extracted` field to indexed resources. It is set to `true`
only after successful extraction. The re-index skip check now requires
`Extracted:true`, so incompletely indexed documents are automatically
re-processed on the next re-index run.
Note: existing search indexes will trigger a full re-extraction on the
next re-index because documents written before this change lack the
`Extracted` field.
https://github.com/owncloud/ocis/pull/12095
https://github.com/owncloud/ocis/issues/12093

View File

@@ -87,6 +87,10 @@ func (t Tika) Extract(ctx context.Context, ri *provider.ResourceInfo) (Document,
return doc, err
}
if len(metas) == 0 {
return doc, fmt.Errorf("tika returned empty metadata for %q", ri.Name)
}
for _, meta := range metas {
if title, err := getFirstValue(meta, "title"); err == nil {
doc.Title = strings.TrimSpace(fmt.Sprintf("%s %s", doc.Title, title))

View File

@@ -427,12 +427,13 @@ func (b *Bleve) getResource(bleveIndex bleve.Index, id string) (*Resource, error
fields := res.Hits[0].Fields
return &Resource{
ID: getFieldValue[string](fields, "ID"),
RootID: getFieldValue[string](fields, "RootID"),
Path: getFieldValue[string](fields, "Path"),
ParentID: getFieldValue[string](fields, "ParentID"),
Type: uint64(getFieldValue[float64](fields, "Type")),
Deleted: getFieldValue[bool](fields, "Deleted"),
ID: getFieldValue[string](fields, "ID"),
RootID: getFieldValue[string](fields, "RootID"),
Path: getFieldValue[string](fields, "Path"),
ParentID: getFieldValue[string](fields, "ParentID"),
Type: uint64(getFieldValue[float64](fields, "Type")),
Deleted: getFieldValue[bool](fields, "Deleted"),
Extracted: getFieldValue[bool](fields, "Extracted"),
Document: content.Document{
Name: getFieldValue[string](fields, "Name"),
Title: getFieldValue[string](fields, "Title"),

View File

@@ -34,13 +34,14 @@ type Engine interface {
type Resource struct {
content.Document
ID string
RootID string
Path string
ParentID string
Type uint64
Deleted bool
Hidden bool
ID string
RootID string
Path string
ParentID string
Type uint64
Deleted bool
Hidden bool
Extracted bool
}
func resourceIDtoSearchID(id storageProvider.ResourceId) *searchMessage.ResourceID {

View File

@@ -463,7 +463,7 @@ func (s *Service) IndexSpace(spaceID *provider.StorageSpaceId) error {
s.logger.Debug().Str("path", ref.Path).Msg("Walking tree")
searchRes, err := s.engine.Search(ownerCtx, &searchsvc.SearchIndexRequest{
Query: "id:" + storagespace.FormatResourceID(info.Id) + ` mtime>=` + utils.TSToTime(info.Mtime).Format(time.RFC3339Nano),
Query: "id:" + storagespace.FormatResourceID(info.Id) + ` mtime>=` + utils.TSToTime(info.Mtime).Format(time.RFC3339Nano) + ` Extracted:true`,
})
if err == nil && len(searchRes.Matches) >= 1 {
@@ -531,9 +531,10 @@ func (s *Service) UpsertItem(ref *provider.Reference) {
OpaqueId: stat.Info.Id.SpaceId,
SpaceId: stat.Info.Id.SpaceId,
}),
Path: utils.MakeRelativePath(path),
Type: uint64(stat.Info.Type),
Document: doc,
Path: utils.MakeRelativePath(path),
Type: uint64(stat.Info.Type),
Document: doc,
Extracted: true,
}
r.Hidden = strings.HasPrefix(r.Path, ".")