mirror of
https://github.com/owncloud/ocis
synced 2026-04-25 17:25:21 +02:00
Merge pull request #12095 from paul43210/fix/search-incomplete-extraction
fix(search): prevent incomplete Tika extractions from blocking re-index
This commit is contained in:
26
changelog/unreleased/fix-search-incomplete-extraction.md
Normal file
26
changelog/unreleased/fix-search-incomplete-extraction.md
Normal file
@@ -0,0 +1,26 @@
|
||||
Bugfix: Prevent incomplete Tika extractions from permanently blocking reindexing
|
||||
|
||||
When Tika returned HTTP 200 but its child processes (OCR, ImageMagick)
|
||||
failed due to resource limits, the search index received metadata but
|
||||
no content. The document was written to Bleve with the correct mtime,
|
||||
and subsequent reindexes skipped it because the id+mtime check passed.
|
||||
This left files permanently stuck as "indexed" with no searchable
|
||||
content.
|
||||
|
||||
Two fixes are applied:
|
||||
|
||||
1. Validate Tika responses: if `MetaRecursive()` returns an empty
|
||||
metadata list, it is now treated as an extraction error so the
|
||||
document is not written to the index.
|
||||
|
||||
2. Add an `Extracted` field to indexed resources. It is set to `true`
|
||||
only after successful extraction. The reindex skip check now requires
|
||||
`Extracted:true`, so incompletely indexed documents are automatically
|
||||
re-processed on the next reindex run.
|
||||
|
||||
Note: for existing search indexes, expect a full re-extraction on the
|
||||
next reindex because documents written before this change lack the
|
||||
`Extracted` field.
|
||||
|
||||
https://github.com/owncloud/ocis/pull/12095
|
||||
https://github.com/owncloud/ocis/issues/12093
|
||||
@@ -87,6 +87,10 @@ func (t Tika) Extract(ctx context.Context, ri *provider.ResourceInfo) (Document,
|
||||
return doc, err
|
||||
}
|
||||
|
||||
if len(metas) == 0 {
|
||||
return doc, fmt.Errorf("tika returned empty metadata for %q", ri.Name)
|
||||
}
|
||||
|
||||
for _, meta := range metas {
|
||||
if title, err := getFirstValue(meta, "title"); err == nil {
|
||||
doc.Title = strings.TrimSpace(fmt.Sprintf("%s %s", doc.Title, title))
|
||||
|
||||
@@ -427,12 +427,13 @@ func (b *Bleve) getResource(bleveIndex bleve.Index, id string) (*Resource, error
|
||||
fields := res.Hits[0].Fields
|
||||
|
||||
return &Resource{
|
||||
ID: getFieldValue[string](fields, "ID"),
|
||||
RootID: getFieldValue[string](fields, "RootID"),
|
||||
Path: getFieldValue[string](fields, "Path"),
|
||||
ParentID: getFieldValue[string](fields, "ParentID"),
|
||||
Type: uint64(getFieldValue[float64](fields, "Type")),
|
||||
Deleted: getFieldValue[bool](fields, "Deleted"),
|
||||
ID: getFieldValue[string](fields, "ID"),
|
||||
RootID: getFieldValue[string](fields, "RootID"),
|
||||
Path: getFieldValue[string](fields, "Path"),
|
||||
ParentID: getFieldValue[string](fields, "ParentID"),
|
||||
Type: uint64(getFieldValue[float64](fields, "Type")),
|
||||
Deleted: getFieldValue[bool](fields, "Deleted"),
|
||||
Extracted: getFieldValue[bool](fields, "Extracted"),
|
||||
Document: content.Document{
|
||||
Name: getFieldValue[string](fields, "Name"),
|
||||
Title: getFieldValue[string](fields, "Title"),
|
||||
|
||||
@@ -34,13 +34,14 @@ type Engine interface {
|
||||
type Resource struct {
|
||||
content.Document
|
||||
|
||||
ID string
|
||||
RootID string
|
||||
Path string
|
||||
ParentID string
|
||||
Type uint64
|
||||
Deleted bool
|
||||
Hidden bool
|
||||
ID string
|
||||
RootID string
|
||||
Path string
|
||||
ParentID string
|
||||
Type uint64
|
||||
Deleted bool
|
||||
Hidden bool
|
||||
Extracted bool
|
||||
}
|
||||
|
||||
func resourceIDtoSearchID(id storageProvider.ResourceId) *searchMessage.ResourceID {
|
||||
|
||||
@@ -463,7 +463,7 @@ func (s *Service) IndexSpace(spaceID *provider.StorageSpaceId) error {
|
||||
s.logger.Debug().Str("path", ref.Path).Msg("Walking tree")
|
||||
|
||||
searchRes, err := s.engine.Search(ownerCtx, &searchsvc.SearchIndexRequest{
|
||||
Query: "id:" + storagespace.FormatResourceID(info.Id) + ` mtime>=` + utils.TSToTime(info.Mtime).Format(time.RFC3339Nano),
|
||||
Query: "id:" + storagespace.FormatResourceID(info.Id) + ` mtime>=` + utils.TSToTime(info.Mtime).Format(time.RFC3339Nano) + ` Extracted:true`,
|
||||
})
|
||||
|
||||
if err == nil && len(searchRes.Matches) >= 1 {
|
||||
@@ -531,9 +531,10 @@ func (s *Service) UpsertItem(ref *provider.Reference) {
|
||||
OpaqueId: stat.Info.Id.SpaceId,
|
||||
SpaceId: stat.Info.Id.SpaceId,
|
||||
}),
|
||||
Path: utils.MakeRelativePath(path),
|
||||
Type: uint64(stat.Info.Type),
|
||||
Document: doc,
|
||||
Path: utils.MakeRelativePath(path),
|
||||
Type: uint64(stat.Info.Type),
|
||||
Document: doc,
|
||||
Extracted: true,
|
||||
}
|
||||
r.Hidden = strings.HasPrefix(r.Path, ".")
|
||||
|
||||
|
||||
Reference in New Issue
Block a user