Improved failure mode with certain corrupt EPUBs

This commit is contained in:
Kevin Boone
2024-08-17 09:43:53 +01:00
parent 67b1308fbd
commit ecc059369c
5 changed files with 49 additions and 32 deletions

View File

@@ -1,4 +1,4 @@
VERSION := 2.08
VERSION := 2.09
CC := gcc
EXTRA_CFLAGS ?=
EXTRA_LDLAGS ?=

View File

@@ -1,6 +1,6 @@
# epub2txt -- Extract text from EPUB documents
Version 2.07, June 2024
Version 2.09, August 2024
## What is this?
@@ -244,6 +244,7 @@ covered.
Date | Change
-----|-------
2.09, Aug 2024 | Improved failure mode with certain corrupt EPUBs
2.08, Jun 2024 | Fixed a memory-management warning
?, Jun 2024 | Removed position-independent code attributes from defaults
2.07, Jun 2024 | Improved clean-up if program killed in a pipe

2
TODO
View File

@@ -1,7 +1,7 @@
Format characters include the white space after tokens,
which doesn't show up with bold, etc., but is still wrong.
There is additional white-space inserted after a format change is
There is additional white-space inserted when a format change is
followed by a numeric HTML entity. This _does_ show up, and it looks
ugly. It's a format that's rarely used, but really needs fixing.
Unfortunately, this will require a radical change to the formatting

View File

@@ -1,9 +1,9 @@
.\" Copyright (C) 2013-22 Kevin Boone
.\" Copyright (C) 2013-24 Kevin Boone
.\" Permission is granted to any individual or institution to use, copy, or
.\" redistribute this software so long as all of the original files are
.\" included, and that this copyright notice is retained.
.\"
.TH epub2txt 1 "June 2024"
.TH epub2txt 1 "August 2024"
.SH NAME
epub2txt \- Extract text from EPUB documents
.SH SYNOPSIS

View File

@@ -257,17 +257,26 @@ List *epub2txt_get_items (const char *opf, char **error)
{
XMLNode *root = XMLDoc_root (&doc);
int i, l = root->n_children;
for (i = 0; i < l; i++)
{
XMLNode *r1 = root->children[i];
// Add workaround for bug #4
if (strcmp (r1->tag, "manifest") == 0 || strstr (r1->tag, ":manifest"))
int l;
if (root)
{
int i;
l = root->n_children;
for (i = 0; i < l; i++)
{
manifest = r1;
got_manifest = TRUE;
XMLNode *r1 = root->children[i];
// Add workaround for bug #4
if (strcmp (r1->tag, "manifest") == 0 || strstr (r1->tag, ":manifest"))
{
manifest = r1;
got_manifest = TRUE;
}
}
}
}
else
{
log_warning ("'%s' has no root eleemnt -- corrupt EPUB?", opf);
}
if (!got_manifest)
{
@@ -278,7 +287,7 @@ List *epub2txt_get_items (const char *opf, char **error)
ret = list_create_strings();
for (i = 0; i < l; i++)
for (int i = 0; i < l; i++)
{
XMLNode *r1 = root->children[i];
// Add workaround for bug #4
@@ -358,33 +367,40 @@ String *epub2txt_get_root_file (const char *opf, char **error)
if (XMLDoc_parse_buffer_DOM (buff_cstr, APPNAME, &doc))
{
XMLNode *root = XMLDoc_root (&doc);
int i, l = root->n_children;
for (i = 0; i < l; i++)
{
XMLNode *r1 = root->children[i];
if (strcmp (r1->tag, "rootfiles") == 0)
if (root)
{
int i, l = root->n_children;
for (i = 0; i < l; i++)
{
XMLNode *rootfiles = r1;
int i, l = rootfiles->n_children;
for (i = 0; i < l; i++)
XMLNode *r1 = root->children[i];
if (strcmp (r1->tag, "rootfiles") == 0)
{
XMLNode *r1 = rootfiles->children[i];
if (strcmp (r1->tag, "rootfile") == 0)
XMLNode *rootfiles = r1;
int i, l = rootfiles->n_children;
for (i = 0; i < l; i++)
{
int k, nattrs = r1->n_attributes;
for (k = 0; k < nattrs; k++)
XMLNode *r1 = rootfiles->children[i];
if (strcmp (r1->tag, "rootfile") == 0)
{
char *name = r1->attributes[k].name;
char *value = r1->attributes[k].value;
if (strcmp (name, "full-path") == 0)
int k, nattrs = r1->n_attributes;
for (k = 0; k < nattrs; k++)
{
ret = string_create (value);
char *name = r1->attributes[k].name;
char *value = r1->attributes[k].value;
if (strcmp (name, "full-path") == 0)
{
ret = string_create (value);
}
}
}
}
}
}
}
}
else
{
log_warning ("No root element in '%s' -- corrupt EPUB?", opf);
}
if (ret == NULL)
asprintf (error, "container.xml does not specify a root file");