mirror of
https://github.com/kevinboone/epub2txt2
synced 2026-04-25 14:24:56 +02:00
Improved failure mode with certain corrupt EPUBs
This commit is contained in:
2
Makefile
2
Makefile
@@ -1,4 +1,4 @@
|
||||
VERSION := 2.08
|
||||
VERSION := 2.09
|
||||
CC := gcc
|
||||
EXTRA_CFLAGS ?=
|
||||
EXTRA_LDLAGS ?=
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# epub2txt -- Extract text from EPUB documents
|
||||
|
||||
Version 2.07, June 2024
|
||||
Version 2.09, August 2024
|
||||
|
||||
## What is this?
|
||||
|
||||
@@ -244,6 +244,7 @@ covered.
|
||||
|
||||
Date | Change
|
||||
-----|-------
|
||||
2.09, Aug 2024 | Improved failure mode with certain corrupt EPUBs
|
||||
2.08, Jun 2024 | Fixed a memory-management warning
|
||||
?, Jun 2024 | Removed position-independent code attributes from defaults
|
||||
2.07, Jun 2024 | Improved clean-up if program killed in a pipe
|
||||
|
||||
2
TODO
2
TODO
@@ -1,7 +1,7 @@
|
||||
Format characters include the white space after tokens,
|
||||
which doesn't show up with bold, etc., but is still wrong.
|
||||
|
||||
There is additional white-space inserted after a format change is
|
||||
There is additional white-space inserted when a format change is
|
||||
followed by a numeric HTML entity. This _does_ show up, and it looks
|
||||
ugly. It's a format that's rarely used, but really needs fixing.
|
||||
Unfortunately, this will require a radical change to the formatting
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
.\" Copyright (C) 2013-22 Kevin Boone
|
||||
.\" Copyright (C) 2013-24 Kevin Boone
|
||||
.\" Permission is granted to any individual or institution to use, copy, or
|
||||
.\" redistribute this software so long as all of the original files are
|
||||
.\" included, and that this copyright notice is retained.
|
||||
.\"
|
||||
.TH epub2txt 1 "June 2024"
|
||||
.TH epub2txt 1 "August 2024"
|
||||
.SH NAME
|
||||
epub2txt \- Extract text from EPUB documents
|
||||
.SH SYNOPSIS
|
||||
|
||||
@@ -257,17 +257,26 @@ List *epub2txt_get_items (const char *opf, char **error)
|
||||
{
|
||||
XMLNode *root = XMLDoc_root (&doc);
|
||||
|
||||
int i, l = root->n_children;
|
||||
for (i = 0; i < l; i++)
|
||||
{
|
||||
XMLNode *r1 = root->children[i];
|
||||
// Add workaround for bug #4
|
||||
if (strcmp (r1->tag, "manifest") == 0 || strstr (r1->tag, ":manifest"))
|
||||
int l;
|
||||
if (root)
|
||||
{
|
||||
int i;
|
||||
l = root->n_children;
|
||||
for (i = 0; i < l; i++)
|
||||
{
|
||||
manifest = r1;
|
||||
got_manifest = TRUE;
|
||||
XMLNode *r1 = root->children[i];
|
||||
// Add workaround for bug #4
|
||||
if (strcmp (r1->tag, "manifest") == 0 || strstr (r1->tag, ":manifest"))
|
||||
{
|
||||
manifest = r1;
|
||||
got_manifest = TRUE;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
log_warning ("'%s' has no root eleemnt -- corrupt EPUB?", opf);
|
||||
}
|
||||
|
||||
if (!got_manifest)
|
||||
{
|
||||
@@ -278,7 +287,7 @@ List *epub2txt_get_items (const char *opf, char **error)
|
||||
|
||||
ret = list_create_strings();
|
||||
|
||||
for (i = 0; i < l; i++)
|
||||
for (int i = 0; i < l; i++)
|
||||
{
|
||||
XMLNode *r1 = root->children[i];
|
||||
// Add workaround for bug #4
|
||||
@@ -358,33 +367,40 @@ String *epub2txt_get_root_file (const char *opf, char **error)
|
||||
if (XMLDoc_parse_buffer_DOM (buff_cstr, APPNAME, &doc))
|
||||
{
|
||||
XMLNode *root = XMLDoc_root (&doc);
|
||||
int i, l = root->n_children;
|
||||
for (i = 0; i < l; i++)
|
||||
{
|
||||
XMLNode *r1 = root->children[i];
|
||||
if (strcmp (r1->tag, "rootfiles") == 0)
|
||||
if (root)
|
||||
{
|
||||
int i, l = root->n_children;
|
||||
for (i = 0; i < l; i++)
|
||||
{
|
||||
XMLNode *rootfiles = r1;
|
||||
int i, l = rootfiles->n_children;
|
||||
for (i = 0; i < l; i++)
|
||||
XMLNode *r1 = root->children[i];
|
||||
if (strcmp (r1->tag, "rootfiles") == 0)
|
||||
{
|
||||
XMLNode *r1 = rootfiles->children[i];
|
||||
if (strcmp (r1->tag, "rootfile") == 0)
|
||||
XMLNode *rootfiles = r1;
|
||||
int i, l = rootfiles->n_children;
|
||||
for (i = 0; i < l; i++)
|
||||
{
|
||||
int k, nattrs = r1->n_attributes;
|
||||
for (k = 0; k < nattrs; k++)
|
||||
XMLNode *r1 = rootfiles->children[i];
|
||||
if (strcmp (r1->tag, "rootfile") == 0)
|
||||
{
|
||||
char *name = r1->attributes[k].name;
|
||||
char *value = r1->attributes[k].value;
|
||||
if (strcmp (name, "full-path") == 0)
|
||||
int k, nattrs = r1->n_attributes;
|
||||
for (k = 0; k < nattrs; k++)
|
||||
{
|
||||
ret = string_create (value);
|
||||
char *name = r1->attributes[k].name;
|
||||
char *value = r1->attributes[k].value;
|
||||
if (strcmp (name, "full-path") == 0)
|
||||
{
|
||||
ret = string_create (value);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
log_warning ("No root element in '%s' -- corrupt EPUB?", opf);
|
||||
}
|
||||
|
||||
if (ret == NULL)
|
||||
asprintf (error, "container.xml does not specify a root file");
|
||||
|
||||
Reference in New Issue
Block a user