Improved failure mode with certain corrupt EPUBs

2026-04-25 14:24:56 +02:00 · 2024-08-17 09:43:53 +01:00
parent 67b1308fbd
commit ecc059369c
5 changed files with 49 additions and 32 deletions
--- a/2
+++ b/2
@@ -1,4 +1,4 @@
-VERSION := 2.08
+VERSION := 2.09
 CC      := gcc
 EXTRA_CFLAGS ?= 
 EXTRA_LDLAGS ?= 
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 # epub2txt -- Extract text from EPUB documents  

-Version 2.07, June 2024 
+Version 2.09, August 2024 

 ## What is this? 

@@ -244,6 +244,7 @@ covered.

 Date | Change
 -----|-------
+2.09,&nbsp;Aug&nbsp;2024 | Improved failure mode with certain corrupt EPUBs 
 2.08,&nbsp;Jun&nbsp;2024 | Fixed a memory-management warning 
 ?,&nbsp;Jun&nbsp;2024 | Removed position-independent code attributes from defaults 
 2.07,&nbsp;Jun&nbsp;2024 | Improved clean-up if program killed in a pipe
--- a/2
+++ b/2
@@ -1,7 +1,7 @@
 Format characters include the white space after tokens,
 which doesn't show up with bold, etc., but is still wrong.

-There is additional white-space inserted after a format change is
+There is additional white-space inserted when a format change is
 followed by a numeric HTML entity. This _does_ show up, and it looks
 ugly. It's a format that's rarely used, but really needs fixing. 
 Unfortunately, this will require a radical change to the formatting
--- a/man1/epub2txt.1
+++ b/man1/epub2txt.1
@@ -1,9 +1,9 @@
-.\" Copyright (C) 2013-22 Kevin Boone 
+.\" Copyright (C) 2013-24 Kevin Boone 
 .\" Permission is granted to any individual or institution to use, copy, or
 .\" redistribute this software so long as all of the original files are
 .\" included, and that this copyright notice is retained.
 .\"
-.TH epub2txt 1 "June 2024"
+.TH epub2txt 1 "August 2024"
 .SH NAME
 epub2txt \- Extract text from EPUB documents 
 .SH SYNOPSIS
--- a/src/epub2txt.c
+++ b/src/epub2txt.c
@@ -257,17 +257,26 @@ List *epub2txt_get_items (const char *opf, char **error)
      {
      XMLNode *root = XMLDoc_root (&doc);

-      int i, l = root->n_children;
-      for (i = 0; i < l; i++)
-	{
-	XMLNode *r1 = root->children[i];
-	// Add workaround for bug #4 
-	if (strcmp (r1->tag, "manifest") == 0 || strstr (r1->tag, ":manifest"))
+      int l;
+      if (root)
+        {
+	int i;
+        l = root->n_children;
+	for (i = 0; i < l; i++)
 	  {
-	  manifest = r1;
-	  got_manifest = TRUE;
+	  XMLNode *r1 = root->children[i];
+	  // Add workaround for bug #4 
+	  if (strcmp (r1->tag, "manifest") == 0 || strstr (r1->tag, ":manifest"))
+	    {
+	    manifest = r1;
+	    got_manifest = TRUE;
+	    }
 	  }
-	}
+        }
+      else
+        {
+        log_warning ("'%s' has no root eleemnt -- corrupt EPUB?", opf);
+        }

      if (!got_manifest)
 	{
@@ -278,7 +287,7 @@ List *epub2txt_get_items (const char *opf, char **error)
   
      ret = list_create_strings();

-      for (i = 0; i < l; i++)
+      for (int i = 0; i < l; i++)
 	{
 	XMLNode *r1 = root->children[i];
 	// Add workaround for bug #4
@@ -358,33 +367,40 @@ String *epub2txt_get_root_file (const char *opf, char **error)
    if (XMLDoc_parse_buffer_DOM (buff_cstr, APPNAME, &doc))
      {
      XMLNode *root = XMLDoc_root (&doc);
-      int i, l = root->n_children;
-      for (i = 0; i < l; i++)
-	{
-	XMLNode *r1 = root->children[i];
-	if (strcmp (r1->tag, "rootfiles") == 0)
+      if (root)
+        {
+        int i, l = root->n_children;
+	for (i = 0; i < l; i++)
 	  {
-	  XMLNode *rootfiles = r1;
-	  int i, l = rootfiles->n_children;
-	  for (i = 0; i < l; i++)
+	  XMLNode *r1 = root->children[i];
+	  if (strcmp (r1->tag, "rootfiles") == 0)
 	    {
-	    XMLNode *r1 = rootfiles->children[i];
-	    if (strcmp (r1->tag, "rootfile") == 0)
+	    XMLNode *rootfiles = r1;
+	    int i, l = rootfiles->n_children;
+	    for (i = 0; i < l; i++)
 	      {
-	      int k, nattrs = r1->n_attributes;
-	      for (k = 0; k < nattrs; k++)
+	      XMLNode *r1 = rootfiles->children[i];
+	      if (strcmp (r1->tag, "rootfile") == 0)
 		{
-		char *name = r1->attributes[k].name;
-		char *value = r1->attributes[k].value;
-		if (strcmp (name, "full-path") == 0)
+		int k, nattrs = r1->n_attributes;
+		for (k = 0; k < nattrs; k++)
 		  {
-		  ret = string_create (value);
+		  char *name = r1->attributes[k].name;
+		  char *value = r1->attributes[k].value;
+		  if (strcmp (name, "full-path") == 0)
+		    {
+		    ret = string_create (value);
+		    }
 		  }
 		}
 	      }
 	    }
 	  }
-	}
+        }
+      else
+        {
+        log_warning ("No root element in '%s' -- corrupt EPUB?", opf);
+        }

      if (ret == NULL)
        asprintf (error, "container.xml does not specify a root file");