Fixed handling of URL-encoded spine hrefs

This commit is contained in:
Kevin Boone
2022-06-30 15:41:59 +01:00
parent ac4e73fa79
commit 54b41e8715
5 changed files with 56 additions and 3 deletions

View File

@@ -244,6 +244,7 @@ covered.
Date | Change
-----|-------
?, Jun 2022 | Fixed handling of URL-encoded spine hrefs
2.06, Jun 2022 | Fixed bug in invoking unzip
2.05, Apr 2022 | Fixed bug with empty metadata tags
2.04, Apr 2022 | Improved handling of UTF-8 BOMs

View File

@@ -312,7 +312,8 @@ List *epub2txt_get_items (const char *opf, char **error)
char *val2 = r3->attributes[p].value;
if (strcmp (name2, "href") == 0)
{
list_append (ret, strdup (val2));
char *decoded_val2 = decode_url (val2);
list_append (ret, decoded_val2);
}
}
}

View File

@@ -1,12 +1,14 @@
/*============================================================================
epub2txt v2
util.c
Copyright (c)2022 Marco Bonelli, GPL v3.0
Copyright (c)2022 Marco Bonelli, Kevin Boone, GPL v3.0
============================================================================*/
#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <stdlib.h>
#include <ctype.h>
#include <signal.h>
#include <sys/wait.h>
#include "util.h"
@@ -16,6 +18,7 @@
run_command
Run a helper command through fork + execvp, wait for it to finish and return
its status. Log execvp errors, and abort execution if abort_on_error is TRUE.
(Marco Bonelli)
*==========================================================================*/
int run_command (const char *const argv[], BOOL abort_on_error)
{
@@ -39,3 +42,42 @@ int run_command (const char *const argv[], BOOL abort_on_error)
waitpid (pid, &status, 0);
return status;
}
/*==========================================================================
  decode_url_hex_value
  Return the numeric value (0-15) of a single hexadecimal digit, or -1 if
  the character is not a hexadecimal digit. Taking an unsigned char keeps
  bytes >= 0x80 well-defined regardless of the signedness of plain char.
*==========================================================================*/
static int decode_url_hex_value (unsigned char c)
{
  if (c >= '0' && c <= '9')
  {
    return c - '0';
  }
  if (c >= 'a' && c <= 'f')
  {
    return c - 'a' + 10;
  }
  if (c >= 'A' && c <= 'F')
  {
    return c - 'A' + 10;
  }
  return -1;
}

/*==========================================================================
  decode_url
  Decode %xx escape sequences in URL-type strings, and translate '+' to a
  space. A '%' that is not followed by two hexadecimal digits is copied
  through unchanged.

  Returns a newly allocated, NUL-terminated string which will be no longer
  than the input, or NULL if memory could not be allocated. The caller
  must free the result.

  Note that "%00" decodes to a NUL byte, so a C-string consumer will see
  the result end at that point.
  (Kevin Boone)
*==========================================================================*/
char *decode_url (const char *url)
{
  /* Every decoding step consumes at least as many bytes as it produces, so
     the input length plus the terminator is always sufficient. */
  char *ret = malloc (strlen (url) + 1);
  if (ret == NULL)
  {
    return NULL;
  }

  /* Write through an unsigned view so that decoded values above 0x7F are
     stored without relying on signed-char conversion. */
  unsigned char *out = (unsigned char *) ret;
  size_t len = 0;

  while (*url != '\0')
  {
    if (*url == '%')
    {
      /* url[1] is only a terminator at worst; url[2] is inspected only
         when url[1] was a valid digit, so we never read past the NUL. */
      int hi = decode_url_hex_value ((unsigned char) url[1]);
      int lo = (hi >= 0) ? decode_url_hex_value ((unsigned char) url[2]) : -1;
      if (lo >= 0)
      {
        out[len++] = (unsigned char) (16 * hi + lo);
        url += 3;
        continue;
      }
      /* Malformed escape: fall through and copy the '%' literally. */
    }
    else if (*url == '+')
    {
      /* '+' stands for a space. Consume exactly the '+' and move on; the
         generic copy below must not run, or it would overwrite the space
         with the following character (and step past the terminator when
         '+' is the final character).
         NOTE(review): RFC 3986 treats '+' as a literal in path components;
         the '+'-as-space convention comes from form encoding. Kept because
         the original code intended it -- confirm against real spine hrefs. */
      out[len++] = ' ';
      url++;
      continue;
    }

    out[len++] = (unsigned char) *url++;
  }

  out[len] = '\0';
  return ret;
}

View File

@@ -1,7 +1,7 @@
/*============================================================================
epub2txt v2
util.h
Copyright (c)2022 Marco Bonelli, GPL v3.0
Copyright (c)2022 Marco Bonelli, Kevin Boone, GPL v3.0
============================================================================*/
#pragma once
@@ -9,3 +9,7 @@
#include "defs.h"
int run_command (const char *const argv[], BOOL abort_on_error);
/** Decode %xx escape sequences in URL-type strings.
    @param url  NUL-terminated input string; it is only read, not modified.
    @return     A newly allocated decoded copy, which will be no longer
                than the input. The caller must free the resulting string. */
char *decode_url (const char *url);

View File

@@ -530,6 +530,8 @@ WString *xhtml_transform_char (uint32_t c, BOOL to_ascii)
============================================================================*/
WString *xhtml_translate_entity (const WString *entity)
{
/* Program flow in this function is very ugly, and prone to memory
leaks when modified. The whole thing needs to be rewritten */
char out[20];
IN
char *in = wstring_to_utf8 (entity);
@@ -569,8 +571,11 @@ WString *xhtml_translate_entity (const WString *entity)
WString *ret = wstring_create_empty();
wstring_append_c (ret, (uint32_t)v);
OUT
free (s);
free (in);
return ret;
}
free (s);
}
else
{