mirror of
https://github.com/kevinboone/epub2txt2
synced 2026-04-25 14:24:56 +02:00
Fixed handling of URL-encoded spine hrefs
This commit is contained in:
@@ -244,6 +244,7 @@ covered.
|
||||
|
||||
Date | Change
|
||||
-----|-------
|
||||
?, Jun 2022 | Fixed handling of URL-encoded spine hrefs
|
||||
2.06, Jun 2022 | Fixed bug in invoking unzip
|
||||
2.05, Apr 2022 | Fixed bug with empty metadata tags
|
||||
2.04, Apr 2022 | Improved handling of UTF-8 BOMs
|
||||
|
||||
@@ -312,7 +312,8 @@ List *epub2txt_get_items (const char *opf, char **error)
|
||||
char *val2 = r3->attributes[p].value;
|
||||
if (strcmp (name2, "href") == 0)
|
||||
{
|
||||
list_append (ret, strdup (val2));
|
||||
char *decoded_val2 = decode_url (val2);
|
||||
list_append (ret, decoded_val2);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
44
src/util.c
44
src/util.c
@@ -1,12 +1,14 @@
|
||||
/*============================================================================
|
||||
epub2txt v2
|
||||
util.c
|
||||
Copyright (c)2022 Marco Bonelli, GPL v3.0
|
||||
Copyright (c)2022 Marco Bonelli, Kevin Boone, GPL v3.0
|
||||
============================================================================*/
|
||||
|
||||
#include <errno.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <stdlib.h>
|
||||
#include <ctype.h>
|
||||
#include <signal.h>
|
||||
#include <sys/wait.h>
|
||||
#include "util.h"
|
||||
@@ -16,6 +18,7 @@
|
||||
run_command
|
||||
Run a helper command through fork + execvp, wait for it to finish and return
|
||||
its status. Log execvp errors, and abort execution if abort_on_error is TRUE.
|
||||
(Marco Bonelli)
|
||||
*==========================================================================*/
|
||||
int run_command (const char *const argv[], BOOL abort_on_error)
|
||||
{
|
||||
@@ -39,3 +42,42 @@ int run_command (const char *const argv[], BOOL abort_on_error)
|
||||
waitpid (pid, &status, 0);
|
||||
return status;
|
||||
}
|
||||
|
||||
/*==========================================================================
  hex_digit_value
  Return the numeric value (0-15) of a hexadecimal digit character. The
  caller must already have checked the character with isxdigit().
*==========================================================================*/
static int hex_digit_value (unsigned char c)
{
  if (c >= '0' && c <= '9')
    return c - '0';
  if (c >= 'a' && c <= 'f')
    return c - 'a' + 10;
  return c - 'A' + 10;
}

/*==========================================================================
  decode_url
  Decode %xx escapes in URL-type strings, and convert '+' to a space.
  A '%' that is not followed by two hexadecimal digits is copied through
  unchanged.

  Returns a newly-allocated, NUL-terminated string which the caller must
  free. The result is never longer than the input. Returns NULL if url is
  NULL or if memory cannot be allocated.

  Note that a "%00" escape decodes to a NUL byte, so the result will appear
  truncated at that point when treated as a C string.
  (Kevin Boone)
*==========================================================================*/
char *decode_url (const char *url)
{
  if (url == NULL)
    return NULL;

  /* Every escape shrinks the text, so the input length plus one byte for
     the terminator is always enough. */
  char *ret = malloc (strlen (url) + 1);
  if (ret == NULL)
    return NULL;

  size_t len = 0;
  while (*url != '\0')
  {
    /* The casts to unsigned char matter: passing a negative char value to
       isxdigit() is undefined behaviour. isxdigit('\0') is false, so the
       short-circuit evaluation also stops us reading past the terminator. */
    if (*url == '%'
        && isxdigit ((unsigned char) url[1])
        && isxdigit ((unsigned char) url[2]))
    {
      int hi = hex_digit_value ((unsigned char) url[1]);
      int lo = hex_digit_value ((unsigned char) url[2]);
      ret[len++] = (char) (16 * hi + lo);
      url += 3;
    }
    else if (*url == '+')
    {
      /* '+' stands for a space. Each branch consumes its own input, so
         the space is never overwritten by the following character and
         a trailing '+' cannot step past the end of the string. */
      ret[len++] = ' ';
      url++;
    }
    else
    {
      ret[len++] = *url++;
    }
  }
  ret[len] = '\0';

  return ret;
}
|
||||
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
/*============================================================================
|
||||
epub2txt v2
|
||||
util.h
|
||||
Copyright (c)2022 Marco Bonelli, GPL v3.0
|
||||
Copyright (c)2022 Marco Bonelli, Kevin Boone, GPL v3.0
|
||||
============================================================================*/
|
||||
|
||||
#pragma once
|
||||
@@ -9,3 +9,7 @@
|
||||
#include "defs.h"
|
||||
|
||||
/** Run a helper command through fork + execvp, wait for it to finish and
    return its status. Logs execvp errors, and aborts execution if
    abort_on_error is TRUE. */
int run_command (const char *const argv[], BOOL abort_on_error);
|
||||
|
||||
/** Decode %xx in URL-type strings. The caller must free the resulting
|
||||
string, which will be no longer than the input. */
|
||||
char *decode_url (const char *url);
|
||||
|
||||
@@ -530,6 +530,8 @@ WString *xhtml_transform_char (uint32_t c, BOOL to_ascii)
|
||||
============================================================================*/
|
||||
WString *xhtml_translate_entity (const WString *entity)
|
||||
{
|
||||
/* Program flow in this function is very ugly, and prone to memory
|
||||
leaks when modified. The whole thing needs to be rewritten */
|
||||
char out[20];
|
||||
IN
|
||||
char *in = wstring_to_utf8 (entity);
|
||||
@@ -569,8 +571,11 @@ WString *xhtml_translate_entity (const WString *entity)
|
||||
WString *ret = wstring_create_empty();
|
||||
wstring_append_c (ret, (uint32_t)v);
|
||||
OUT
|
||||
free (s);
|
||||
free (in);
|
||||
return ret;
|
||||
}
|
||||
free (s);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user