mirror of
https://github.com/kevinboone/epub2txt2
synced 2026-04-25 14:24:56 +02:00
Added --separator
This commit is contained in:
2
Makefile
2
Makefile
@@ -1,4 +1,4 @@
|
||||
VERSION := 2.10
|
||||
VERSION := 2.11
|
||||
CC := gcc
|
||||
EXTRA_CFLAGS ?=
|
||||
EXTRA_LDLAGS ?=
|
||||
|
||||
236
README.md
236
README.md
@@ -1,84 +1,75 @@
|
||||
# epub2txt -- Extract text from EPUB documents
|
||||
|
||||
Version 2.10, September 2024
|
||||
Version 2.11, December 2024
|
||||
|
||||
## What is this?
|
||||
|
||||
`epub2txt` is a simple command-line utility for
|
||||
extracting text from
|
||||
EPUB documents and, optionally, re-flowing it to fit a text display
|
||||
of a particular number of columns. It is written entirely in ANSI-standard
|
||||
C, and should run on any Unix-like system with a C compiler. It is
|
||||
intended for reading EPUB e-books on embedded systems that can't host a
|
||||
graphical EPUB viewer, or converting such e-books to read on those systems.
|
||||
However, it should be robust enough for other purposes, such as batch
|
||||
indexing of EPUB document collections.
|
||||
`epub2txt` is a simple command-line utility for extracting text from EPUB
|
||||
documents and, optionally, re-flowing it to fit a text display of a particular
|
||||
number of columns. It is written entirely in ANSI-standard C, and should run on
|
||||
any Unix-like system with a C compiler. It is intended for reading EPUB e-books
|
||||
on embedded systems that can't host a graphical EPUB viewer, or converting such
|
||||
e-books to read on those systems. However, it should be robust enough for
|
||||
other purposes, such as batch indexing of EPUB document collections.
|
||||
|
||||
`epub2txt` favours speed and low memory usage over
|
||||
accuracy of rendering. Most of the formatting of the source document
|
||||
will be lost but, with a text-only display, this is likely to be of
|
||||
little consequence.
|
||||
`epub2txt` favours speed and low memory usage over accuracy of rendering. Most
|
||||
of the formatting of the source document will be lost but, with a text-only
|
||||
display, this is likely to be of little consequence.
|
||||
|
||||
This utility is specifically written to have no dependencies on external
|
||||
libraries, except the standard C library, and even on this it makes
|
||||
few demands. It does expect to be able to run an "unzip" command,
|
||||
however. The purpose of minimizing dependencies is to allow the
|
||||
utility to build on embedded systems without needing to build a bunch
|
||||
of dependencies first.
|
||||
libraries, except the standard C library, and even on this it makes few
|
||||
demands. It does expect to be able to run an "unzip" command, however. The
|
||||
purpose of minimizing dependencies is to allow the utility to build on embedded
|
||||
systems without needing to build a bunch of dependencies first.
|
||||
|
||||
`epub2txt` will output UTF8-encoded text by default, but can
|
||||
be told to output ASCII, in which case it will try to convert non-ASCII
|
||||
characters into something displayable if possible.
|
||||
`epub2txt` will output UTF8-encoded text by default, but can be told to output
|
||||
ASCII, in which case it will try to convert non-ASCII characters into something
|
||||
displayable if possible.
|
||||
|
||||
## Differences from epub2txt version 1.x
|
||||
|
||||
`epub2txt` version 2.0 is a more-or-less complete reimplementation,
|
||||
compared to the earlier 1.x releases. Not only has the internal logic been
|
||||
changed to improve multi-byte character support,
|
||||
but the command-line switches have been updated, to make
|
||||
the utility easier to use in the more common scenarios. Some features from 1.x
|
||||
have been ommitted in this new version, since they added complexity and
|
||||
did not seem to be
|
||||
used much.
|
||||
`epub2txt` version 2.0 is a more-or-less complete reimplementation, compared to
|
||||
the earlier 1.x releases. Not only has the internal logic been changed to
|
||||
improve multi-byte character support, but the command-line switches have been
|
||||
updated, to make the utility easier to use in the more common scenarios. Some
|
||||
features from 1.x have been omitted in this new version, since they added
|
||||
complexity and did not seem to be used much.
|
||||
|
||||
* All character processing is now done using 32-bit, rather than 8-bit,
|
||||
characters, so each character requires exactly 32 bits. This makes formatting
|
||||
text that contains non-English characters much easier and, hopefully, more
|
||||
accurate, since the program no longer has to do complicated fiddling with
|
||||
multi-byte characters
|
||||
* Where the source document uses "simple" formatting tags, like
|
||||
`<h1>` for headings and and `<b>`
|
||||
for bold, `epub2txt`
|
||||
will output ANSI highlighting characters if the program is run from a terminal. This feature can be turned off, if required, but will be used by default
|
||||
* `epub2txt` tries to determine the actual width of the
|
||||
terminal and, in most cases, the user should not have to specify it
|
||||
* It is assumed that output should be formatted to fit the width of a terminal, whenever a the program is run from a terminal. To override this behaviour,
|
||||
specify a width of zero (`-w 0`)
|
||||
* There is a new `--raw` switch that does essentially what
|
||||
`--notrim -w 0` did in the previous version, and also implies
|
||||
no ANSI highlighting (`--noansi`).
|
||||
characters, so each character requires exactly 32 bits. This makes formatting
|
||||
text that contains non-English characters much easier and, hopefully, more
|
||||
accurate, since the program no longer has to do complicated fiddling with
|
||||
multi-byte characters
|
||||
* Where the source document uses "simple" formatting tags, like `<h1>` for
|
||||
headings and `<b>` for bold, `epub2txt` will output ANSI highlighting
|
||||
characters if the program is run from a terminal. This feature can be turned
|
||||
off, if required, but will be used by default
|
||||
* `epub2txt` tries to determine the actual width of the terminal and, in most
|
||||
cases, the user should not have to specify it
|
||||
* It is assumed that output should be formatted to fit the width of a terminal,
|
||||
whenever the program is run from a terminal. To override this behaviour,
|
||||
specify a width of zero (`-w 0`)
|
||||
* There is a new `--raw` switch that does essentially what `--notrim -w 0` did
|
||||
in the previous version, and also implies
|
||||
no ANSI highlighting (`--noansi`).
|
||||
|
||||
|
||||
## Prerequisites
|
||||
|
||||
`epub2txt` is intended to run on Linux and other Unix-like
|
||||
systems. It makes use of the common Unix `unzip` utility
|
||||
but has no other dependencies.
|
||||
It builds and runs on Windows under Cygwin, and under the Windows
|
||||
10 Linux subsystem (WSL),
|
||||
but not as a native Windows console application.
|
||||
The system must be set up such that there is a temporary
|
||||
directory at `/tmp` that users can write to, unless the
|
||||
environment variable `TMP` is set, in which case that will
|
||||
be used instead.
|
||||
`epub2txt` is intended to run on Linux and other Unix-like systems. It makes
|
||||
use of the common Unix `unzip` utility but has no other dependencies. It
|
||||
builds and runs on Windows under Cygwin, and under the Windows 10 Linux
|
||||
subsystem (WSL), but not as a native Windows console application. The system
|
||||
must be set up such that there is a temporary directory at `/tmp` that users
|
||||
can write to, unless the environment variable `TMP` is set, in which case that
|
||||
will be used instead.
|
||||
|
||||
## Building and installing
|
||||
|
||||
`epub2txt` is already available for a number of Linux
|
||||
distributions, but to get the latest version it is usually best to
|
||||
build from source. This should be straightforward if `gcc`
|
||||
and `make` are installed.
|
||||
All you should need to do is
|
||||
`epub2txt` is already available for a number of Linux distributions, but to get
|
||||
the latest version it is usually best to build from source. This should be
|
||||
straightforward if `gcc` and `make` are installed. All you should need to do
|
||||
is
|
||||
|
||||
$ make
|
||||
$ sudo make install
|
||||
@@ -90,94 +81,94 @@ For a full list, run `epub2txt --help`.
|
||||
|
||||
`-a, --ascii`
|
||||
|
||||
Reduces multi-byte charaters to 7-bit ASCII if possible. Some very common
|
||||
characters are easily converted, like the various Unicode spaces, which
|
||||
can be converted into plain ASCII spaces. Common accented characters
|
||||
(.e.,g "é" are converted -- for better or worse -- into their
|
||||
non-accented equivalents. Some single-character entities like ©
|
||||
can be coverted into mult-character equivalents, like "(c)". What
|
||||
`epub2txt` <i>won't</i> do is to convert multi-byte
|
||||
characters into single-byte characters in some form of "extended ASCII"
|
||||
character set. Those days are gone, and I'm not going to help bring
|
||||
them back.
|
||||
Reduces multi-byte characters to 7-bit ASCII if possible. Some very common
|
||||
characters are easily converted, like the various Unicode spaces, which can be
|
||||
converted into plain ASCII spaces. Common accented characters (e.g., "é")
|
||||
are converted -- for better or worse -- into their non-accented equivalents.
|
||||
Some single-character entities like © can be converted into
|
||||
multi-character equivalents, like "(c)". What `epub2txt` <i>won't</i> do is to
|
||||
convert multi-byte characters into single-byte characters in some form of
|
||||
"extended ASCII" character set. Those days are gone, and I'm not going to help
|
||||
bring them back. In any case, they're unlikely to display properly in a Linux
|
||||
terminal.
|
||||
|
||||
`-n, --noansi`
|
||||
|
||||
Don't output ANSI terminal highlights. If `epub2txt` is
|
||||
run from a console, it will interpret common HTML formatting in
|
||||
the source document (such as `<h1>` for headings and
|
||||
`<b>` for bold) by outputing
|
||||
ANSI highlight characters. Most (all?) Linux terminals understand
|
||||
these characters, and render the text with some sort of emphasis.
|
||||
In practice, most EPUB authors and converters
|
||||
don't use simple HTML markup (more's the pity), and even simple
|
||||
italic emphasis often uses custom style classes. So in many cases,
|
||||
no ANSI highlights will be seen. Moreover, some text processing
|
||||
utilties, like the common `more`, don't handle them
|
||||
properly. In such cases, use `--noansi` to switch
|
||||
this feature off.
|
||||
Don't output ANSI terminal highlights. If `epub2txt` is run from a console, it
|
||||
will interpret common HTML formatting in the source document (such as `<h1>`
|
||||
for headings and `<b>` for bold) by outputting ANSI highlight characters. Most
|
||||
(all?) Linux terminals understand these characters, and render the text with
|
||||
some sort of emphasis. In practice, most EPUB authors and converters don't use
|
||||
simple HTML markup (more's the pity), and even simple italic emphasis often
|
||||
uses custom style classes. So in many cases, no ANSI highlights will be seen.
|
||||
Moreover, some text processing utilities, like the common `more`, don't handle
|
||||
them properly. In such cases, use `--noansi` to switch this feature off.
|
||||
|
||||
`-r, --raw`
|
||||
|
||||
Don't process text data in any way -- just dump paragraphs of text
|
||||
exactly as they appear
|
||||
in the source document. Because some XHTML tags effectively create a
|
||||
paragraph break, without actually using explicit paragraph divisions,
|
||||
`epub2txt` will output a newline at the end of every such
|
||||
tag when it appears in the EPUB document. Without this treatment,
|
||||
many documents would render as one enormous
|
||||
line of text. However, sequences of empty lines might appear in the output.
|
||||
Don't process text data in any way -- just dump paragraphs of text exactly as
|
||||
they appear in the source document. Because some XHTML tags effectively create
|
||||
a paragraph break, without actually using explicit paragraph divisions,
|
||||
`epub2txt` will output a newline at the end of every such tag when it appears
|
||||
in the EPUB document. Without this treatment, many documents would render as
|
||||
one enormous line of text. However, sequences of empty lines might appear in
|
||||
the output.
|
||||
|
||||
`-s, --separator=text`
|
||||
|
||||
Writes the specified text on its own line before decoding each spine item. This
|
||||
option is intended for users who parse the output of `epub2txt` using scripts,
|
||||
and want to split the output up into chapters. A parser can detect the special
|
||||
text, and start a new chapter.
|
||||
|
||||
Users should be aware, however, that there is no mandatory connection between
|
||||
spine items and chapters. The effect of the `--separator` option will depend on
|
||||
the software used to author the EPUB.
|
||||
|
||||
`-w, --width=N`
|
||||
|
||||
Format the output for a display with N columns. If either the
|
||||
standard input or standard output of the `epub2txt` program
|
||||
is a terminal, the program will try to work out how wide it is. If it
|
||||
can't, it will assume 80 characters. The implication of using
|
||||
standard input to determine terminal width is that
|
||||
`epub2txt` still assumes it must produce fixed-width output,
|
||||
even if output is redirected to some other
|
||||
utility. This makes it possible to use `epub2txt` without
|
||||
specific command-line switches in common modes of operation like:
|
||||
Format the output for a display with N columns. If either the standard input or
|
||||
standard output of the `epub2txt` program is a terminal, the program will try
|
||||
to work out how wide it is. If it can't, it will assume 80 characters. The
|
||||
implication of using standard input to determine terminal width is that
|
||||
`epub2txt` still assumes it must produce fixed-width output, even if output is
|
||||
redirected to some other utility. This makes it possible to use `epub2txt`
|
||||
without specific command-line switches in common modes of operation like:
|
||||
|
||||
$ epub2txt myfile.epub | more
|
||||
|
||||
The `more` utility cannot wrap lines neatly on its own, so
|
||||
disabling line wrapping when `stdout` is redirected would
|
||||
create additional work for the user.
|
||||
|
||||
To turn off line wrapping specify `-w 0`, or `--raw`.
|
||||
The difference between these modes is that `-w 0` still
|
||||
collapses whitespace and mutliple blank lines, whilst `--raw`
|
||||
just outputs all text in the document, exactly as presented.
|
||||
The `more` utility cannot wrap lines neatly on its own, so disabling line
|
||||
wrapping when `stdout` is redirected would create additional work for the user.
|
||||
|
||||
To turn off line wrapping specify `-w 0`, or `--raw`. The difference between
|
||||
these modes is that `-w 0` still collapses whitespace and multiple blank lines,
|
||||
whilst `--raw` just outputs all text in the document, exactly as presented.
|
||||
|
||||
## Hints
|
||||
|
||||
_Make a list of all unique words in an EPUB file, for indexing purposes:_
|
||||
|
||||
$ epub2txt --raw file.epub |tr -cs A-Za-z\' '\n' | tr A-Z a-z |sort|uniq
|
||||
$ epub2txt --raw file.epub |tr -cs A-Za-z\' '\n' | tr A-Z a-z | sort| uniq
|
||||
|
||||
Using `--raw` here speeds things up, as there is no need for
|
||||
`epub2txt` to format the output, if it is just going to
|
||||
be used to make a word list.
|
||||
Using `--raw` here speeds things up, as there is no need for `epub2txt` to
|
||||
format the output, if it is just going to be used to make a word list.
|
||||
|
||||
_Read an EPUB on screen, with left justification_, using ANSI highlight codes
|
||||
for headings, etc., if the document uses simple formatting tags:
|
||||
|
||||
$ epub2txt file.epub | less -RS
|
||||
|
||||
This is a convenient way to read an EPUB document when a graphical
|
||||
viewer is not available. The `-R` switch to `less`
|
||||
tells it to respect ANSI highlight characters, which it can usually do
|
||||
without losing track of how much text is on a line.
|
||||
This is a convenient way to read an EPUB document when a graphical viewer is
|
||||
not available. The `-R` switch to `less` tells it to respect ANSI highlight
|
||||
characters, which it can usually do without losing track of how much text is on
|
||||
a line.
|
||||
|
||||
_Read an EPUB on screen with full (left and right) justification:_
|
||||
|
||||
$ epub2txt file.epub --noansi -w 0 | groff -K utf8 -Tascii | less
|
||||
|
||||
Note that `groff` can't handle ANSI terminal highlight
|
||||
characters as input, so these need to be disabled.
|
||||
Note that `groff` can't handle ANSI terminal highlight characters as input, so
|
||||
these need to be disabled.
|
||||
|
||||
|
||||
## Bugs and limitations
|
||||
@@ -201,10 +192,10 @@ because `epub2txt` does all its own character encoding conversions to avoid
|
||||
creating a dependency on an external library. Doing this for UTF8 is enough
|
||||
work on its own; doing it for arbitrary encodings would be overwhelming.
|
||||
|
||||
The program can't correct errors in encoding, and there are a large number of
|
||||
EPUB documents in public repositories that contain encoding errors. A common
|
||||
problem is spurious use of non-UTF8 8-bit characters, often in documents that
|
||||
have been converted from Microsoft Office applications.
|
||||
The program can't correct errors in encoding, and there are many EPUB documents
|
||||
in public repositories that contain encoding errors. A common problem is
|
||||
spurious use of non-UTF8 8-bit characters, often in documents that have been
|
||||
converted from Microsoft Office applications.
|
||||
|
||||
`epub2txt` does not right-justify text, as there are already many good
|
||||
utilities to do this (e.g., `groff`)
|
||||
@@ -244,6 +235,7 @@ covered.
|
||||
|
||||
Date | Change
|
||||
-----|-------
|
||||
2.11, Dec 2024 | Added '--separator' option
|
||||
2.10, Sep 2024 | Rejected links to documents outside the EPUB
|
||||
2.09, Aug 2024 | Improved failure mode with certain corrupt EPUBs
|
||||
2.08, Jun 2024 | Fixed a memory-management warning
|
||||
|
||||
@@ -64,6 +64,12 @@ is implied. This is the fastest way to extract text, and is appropriate
|
||||
when feeding output to an external formatter such as \fIgroff\fR.
|
||||
.LP
|
||||
.TP
|
||||
.BI -s,\-\-separator=text
|
||||
Write the specified text to stdout before decoding each spine item.
|
||||
This option is intended to help users who want to split the output
|
||||
of \fIepub2txt\fR into chapters using scripts.
|
||||
.LP
|
||||
.TP
|
||||
.BI -w,\-\-width {columns}
|
||||
Format the output to fit into a specified width. If this option
|
||||
is
|
||||
@@ -73,11 +79,11 @@ the width is set to zero or less,
|
||||
then the output is assumed to be of
|
||||
unlimited width. Setting unlimited width can be useful in
|
||||
situations where the output is being processed
|
||||
by another application. between paragraphs).
|
||||
by another application.
|
||||
.LP
|
||||
.TP
|
||||
.BI -v,\-\-version
|
||||
Displays the version and copyright infomation.
|
||||
Displays the version and copyright information.
|
||||
.LP
|
||||
|
||||
.SH AUTHOR AND LEGAL
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
/*============================================================================
|
||||
epub2txt v2
|
||||
epub2txt.c
|
||||
Copyright (c)2020 Kevin Boone, GPL v3.0
|
||||
Copyright (c)2020-2024 Kevin Boone, GPL v3.0
|
||||
============================================================================*/
|
||||
|
||||
#define _GNU_SOURCE
|
||||
@@ -548,6 +548,9 @@ void epub2txt_do_file (const char *file, const Epub2TxtOptions *options,
|
||||
continue;
|
||||
}
|
||||
|
||||
if (options->section_separator)
|
||||
printf ("%s\n", options->section_separator);
|
||||
|
||||
xhtml_file_to_stdout (opf, options, error);
|
||||
}
|
||||
list_destroy (list);
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
/*============================================================================
|
||||
epub2txt v2
|
||||
epub2txt.h
|
||||
Copyright (c)2017-20 Kevin Boone, GPL v3.0
|
||||
Copyright (c)2017-2024 Kevin Boone, GPL v3.0
|
||||
============================================================================*/
|
||||
|
||||
#pragma once
|
||||
@@ -17,6 +17,7 @@ typedef struct _Epub2TxtOptions
|
||||
BOOL meta; // Show metadata
|
||||
BOOL notext; // Don't dump text
|
||||
BOOL calibre; // Show Calibre metadata
|
||||
char *section_separator; // Section separator; may be NULL
|
||||
} Epub2TxtOptions;
|
||||
|
||||
void epub2txt_do_file (const char *file, const Epub2TxtOptions *options,
|
||||
|
||||
34
src/main.c
34
src/main.c
@@ -41,6 +41,7 @@ int main (int argc, char **argv)
|
||||
BOOL meta = FALSE;
|
||||
BOOL notext = FALSE;
|
||||
BOOL calibre = FALSE;
|
||||
char *section_separator = NULL;
|
||||
int width = 80;
|
||||
|
||||
static struct option long_options[] =
|
||||
@@ -53,6 +54,7 @@ int main (int argc, char **argv)
|
||||
{"noansi", no_argument, NULL, 'n'},
|
||||
{"width", required_argument, NULL, 'w'},
|
||||
{"log", required_argument, NULL, 'l'},
|
||||
{"separator", required_argument, NULL, 's'},
|
||||
{"help", no_argument, NULL, 'h'},
|
||||
{"notext", no_argument, NULL, 0},
|
||||
{0, 0, 0, 0}
|
||||
@@ -93,7 +95,7 @@ int main (int argc, char **argv)
|
||||
while (1)
|
||||
{
|
||||
int option_index = 0;
|
||||
opt = getopt_long (argc, argv, "avw:l:nrmch",
|
||||
opt = getopt_long (argc, argv, "avw:l:nrmchs:",
|
||||
long_options, &option_index);
|
||||
|
||||
if (opt == -1) break;
|
||||
@@ -121,6 +123,9 @@ int main (int argc, char **argv)
|
||||
meta = TRUE;
|
||||
else if (strcmp (long_options[option_index].name, "notext") == 0)
|
||||
notext = TRUE;
|
||||
else if (strcmp
|
||||
(long_options[option_index].name, "separator") == 0)
|
||||
section_separator = strdup (optarg);
|
||||
else
|
||||
exit (-1);
|
||||
case 'a':
|
||||
@@ -141,6 +146,8 @@ int main (int argc, char **argv)
|
||||
meta = TRUE; break;
|
||||
case 'w':
|
||||
width = atoi (optarg); break;
|
||||
case 's':
|
||||
section_separator = strdup (optarg); break;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -155,16 +162,17 @@ int main (int argc, char **argv)
|
||||
if (show_help)
|
||||
{
|
||||
printf ("Usage: %s [options] {files...}\n", argv[0]);
|
||||
printf (" -a,--ascii try to output ASCII only\n");
|
||||
printf (" -c,--calibre show Calibre metadata (with -m)\n");
|
||||
printf (" -h,--help show this message\n");
|
||||
printf (" -l,--log=N set log level, 0-4\n");
|
||||
printf (" -m,--meta dump document metadata\n");
|
||||
printf (" -n,--noansi don't output ANSI terminal codes\n");
|
||||
printf (" --notext don't output document body\n");
|
||||
printf (" -r,--raw no formatting at all\n");
|
||||
printf (" -v,--version show version\n");
|
||||
printf (" -w,--width=N set output width\n");
|
||||
printf (" -a,--ascii try to output ASCII only\n");
|
||||
printf (" -c,--calibre show Calibre metadata (with -m)\n");
|
||||
printf (" -h,--help show this message\n");
|
||||
printf (" -l,--log=N set log level, 0-4\n");
|
||||
printf (" -m,--meta dump document metadata\n");
|
||||
printf (" -n,--noansi don't output ANSI terminal codes\n");
|
||||
printf (" --notext don't output document body\n");
|
||||
printf (" -r,--raw no formatting at all\n");
|
||||
printf (" -s,--separator=text section separator text\n");
|
||||
printf (" -v,--version show version\n");
|
||||
printf (" -w,--width=N set output width\n");
|
||||
exit (0);
|
||||
}
|
||||
|
||||
@@ -182,6 +190,7 @@ int main (int argc, char **argv)
|
||||
options.meta = meta;
|
||||
options.notext = notext;
|
||||
options.calibre = calibre;
|
||||
options.section_separator = section_separator;
|
||||
|
||||
if (is_a_tty)
|
||||
options.ansi = TRUE;
|
||||
@@ -208,6 +217,7 @@ int main (int argc, char **argv)
|
||||
}
|
||||
}
|
||||
|
||||
exit (0);
|
||||
if (section_separator) free (section_separator);
|
||||
exit (0);
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user