diff --git a/README.md b/README.md
index 391a246..f39e74e 100644
--- a/README.md
+++ b/README.md
@@ -42,6 +42,7 @@ You can get quick insights on a PDF file with these commands:
- `overview` outputs text data about the structure and the metadata.
- `disasm` outputs a dump of the file structure on the terminal.
- `text` outputs extracted text spatially, as if it was a kind of scan.
+- `fonts` outputs a list of the fonts used in the file.
- `browse` outputs static html data that lets you browse the internal structure of the PDF file: the PDF source is pretty-printed and augmented with hyperlinks.
## API overview
diff --git a/docs/cli.html b/docs/cli.html
index 0a98db5..c176fb6 100644
--- a/docs/cli.html
+++ b/docs/cli.html
@@ -27,9 +27,16 @@
overview
disasm
The output shows a terse and greppable view of the file internal structure.Please refer to the Disassembler article for details.
-
text
The output shows a full extract of the text content, with a spatial awareness: the algorithm tries to respect the original layout, as if characters of all sizes were approximately rendered on a fixed-size grid.
+fonts
+The output shows a list of fonts used in the file, with the following tabular data:
+- Name
+- Type
+- Encoding
+- Object number and generation number, comma-separated
+- Number of pages where it occurs
+
browse
The generated HTML looks like the raw PDF file with the following additions:
- Pretty-print dictionary object
diff --git a/docs/cli.md b/docs/cli.md
index 6be2bef..6b975b0 100644
--- a/docs/cli.md
+++ b/docs/cli.md
@@ -20,10 +20,17 @@ The output shows information about:
The output shows a terse and greppable view of the file internal structure.
Please refer to the [Disassembler article](https://github.com/desgeeko/pdfsyntax/blob/main/docs/disassembler.md) for details.
-
### `text`
The output shows a full extract of the text content, with a spatial awareness: the algorithm *tries* to respect the original layout, as if characters of all sizes were approximately rendered on a fixed-size grid.
+### `fonts`
+The output shows a list of fonts used in the file, with the following tabular data:
+- Name
+- Type
+- Encoding
+- Object number and generation number, comma-separated
+- Number of pages where it occurs
+
### `browse`
The generated HTML looks like the raw PDF file with the following additions:
* Pretty-print dictionary object
diff --git a/pdfsyntax/api.py b/pdfsyntax/api.py
index 0bf0597..8fdc52f 100644
--- a/pdfsyntax/api.py
+++ b/pdfsyntax/api.py
@@ -206,19 +206,32 @@ def add_text_annotation(doc: Doc, page_num: int, text: str, rect: list, opened:
def fonts(doc: Doc) -> dict:
- """Return for each font the pages where it appears."""
+ """Return a dict keyed by each font's 'iref' value, holding its name, type, encoding, page indexes and a to_unicode flag."""
ret = {}
nb = number_pages(doc)
font_index = get_page_fonts(doc, list(range(nb)))
for i, page_fonts in enumerate(font_index):
for font in page_fonts:
+ o = page_fonts[font]['iref']  # result key; presumably the font's indirect reference - confirm
n = page_fonts[font]['name'][1:]
t = page_fonts[font]['type'][1:]
- u = page_fonts[font]['to_unicode']
- name = f'{n} ({t})'
- if name not in ret:
- ret[name] = {'pages': [], 'to_unicode': u}
- ret[name]['pages'].append(i)
+ e = page_fonts[font]['encoding']
+ if type(e) == complex:  # NOTE(review): isinstance(e, complex) is the idiomatic check; a complex value is reported as 'other'
+ e = 'other'
+ else:
+ e = e[1:]  # drop the first character (presumably the leading '/' of a PDF name - confirm); assumes e is sliceable, TODO confirm it is never None
+ if page_fonts[font]['to_unicode']:  # collapse the stored value to a plain bool
+ u = True
+ else:
+ u = False
+ if o not in ret:
+ ret[o] = {'name': n,
+ 'type': t,
+ 'encoding': e,
+ 'pages': [],
+ 'to_unicode': u,
+ }
+ ret[o]['pages'].append(i)  # i is the position of this page's entry in font_index
return ret
diff --git a/pdfsyntax/cli.py b/pdfsyntax/cli.py
index 1ce3ace..28876d7 100644
--- a/pdfsyntax/cli.py
+++ b/pdfsyntax/cli.py
@@ -14,7 +14,7 @@ def main():
description='Navigate through the structure of a PDF file')
parser.add_argument('command',
type=str,
- choices=['browse', 'disasm', 'overview', 'text'],
+ choices=['browse', 'disasm', 'overview', 'fonts', 'text'],
help='Command')
parser.add_argument('filename', type=str, help='PDF file name')
args = parser.parse_args()
@@ -24,6 +24,8 @@ def main():
dump_disasm(args.filename)
elif args.command == 'overview':
overview(args.filename)
+ elif args.command == 'fonts':
+ print_fonts(args.filename)
elif args.command == 'text':
spatial(args.filename)
@@ -222,6 +224,32 @@ def spatial(filename: str) -> None:
return
+def print_generic_table(lines: list, col_widths: list) -> None:
+ """Print rows as fixed-width columns: each cell is truncated to its column width, padded, and followed by a space."""
+ text = ''
+ for line in lines:
+ for i, w in enumerate(col_widths):
+ text += f"{line[i][:w]:{w}} "  # cells are sliced, so they must be strings (or other sliceable values)
+ text += '\n'
+ print(text)  # NOTE(review): text already ends with '\n' when lines is non-empty, so print() adds a blank line
+ return
+
+
+def print_fonts(filename: str) -> None:
+ """Print a table of the fonts used in the file: name, type, encoding, object reference and page count."""
+ table = []
+ doc = readfile(filename)
+ fs = fonts(doc)
+ table.append(['Name', 'Type', 'Encoding', 'Obj', 'Pages'])  # header row
+ for iref in fs:
+ f = fs[iref]
+ o_num = f"{int(iref.imag)},{int(iref.real)}"  # imaginary part, then real part; presumably object number then generation - confirm
+ nb_pages = f"{len(f['pages'])}"  # length of the font's 'pages' list, as a string
+ table.append([f['name'], f['type'], f['encoding'], o_num, nb_pages])
+ print_generic_table(table, [30, 10, 16, 8, 6])  # one width per column, in header order
+ return
+
+
def overview(filename: str) -> None:
"""Print both structure and metadata of a file."""
doc = readfile(filename)