Add fonts CLI command

desgeeko · Dec 26, 2024 · 818125a · 818125a
1 parent 9a7ac82
commit 818125a
Show file tree

Hide file tree

Showing 5 changed files with 65 additions and 9 deletions.
diff --git a/README.md b/README.md
@@ -42,6 +42,7 @@ You can get quick insights on a PDF file with these commands:
 - `overview` outputs text data about the structure and the metadata.
 - `disasm` outputs a dump of the file structure on the terminal.
 - `text` outputs extracted text spatially, as if it was a kind of scan.
+- `fonts` outputs a list of the fonts used.
 - `browse` outputs static html data that lets you browse the internal structure of the PDF file: the PDF source is pretty-printed and augmented with hyperlinks.
 
 ## API overview

diff --git a/docs/cli.html b/docs/cli.html
@@ -27,9 +27,16 @@ <h3><code>overview</code></h3>
 </ul>
 <h3><code>disasm</code></h3>
 <p>The output shows a terse and greppable view of the file internal structure. Please refer to the <a href='https://github.com/desgeeko/pdfsyntax/blob/main/docs/disassembler.md'>Disassembler article</a> for details.</p>
-<p></p>
 <h3><code>text</code></h3>
 <p>The output shows a full extract of the text content, with a spatial awareness: the algorithm <em>tries</em> to respect the original layout, as if characters of all sizes were approximately rendered on a fixed-size grid.</p>
+<h3><code>fonts</code></h3>
+<p>The output shows a list of fonts used in the file, with the following tabular data:</p>
+<ul><li>Name</li>
+<li>Type</li>
+<li>Encoding</li>
+<li>Object number and generation number, comma separated</li>
+<li>Number of pages where it occurs</li>
+</ul>
 <h3><code>browse</code></h3>
 <p>The generated HTML looks like the raw PDF file with the following additions:</p>
 <ul><li>Pretty-print dictionary object</li>

diff --git a/docs/cli.md b/docs/cli.md
@@ -20,10 +20,17 @@ The output shows information about:
 The output shows a terse and greppable view of the file internal structure.
 Please refer to the [Disassembler article](https://github.com/desgeeko/pdfsyntax/blob/main/docs/disassembler.md) for details.
 
-
 ### `text`
 The output shows a full extract of the text content, with a spatial awareness: the algorithm *tries* to respect the original layout, as if characters of all sizes were approximately rendered on a fixed-size grid.
 
+### `fonts`
+The output shows a list of fonts used in the file, with the following tabular data:
+- Name
+- Type
+- Encoding
+- Object number and generation number, comma separated
+- Number of pages where it occurs
+
 ### `browse`
 The generated HTML looks like the raw PDF file with the following additions:
 * Pretty-print dictionary object

diff --git a/pdfsyntax/api.py b/pdfsyntax/api.py
@@ -206,19 +206,32 @@ def add_text_annotation(doc: Doc, page_num: int, text: str, rect: list, opened:
 
 
 def fonts(doc: Doc) -> dict:
-    """Return for each font the pages where it appears."""
+    """Return each font with its attributes and the pages where it appears.
+
+    The result is keyed by each font's 'iref' entry. Every value is a dict
+    holding 'name' and 'type' (first character dropped), 'encoding' ('other'
+    when the entry is a complex value, otherwise first character dropped),
+    'to_unicode' as a bool, and 'pages', the list of indexes (starting at 0)
+    of the get_page_fonts() entries in which the font was found.
+    """
     ret = {}
     nb = number_pages(doc)
     font_index = get_page_fonts(doc, list(range(nb)))
     for i, page_fonts in enumerate(font_index):
         for font in page_fonts:
+            o = page_fonts[font]['iref']  # used as the key of the result
             n = page_fonts[font]['name'][1:]  # drop first character (presumably the leading '/' of a PDF name; confirm)
             t = page_fonts[font]['type'][1:]
-            u = page_fonts[font]['to_unicode']
-            name = f'{n} ({t})'
-            if name not in ret:
-                ret[name] = {'pages': [], 'to_unicode': u}
-            ret[name]['pages'].append(i)
+            e = page_fonts[font]['encoding']
+            if type(e) == complex:  # NOTE(review): isinstance(e, complex) is the idiomatic check
+                e = 'other'  # complex-valued encoding is not resolved here (presumably an indirect reference; confirm)
+            else:
+                e = e[1:]
+            if page_fonts[font]['to_unicode']:  # reduce the entry to a plain bool
+                u = True
+            else:
+                u = False
+            if o not in ret:  # attributes are recorded on first sight of the key only
+                ret[o] = {'name': n,
+                          'type': t,
+                          'encoding': e,
+                          'pages': [],
+                          'to_unicode': u,
+                          }
+            ret[o]['pages'].append(i)
     return ret
 
 

diff --git a/pdfsyntax/cli.py b/pdfsyntax/cli.py
@@ -14,7 +14,7 @@ def main():
                                      description='Navigate through the structure of a PDF file')
     parser.add_argument('command',
                         type=str,
-                        choices=['browse', 'disasm', 'overview', 'text'],
+                        choices=['browse', 'disasm', 'overview', 'fonts', 'text'],
                         help='Command')
     parser.add_argument('filename', type=str, help='PDF file name')
     args = parser.parse_args()
@@ -24,6 +24,8 @@ def main():
         dump_disasm(args.filename)
     elif args.command == 'overview':
         overview(args.filename)
+    elif args.command == 'fonts':
+        print_fonts(args.filename)
     elif args.command == 'text':
         spatial(args.filename)
 
@@ -222,6 +224,32 @@ def spatial(filename: str) -> None:
     return
 
 
+def print_generic_table(lines: list, col_widths: list) -> None:
+    """Print rows of string cells as a fixed-width table.
+
+    For each row in `lines`, one cell is read per entry of `col_widths`.
+    Each cell is truncated to its column width, padded to that width and
+    followed by a single space; each row ends with a newline.
+    The whole table is emitted with one print() call.
+    """
+    text = ''
+    for line in lines:
+        for i, w in enumerate(col_widths):
+            # [:w] truncates the cell, the :{w} format spec pads it to w characters
+            text += f"{line[i][:w]:{w}} "
+        text += '\n'
+    print(text)
+    return
+
+
+def print_fonts(filename: str) -> None:
+    """Print a table of the fonts used in a file.
+
+    Reads `filename`, collects its fonts with fonts() and prints a header row
+    followed by one row per font: name, type, encoding, the 'Obj' column and
+    the number of entries in the font's 'pages' list.
+    """
+    table = []
+    doc = readfile(filename)
+    fs = fonts(doc)
+    table.append(['Name', 'Type', 'Encoding', 'Obj', 'Pages'])
+    for iref in fs:
+        f = fs[iref]
+        # 'Obj' column: the key's imag and real parts as integers, comma separated
+        # (presumably object number then generation number; confirm)
+        o_num = f"{int(iref.imag)},{int(iref.real)}"
+        nb_pages = f"{len(f['pages'])}"  # count of page entries, formatted as text for the table
+        table.append([f['name'], f['type'], f['encoding'], o_num, nb_pages])
+    print_generic_table(table, [30, 10, 16, 8, 6])
+    return
+
+
 def overview(filename: str) -> None:
     """Print both structure and metadata of a file."""
     doc = readfile(filename)