From fc84b8f84189c48d0c11c2ab113620bc37d630c4 Mon Sep 17 00:00:00 2001 From: Rod Vagg Date: Tue, 21 Jul 2020 19:09:43 +1000 Subject: [PATCH 1/2] chore: docs inline --- README.md | 38 +++--- doc.go | 61 +++++++++ hamt.go | 369 ++++++++++++++++++++++++++++++++++++++++++++++-------- hash.go | 15 ++- uhamt.go | 4 + 5 files changed, 414 insertions(+), 73 deletions(-) create mode 100644 doc.go diff --git a/README.md b/README.md index f3b9327..758aa7f 100644 --- a/README.md +++ b/README.md @@ -6,28 +6,22 @@ go-hamt-ipld [![](https://img.shields.io/badge/freenode-%23ipfs-blue.svg?style=flat-square)](http://webchat.freenode.net/?channels=%23ipfs) [![Travis CI](https://travis-ci.org/ipfs/go-hamt-ipld.svg?branch=master)](https://travis-ci.org/ipfs/go-hamt-ipld) -> A CHAMP HAMT implemented using ipld - - -## Table of Contents - -- [Usage](#usage) -- [API](#api) -- [Contribute](#contribute) -- [License](#license) - - -## Examples - -```go -// TODO -``` - -## Contribute - -PRs are welcome! - -Small note: If editing the Readme, please conform to the [standard-readme](https://github.com/RichardLitt/standard-readme) specification. +**This package is a reference implementation of the IPLD HAMT used in the +Filecoin blockchain.** It includes some optional flexibility such that it may +be used for other purposes outside of Filecoin. + +HAMT is a ["hash array mapped trie"](https://en.wikipedia.org/wiki/Hash_array_mapped_trie). +This implementation extends the standard form by including buckets for the +key/value pairs at storage leaves and [CHAMP mutation semantics](https://michael.steindorfer.name/publications/oopsla15.pdf). +The CHAMP invariant and mutation rules provide us with the ability to maintain +canonical forms given any set of keys and their values, regardless of insertion +order and intermediate data insertion and deletion. 
Therefore, for any given +set of keys and their values, a HAMT using the same parameters and CHAMP +semantics, the root node should always produce the same content identifier +(CID). + +**See https://godoc.org/github.com/ipfs/go-hamt-ipld for more information and +API details.** ## License diff --git a/doc.go b/doc.go new file mode 100644 index 0000000..b5dc12f --- /dev/null +++ b/doc.go @@ -0,0 +1,61 @@ +/* +Package hamt provides a reference implementation of the IPLD HAMT used in the +Filecoin blockchain. It includes some optional flexibility such that it may be +used for other purposes outside of Filecoin. + +HAMT is a "hash array mapped trie" +https://en.wikipedia.org/wiki/Hash_array_mapped_trie. This implementation +extends the standard form by including buckets for the key/value pairs at +storage leaves and CHAMP mutation semantics +https://michael.steindorfer.name/publications/oopsla15.pdf. The CHAMP invariant +and mutation rules provide us with the ability to maintain canonical forms +given any set of keys and their values, regardless of insertion order and +intermediate data insertion and deletion. Therefore, for any given set of keys +and their values, a HAMT using the same parameters and CHAMP semantics, the +root node should always produce the same content identifier (CID). + +Algorithm Overview + +The HAMT algorithm hashes incoming keys and uses incrementing subsections of +that hash digest at each level of its tree structure to determine the placement +of either the entry or a link to a child node of the tree. A `bitWidth` +determines the number of bits of the hash to use for index calculation at each +level of the tree such that the root node takes the first `bitWidth` bits of +the hash to calculate an index and as we move lower in the tree, we move along +the hash by `depth x bitWidth` bits. In this way, a sufficiently randomizing +hash function will generate a hash that provides a new index at each level of +the data structure. 
An index comprising `bitWidth` bits will generate index +values of `[ 0, 2^bitWidth )`. So a `bitWidth` of 8 will generate indexes of 0 +to 255 inclusive. + +Each node in the tree can therefore hold up to `2^bitWidth` elements of data, +which we store in an array. In this HAMT and the IPLD HashMap we store +entries in buckets. A `Set(key, value)` mutation where the index generated at +the root node for the hash of key denotes an array index that does not yet +contain an entry, we create a new bucket and insert the key / value pair entry. +In this way, a single node can theoretically hold up to +`2^bitWidth x bucketSize` entries, where `bucketSize` is the maximum number of +elements a bucket is allowed to contain ("collisions"). In practice, indexes do +not distribute with perfect randomness so this maximum is theoretical. Entries +stored in the node's buckets are stored in key-sorted order. + +Parameters + +This HAMT implementation: + +• Fixes the `bucketSize` to 3. + +• Defaults the `bitWidth` to 8, however within Filecoin it uses 5 + +• Defaults the hash algorithm to the 64-bit variant of Murmur3-x64 + +Further Reading + +The algorithm used here is identical to that of the IPLD HashMap algorithm +specified at +https://github.com/ipld/specs/blob/master/data-structures/hashmap.md. The +specific parameters used by Filecoin and the DAG-CBOR block layout differ from +the specification and are defined at +https://github.com/ipld/specs/blob/master/data-structures/hashmap.md#Appendix-Filecoin-hamt-variant.
+*/ +package hamt diff --git a/hamt.go b/hamt.go index 6b6c42a..e399283 100644 --- a/hamt.go +++ b/hamt.go @@ -12,9 +12,56 @@ import ( xerrors "golang.org/x/xerrors" ) -const arrayWidth = 3 +//----------------------------------------------------------------------------- +// Defaults + +const bucketSize = 3 const defaultBitWidth = 8 +//----------------------------------------------------------------------------- +// Errors + +// ErrNotFound is returned when a Find operation fails to locate the specified +// key in the HAMT +var ErrNotFound = fmt.Errorf("not found") + +// ErrMaxDepth is returned when the HAMT spans further than the hash function +// is capable of representing. This can occur when sufficient hash collisions +// (e.g. from a weak hash function and attacker-provided keys) extend leaf +// nodes beyond the number of bits that a hash can represent. Or this can occur +// on extremely large (likely impractical) HAMTs that are unable to be +// represented with the hash function used. Hash functions with larger byte +// output increase the maximum theoretical depth of a HAMT. +var ErrMaxDepth = fmt.Errorf("attempted to traverse hamt beyond max depth") + +//----------------------------------------------------------------------------- +// Serialized data structures + +// Node is a single point in the HAMT, encoded as an IPLD tuple in DAG-CBOR of +// shape: +// [bytes, [Pointer...]] +// where 'bytes' is the big.Int#Bytes() and the Pointers array is between 1 and +// `2^bitWidth`. +// +// The Bitfield provides us with a mechanism to store a compacted array of +// Pointers. Each bit in the Bitfield represents an element in a sparse array +// where `1` indicates the element is present in the Pointers array and `0` +// indicates it is omitted. To look-up a specific index in the Pointers array +// you must first make a count of the number of `1`s (popcount) up to the +// element you are looking for. +// e.g. 
a Bitfield of `10010110000` shows that we have a 4 element Pointers +// array. Indexes `[1]` and `[2]` are not present, but index `[3]` is at +// the second position of our Pointers array. +// +// (Note: the `refmt` tags are ignored by cbor-gen which will generate an +// array type rather than map.) +// +// The IPLD Schema representation of this data structure is as follows: +// +// type Node struct { +// bitfield Bytes +// pointers [Pointer] +// } representation tuple type Node struct { Bitfield *big.Int `refmt:"bf"` Pointers []*Pointer `refmt:"p"` @@ -26,11 +73,80 @@ type Node struct { store cbor.IpldStore } +// Pointer is an element in a HAMT node's Pointers array, encoded as an IPLD +// tuple in DAG-CBOR of shape: +// {"0": CID} or {"1": [KV...]} +// Where a map with a single key of "0" contains a Link, where a map with a +// single key of "1" contains a KV bucket. The map may contain only one of +// these two possible keys. +// +// There are between 1 and 2^bitWidth of these Pointers in any HAMT node. +// +// A Pointer contains either a KV bucket of `bucketSize` (3) values or a link +// (CID) to a child node. When a KV bucket overflows beyond `bucketSize`, the +// bucket is replaced with a link to a newly created HAMT node which will +// contain the `bucketSize+1` elements in its own Pointers array. +// +// (Note: the `refmt` tags are ignored by cbor-gen which will generate an +// array type rather than map.) +// +// The IPLD Schema representation of this data structure is as follows: +// +// type Pointer union { +// &Node "0" +// Bucket "1" +// } representation keyed +// +// type Bucket [KV] +type Pointer struct { + KVs []*KV `refmt:"v,omitempty"` + Link cid.Cid `refmt:"l,omitempty"` + + // cached node to avoid too many serialization operations + // TODO(rvagg): we should check that this is actually used optimally. Flush() + // performs a save of all of the cached nodes, but both Copy() and loadChild() + // will set them. 
In the case of loadChild() we're not expecting a mutation so + // a save is likely going to mean we incur unnecessary serialization when + // we've simply inspected the tree. Copy() will only set a cached form if + // it already exists on the source. It's unclear exactly what Flush() is good + // for in its current form. Users may also need an advisory about memory + // usage of large graphs since they don't have control over this outside of + // Flush(). + cache *Node +} + +// KV represents leaf storage within a HAMT node. A Pointer may hold up to +// `bucketSize` KV elements, where each KV contains a key and value pair +// stored by the user. +// +// Keys are represented as bytes +// +// The IPLD Schema representation of this data structure is as follows: +// +// type KV struct { +// key Bytes +// value Any +// } representation tuple +type KV struct { + Key []byte + Value *cbg.Deferred +} + +//----------------------------------------------------------------------------- +// Options + // Option is a function that configures the node +// +// See UseTreeBitWidth and UseHashFunction type Option func(*Node) -// UseTreeBitWidth allows you to set the width of the HAMT tree -// in bits (from 1-8) via a customized hash function +// UseTreeBitWidth allows you to set a custom bitWidth of the HAMT in bits +// (from 1-8). +// +// Passing in the returned Option to NewNode will generate a new HAMT that uses +// the specified bitWidth. +// +// The default bitWidth is 8. func UseTreeBitWidth(bitWidth int) Option { return func(nd *Node) { if bitWidth > 0 && bitWidth <= 8 { @@ -39,17 +155,30 @@ func UseTreeBitWidth(bitWidth int) Option { } } -// UseHashFunction allows you to set the hash function used by the HAMT. It -// defaults to murmur3 but you should use sha256 when an attacker can pick the -// keys. +// UseHashFunction allows you to set the hash function used for internal +// indexing by the HAMT. 
+// +// Passing in the returned Option to NewNode will generate a new HAMT that uses +// the specified hash function. +// +// The default hash function is murmur3-x64 but you should use a +// cryptographically secure function such as SHA2-256 if an attacker may be +// able to pick the keys in order to avoid potential hash collision (tree +// explosion) attacks. func UseHashFunction(hash func([]byte) []byte) Option { return func(nd *Node) { nd.hash = hash } } -// NewNode creates a new IPLD HAMT Node with the given store and given -// options +//----------------------------------------------------------------------------- +// Instance and helpers functions + +// NewNode creates a new IPLD HAMT Node with the given IPLD store and any +// additional options (bitWidth and hash function). +// +// This function creates a new HAMT that you can use directly and is also +// used internally to create child nodes. func NewNode(cs cbor.IpldStore, options ...Option) *Node { nd := &Node{ Bitfield: big.NewInt(0), @@ -65,19 +194,15 @@ func NewNode(cs cbor.IpldStore, options ...Option) *Node { return nd } -type KV struct { - Key []byte - Value *cbg.Deferred -} - -type Pointer struct { - KVs []*KV `refmt:"v,omitempty"` - Link cid.Cid `refmt:"l,omitempty"` - - // cached node to avoid too many serialization operations - cache *Node -} - +// Find navigates through the HAMT structure to where key `k` should exist. If +// the key is not found, an ErrNotFound error is returned. If the key is found +// and the `out` parameter has an UnmarshalCBOR(Reader) method, the decoded +// value is returned. If found and the `out` parameter is `nil`, then `nil` +// will be returned (can be used to determine if a key exists where you don't +// need the value, e.g. using the HAMT as a Set). +// +// Depending on the size of the HAMT, this method may load a large number of +// child nodes via the HAMT's IpldStore. 
func (n *Node) Find(ctx context.Context, k string, out interface{}) error { return n.getValue(ctx, &hashBits{b: n.hash([]byte(k))}, k, func(kv *KV) error { // used to just see if the thing exists in the set @@ -97,6 +222,8 @@ func (n *Node) Find(ctx context.Context, k string, out interface{}) error { }) } +// FindRaw performs the same function as Find, but returns the raw bytes found +// at the key's location (which may or may not be DAG-CBOR, see also SetRaw). func (n *Node) FindRaw(ctx context.Context, k string) ([]byte, error) { var ret []byte err := n.getValue(ctx, &hashBits{b: n.hash([]byte(k))}, k, func(kv *KV) error { @@ -106,28 +233,48 @@ func (n *Node) FindRaw(ctx context.Context, k string) ([]byte, error) { return ret, err } +// Delete removes an entry entirely from the HAMT structure. +// +// This operation will result in the modification of _at least_ one IPLD block +// via the IpldStore. Depending on the contents of the leaf node, this +// operation may result in a node collapse to shrink the HAMT into its +// canonical form for the remaining data. For an insufficiently random +// collection of keys at the relevant leaf nodes such a collapse may cascade to +// further nodes. func (n *Node) Delete(ctx context.Context, k string) error { kb := []byte(k) return n.modifyValue(ctx, &hashBits{b: n.hash(kb)}, kb, nil) } -var ErrNotFound = fmt.Errorf("not found") -var ErrMaxDepth = fmt.Errorf("attempted to traverse hamt beyond max depth") - +// handle the two Find operations in a recursive manner, where each node in the +// HAMT we traverse we call this function again with the same parameters. Note +// that `hv` contains state and `hv.Next()` is not idempotent. Each call +// increments a counter for the number of bits consumed. func (n *Node) getValue(ctx context.Context, hv *hashBits, k string, cb func(*KV) error) error { + // hv.Next chomps off `bitWidth` bits from the hash digest. 
As we proceed + // down the tree, each node takes `bitWidth` more bits from the digest. If + // we attempt to take more bits than the digest contains, we hit max-depth + // and can't proceed. idx, err := hv.Next(n.bitWidth) if err != nil { return ErrMaxDepth } + // if the element expected at this node isn't here then we can be sure it + // doesn't exist in the HAMT. if n.Bitfield.Bit(idx) == 0 { return ErrNotFound } + // otherwise, the value is either local or in a child + + // perform a popcount of bits up to the `idx` to find `cindex` cindex := byte(n.indexForBitPos(idx)) - c := n.getChild(cindex) + c := n.getPointer(cindex) if c.isShard() { + // if isShard, we have a pointer to a child that we need to load and + // delegate our find operation to chnd, err := c.loadChild(ctx, n.store, n.bitWidth, n.hash) if err != nil { return err @@ -136,15 +283,22 @@ func (n *Node) getValue(ctx context.Context, hv *hashBits, k string, cb func(*KV return chnd.getValue(ctx, hv, k, cb) } + // if not isShard, then the key/value pair is local and we need to retrieve + // it from the bucket. The bucket is sorted but only between 1 and + // `bucketSize` in length, so no need for fanciness. for _, kv := range c.KVs { if string(kv.Key) == k { return cb(kv) } + // TODO: getting here would indicate a malformed HAMT, return error of some + // kind } return ErrNotFound } +// load a HAMT node from the IpldStore and pass on the (assumed) parameters +// that are not stored with the node. 
func (p *Pointer) loadChild(ctx context.Context, ns cbor.IpldStore, bitWidth int, hash func([]byte) []byte) (*Node, error) { if p.cache != nil { return p.cache, nil @@ -154,6 +308,9 @@ func (p *Pointer) loadChild(ctx context.Context, ns cbor.IpldStore, bitWidth int if err != nil { return nil, err } + // these don't get set in LoadNode because we don't have the Options + // at this point but what is inherited from the parents may have varied + // from the defaults out.bitWidth = bitWidth out.hash = hash @@ -161,7 +318,32 @@ func (p *Pointer) loadChild(ctx context.Context, ns cbor.IpldStore, bitWidth int return out, nil } +// LoadNode loads a HAMT Node from the IpldStore and configures it according +// to any specified Option parameters. Where the parameters of this HAMT vary +// from the defaults (hash function and bitWidth), those variations _must_ be +// supplied here via Options otherwise the HAMT will not be readable. +// +// Users should consider how their HAMT parameters are stored or specified +// along with their HAMT where the data is expected to have a long shelf-life +// as future users will need to know the parameters of a HAMT being loaded in +// order to decode it. Users should also NOT rely on the default parameters +// of this library to remain the defaults long-term and have strategies in +// place to manage variations. func LoadNode(ctx context.Context, cs cbor.IpldStore, c cid.Cid, options ...Option) (*Node, error) { + // TODO(rvagg): loaded nodes must be validated to make sure we have only + // the correct form of Nodes to avoid attacks from alternative implementations + // that feed us poorly formed data. Check that: + // 1. Pointers contains the correct number of elements defined by Bitfield + // 2. Pointers contain *only* a link or a bucket (this may already be done in + // the CBOR unmarshal but might be worth doing here so the check is all in + // one place) + // 3. Pointers with links have are DAG-CBOR multicodec + // 4. 
KV buckets contain strictly between 1 and bucketSize elements + // 5. KV buckets are ordered by key (bytewise comparison) + // 6. keys and values are valid (what are the rules? len(key)>0? can val be + // nul? etc.) + // 7. .. potentially we could validate the position of elements if we propagate + // the depth of this node so we know which bits to chomp off the hash digest. var out Node if err := cs.Get(ctx, c, &out); err != nil { return nil, err @@ -178,6 +360,10 @@ func LoadNode(ctx context.Context, cs cbor.IpldStore, c cid.Cid, options ...Opti return &out, nil } +// Calculate the total _byte weight_ of the HAMT by fetching each node +// from the IpldStore and adding its raw byte size to the total. This +// operation will exhaustively load every node of the HAMT so should not +// be used lightly. func (n *Node) checkSize(ctx context.Context) (uint64, error) { c, err := n.store.Put(ctx, n) if err != nil { @@ -207,6 +393,10 @@ func (n *Node) checkSize(ctx context.Context) (uint64, error) { return totsize, nil } +// Flush saves and purges any cached Nodes recursively from this Node through +// its (cached) children. Cached nodes primarily exist through the use of +// Copy() operations where the entire graph is instantiated in memory and each +// child pointer exists in cached form. func (n *Node) Flush(ctx context.Context) error { for _, p := range n.Pointers { if p.cache != nil { @@ -226,13 +416,8 @@ func (n *Node) Flush(ctx context.Context) error { return nil } -// SetRaw sets key k to cbor bytes raw -func (n *Node) SetRaw(ctx context.Context, k string, raw []byte) error { - d := &cbg.Deferred{Raw: raw} - kb := []byte(k) - return n.modifyValue(ctx, &hashBits{b: n.hash(kb)}, kb, d) -} - +// Set key k to value v, where v is has a MarshalCBOR(bytes.Buffer) method to +// encode it. 
func (n *Node) Set(ctx context.Context, k string, v interface{}) error { var d *cbg.Deferred @@ -256,6 +441,31 @@ func (n *Node) Set(ctx context.Context, k string, v interface{}) error { return n.modifyValue(ctx, &hashBits{b: n.hash(kb)}, kb, d) } +// SetRaw is similar to Set but sets key k in the HAMT to raw bytes without +// performing a DAG-CBOR marshal. The bytes may or may not be encoded DAG-CBOR +// (see also FindRaw for fetching raw form). +func (n *Node) SetRaw(ctx context.Context, k string, raw []byte) error { + d := &cbg.Deferred{Raw: raw} + kb := []byte(k) + return n.modifyValue(ctx, &hashBits{b: n.hash(kb)}, kb, d) +} + +// When deleting elements, we need to perform a compaction such that there is +// a single canonical form of any HAMT with a given set of key/value pairs. +// Any node with less than `bucketSize` elements needs to be collapsed into a +// bucket of Pointers in the parent node. +// TODO(rvagg): I don't think the logic here is correct. A compaction should +// occur strictly when: there are no links to child nodes remaining (assuming +// we've cleaned them first and they haven't caused a cascading collapse to +// here) and the number of direct elements (actual k/v pairs) in this node is +// equal to bucketSize+1. Anything less than bucketSize+1 is invalid for a node +// other than the root node (which probably won't have cleanChild() called on +// it). e.g. +// https://github.com/rvagg/iamap/blob/fad95295b013c8b4f0faac6dd5d9be175f6e606c/iamap.js#L333 +// If we perform this depth-first, then it's possible to see the collapse +// cascade upward such that we end up with some parent node with a bucket with +// only bucketSize elements. The canonical form of the HAMT requires that +// any node that could be collapsed into a parent bucket is collapsed.
func (n *Node) cleanChild(chnd *Node, cindex byte) error { l := len(chnd.Pointers) switch { @@ -269,8 +479,8 @@ func (n *Node) cleanChild(chnd *Node, cindex byte) error { return nil } - return n.setChild(cindex, ps) - case l <= arrayWidth: + return n.setPointer(cindex, ps) + case l <= bucketSize: var chvals []*KV for _, p := range chnd.Pointers { if p.isShard() { @@ -278,32 +488,47 @@ } for _, sp := range p.KVs { - if len(chvals) == arrayWidth { + if len(chvals) == bucketSize { return nil } chvals = append(chvals, sp) } } - return n.setChild(cindex, &Pointer{KVs: chvals}) + return n.setPointer(cindex, &Pointer{KVs: chvals}) default: return nil } } +// Add a new value, update an existing value, or delete a value from the HAMT, +// potentially recursively calling child nodes to find the exact location of +// the entry in question and potentially collapsing nodes into buckets in +// parent nodes where a deletion violates the canonical form rules (see +// cleanChild()). Recursive calls use the same arguments on child nodes but +// note that `hv.Next()` is not idempotent. Each call will increment the number +// of bits chomped off the hash digest for this key. func (n *Node) modifyValue(ctx context.Context, hv *hashBits, k []byte, v *cbg.Deferred) error { idx, err := hv.Next(n.bitWidth) if err != nil { return ErrMaxDepth } + // if the element expected at this node isn't here then we can be sure it + // doesn't exist in the HAMT already and can insert it at the appropriate + // position.
if n.Bitfield.Bit(idx) != 1 { - return n.insertChild(idx, k, v) + return n.insertKV(idx, k, v) } + // otherwise, the value is either local or in a child + + // perform a popcount of bits up to the `idx` to find `cindex` cindex := byte(n.indexForBitPos(idx)) - child := n.getChild(cindex) + child := n.getPointer(cindex) if child.isShard() { + // if isShard, we have a pointer to a child that we need to load and + // delegate our modify operation to chnd, err := child.loadChild(ctx, n.store, n.bitWidth, n.hash) if err != nil { return err @@ -313,7 +538,10 @@ func (n *Node) modifyValue(ctx context.Context, hv *hashBits, k []byte, v *cbg.D return err } - // CHAMP optimization, ensure trees look correct after deletions + // CHAMP optimization, ensure the HAMT retains its canonical form for the + // current data it contains. This may involve collapsing child nodes if + // they no longer contain enough elements to justify their stand-alone + // existence. if v == nil { if err := n.cleanChild(chnd, cindex); err != nil { return err @@ -323,11 +551,18 @@ func (n *Node) modifyValue(ctx context.Context, hv *hashBits, k []byte, v *cbg.D return nil } + // if not isShard, then either the key/value pair is local here and can be + // modified (or deleted) here or needs to be added as a new child node if + // there is an overflow. 
+ if v == nil { + // delete operation, find the child and remove it, compacting the bucket in + // the process for i, p := range child.KVs { if bytes.Equal(p.Key, k) { if len(child.KVs) == 1 { - return n.rmChild(cindex, idx) + // last element in the bucket, remove it and update the bitfield + return n.rmPointer(cindex, idx) } copy(child.KVs[i:], child.KVs[i+1:]) @@ -338,7 +573,7 @@ func (n *Node) modifyValue(ctx context.Context, hv *hashBits, k []byte, v *cbg.D return ErrNotFound } - // check if key already exists + // modify existing, check if key already exists for _, p := range child.KVs { if bytes.Equal(p.Key, k) { p.Value = v @@ -346,8 +581,16 @@ func (n *Node) modifyValue(ctx context.Context, hv *hashBits, k []byte, v *cbg.D } } - // If the array is full, create a subshard and insert everything into it - if len(child.KVs) >= arrayWidth { + if len(child.KVs) >= bucketSize { + // bucket is full, create a child node (shard) with all existing bucket + // elements plus the new one and set it in the place of the bucket + // TODO(rvagg): all of the modifyValue() calls are going to result + // in a store.Put(), this could be improved by allowing NewNode() to take + // the bulk set of elements, or modifying modifyValue() for the case + // where we know for sure that the elements will go into buckets and + // not cause an overflow - i.e. we just need to take each element, hash it + // and consume the correct number of bytes off the digest and figure out + // where it should be in the new node.
sub := NewNode(n.store) sub.bitWidth = n.bitWidth sub.hash = n.hash @@ -368,10 +611,11 @@ func (n *Node) modifyValue(ctx context.Context, hv *hashBits, k []byte, v *cbg.D return err } - return n.setChild(cindex, &Pointer{Link: c}) + return n.setPointer(cindex, &Pointer{Link: c}) } - // otherwise insert the new element into the array in order + // otherwise insert the new element into the array in order, the ordering is + // important to retain canonical form np := &KV{Key: k, Value: v} for i := 0; i < len(child.KVs); i++ { if bytes.Compare(k, child.KVs[i].Key) < 0 { @@ -383,7 +627,10 @@ func (n *Node) modifyValue(ctx context.Context, hv *hashBits, k []byte, v *cbg.D return nil } -func (n *Node) insertChild(idx int, k []byte, v *cbg.Deferred) error { +// Insert a new key/value pair into the current node at the specified index. +// This will involve modifying the bitfield for that index and inserting a new +// bucket containing the single key/value pair at that position. +func (n *Node) insertKV(idx int, k []byte, v *cbg.Deferred) error { if v == nil { return ErrNotFound } @@ -397,12 +644,18 @@ func (n *Node) insertChild(idx int, k []byte, v *cbg.Deferred) error { return nil } -func (n *Node) setChild(i byte, p *Pointer) error { +// Set a Pointer at a specific location, this doesn't modify the elements array +// but assumes that what's there can be updated. This seems to mostly be useful +// for tail calls. +func (n *Node) setPointer(i byte, p *Pointer) error { n.Pointers[i] = p return nil } -func (n *Node) rmChild(i byte, idx int) error { +// Remove a child at a specified index, splicing the Pointers array to remove +// it and updating the bitfield to specify that an element no longer exists at +// that position. 
+func (n *Node) rmPointer(i byte, idx int) error { copy(n.Pointers[i:], n.Pointers[i+1:]) n.Pointers = n.Pointers[:len(n.Pointers)-1] n.Bitfield.SetBit(n.Bitfield, idx, 0) @@ -410,15 +663,26 @@ func (n *Node) rmChild(i byte, idx int) error { return nil } -func (n *Node) getChild(i byte) *Pointer { +// Load a Pointer from the specified index of the Pointers array. The element +// should exist in a properly formed HAMT. +func (n *Node) getPointer(i byte) *Pointer { if int(i) >= len(n.Pointers) || i < 0 { + // TODO(rvagg): I think this should be an error, there's an assumption in + // calling code that it's not null and a proper hash chomp shouldn't result + // in anything out of bounds return nil } return n.Pointers[i] } +// Copy a HAMT node and all of its contents. May be useful for mutation +// operations where the original needs to be preserved in memory. +// +// This operation will also recursively clone any child nodes that are attached +// as cached nodes. func (n *Node) Copy() *Node { + // TODO(rvagg): clarify what situations this method is actually useful for. nn := NewNode(n.store) nn.bitWidth = n.bitWidth nn.hash = n.hash @@ -443,10 +707,16 @@ func (n *Node) Copy() *Node { return nn } +// Pointers elements can either contain a bucket of local elements or be a +// link to a child node. In the case of a link, isShard() returns true. func (p *Pointer) isShard() bool { return p.Link.Defined() } +// ForEach recursively calls function f on each k / val pair found in the HAMT. +// This performs a full traversal of the graph and for large HAMTs can cause +// a large number of loads from the IpldStore. This should not be used lightly +// as it can incur large costs. 
func (n *Node) ForEach(ctx context.Context, f func(k string, val interface{}) error) error { for _, p := range n.Pointers { if p.isShard() { @@ -460,7 +730,6 @@ func (n *Node) ForEach(ctx context.Context, f func(k string, val interface{}) er } } else { for _, kv := range p.KVs { - // TODO: consider removing 'strings as keys' from every interface, go full-on bytes everywhere if err := f(string(kv.Key), kv.Value); err != nil { return err } diff --git a/hash.go b/hash.go index bba6c39..a99dbb7 100644 --- a/hash.go +++ b/hash.go @@ -6,7 +6,9 @@ import ( "github.com/spaolacci/murmur3" ) -// hashBits is a helper that allows the reading of the 'next n bits' as an integer. +// hashBits is a helper that allows the reading of the 'next n bits' of a +// digest as an integer. State is retained and calls to `Next` will +// increment the number of consumed bits. type hashBits struct { b []byte consumed int @@ -18,13 +20,24 @@ func mkmask(n int) byte { // Next returns the next 'i' bits of the hashBits value as an integer, or an // error if there aren't enough bits. +// Not enough bits means that the tree is not large enough to contain the data. +// Where the hash is providing a sufficient enough random distribution this +// means that it is "full", Where the distribution is not sufficiently random +// enough, this means there have been too many collisions. Where a user can +// control keys (that are hashed) and the hash function has some +// predictability, collisions can be forced by producing the same indexes at +// (most) levels. func (hb *hashBits) Next(i int) (int, error) { if hb.consumed+i > len(hb.b)*8 { + // TODO(rvagg): this msg looks like a UnixFS holdover, it's an overflow + // and should probably bubble up a proper Err* return 0, fmt.Errorf("sharded directory too deep") } return hb.next(i), nil } +// where 'i' is not '8', we need to read up to two bytes to extract the bits +// for the index. 
func (hb *hashBits) next(i int) int { curbi := hb.consumed / 8 leftb := 8 - (hb.consumed % 8) diff --git a/uhamt.go b/uhamt.go index e7cf8a4..8e750ea 100644 --- a/uhamt.go +++ b/uhamt.go @@ -8,6 +8,10 @@ import ( // indexForBitPos returns the index within the collapsed array corresponding to // the given bit in the bitset. The collapsed array contains only one entry // per bit set in the bitfield, and this function is used to map the indices. +// This is similar to a popcount() operation but is limited to a certain index. +// e.g. a Bitfield of `10010110000` shows that we have 4 elements in the +// associated array. Indexes `[1]` and `[2]` are not present, but index `[3]` +// is at the second position of our Pointers array. func (n *Node) indexForBitPos(bp int) int { return indexForBitPos(bp, n.Bitfield) } From 3d5b3607bc9dfb5f2385b202c0845591397c2fa3 Mon Sep 17 00:00:00 2001 From: Rod Vagg Date: Wed, 5 Aug 2020 14:33:15 +1000 Subject: [PATCH 2/2] fixup! chore: docs inline --- hamt.go | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/hamt.go b/hamt.go index e399283..42d964e 100644 --- a/hamt.go +++ b/hamt.go @@ -82,9 +82,9 @@ type Node struct { // // There are between 1 and 2^bitWidth of these Pointers in any HAMT node. // -// A Pointer contains either a KV bucket of `bucketSize` (3) values or a link -// (CID) to a child node. When a KV bucket overflows beyond `bucketSize`, the -// bucket is replaced with a link to a newly created HAMT node which will +// A Pointer contains either a KV bucket of up to `bucketSize` (3) values or a +// link (CID) to a child node. When a KV bucket overflows beyond `bucketSize`, +// the bucket is replaced with a link to a newly created HAMT node which will // contain the `bucketSize+1` elements in its own Pointers array.
// // (Note: the `refmt` tags are ignored by cbor-gen which will generate an @@ -119,7 +119,7 @@ type Pointer struct { // `bucketSize` KV elements, where each KV contains a key and value pair // stored by the user. // -// Keys are represented as bytes +// Keys are represented as bytes. // // The IPLD Schema representation of this data structure is as follows: // @@ -290,8 +290,6 @@ func (n *Node) getValue(ctx context.Context, hv *hashBits, k string, cb func(*KV if string(kv.Key) == k { return cb(kv) } - // TODO: getting here would indicate a malformed HAMT, return error of some - // kind } return ErrNotFound