decoder.go

// Decoding a Document
//
// Decoding a GEDCOM stream:
//
//   ged := "0 HEAD\n1 CHAR UTF-8"
//
//   decoder := gedcom.NewDecoder(strings.NewReader(ged))
//   document, err := decoder.Decode()
//   if err != nil {
//     panic(err)
//   }
//
// If you are reading from a file you can use NewDocumentFromGEDCOMFile:
//
//   document, err := gedcom.NewDocumentFromGEDCOMFile("family.ged")
//   if err != nil {
//       panic(err)
//   }
//
package gedcom

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"regexp"
	"strconv"
	"strings"
)

// See Decoder.consumeOptionalBOM().
var byteOrderMark = []byte{0xef, 0xbb, 0xbf}

// Decoder represents a GEDCOM decoder.
type Decoder struct {
	r *bufio.Reader

	// It is not valid for GEDCOM values to contain new lines or carriage
	// returns. However, some application dump data without correctly using the
	// CONT tags.
	//
	// Strictly speaking we should bail out with an error but there are too many
	// cases that are difficult to clean up for consumers so we offer and option
	// to permit it.
	//
	// When enabled any line than cannot be parsed will be considered an
	// extension of the previous line (including the new line character).
	AllowMultiLine bool

	// AllowInvalidIndents allows a child node to have an indent greater than +1
	// of the parent. AllowInvalidIndents is disabled by default because if this
	// happens the GEDCOM file is broken in some possibly serious way and
	// certainly not a valid GEDCOM file.
	//
	// The biggest problem with having the indents wrongly aligned is that nodes
	// that are expected to be a certain depth (such as NPFX inside a NAME) will
	// probably break or interfere with a traversal algorithm that is not
	// expecting the node to be there/at that level.
	//
	// Another important thing to note is that the incorrect indent level will
	// not be retained when writing the Document back to a GEDCOM.
	AllowInvalidIndents bool
}

// Create a new decoder to parse a reader that contain GEDCOM data.
func NewDecoder(r io.Reader) *Decoder {
	return &Decoder{
		r: bufio.NewReader(r),
	}
}

// Decode will parse the entire GEDCOM stream (until EOF is reached) and return
// a Document. If the GEDCOM stream is not valid then the document node will
// be nil and the error is returned.
//
// A blank GEDCOM or a GEDCOM that only contains empty lines is valid and a
// Document will be returned with zero nodes.
func (dec *Decoder) Decode() (*Document, error) {
	document := NewDocument()
	indents := Nodes{}
	var family *FamilyNode

	document.HasBOM = dec.consumeOptionalBOM()

	finished := false
	lineNumber := 0

	// Only used when AllowMultiLine is enabled.
	var previousNode Node

	for !finished {
		lineNumber++

		line, err := dec.readLine()
		if err != nil {
			if err != io.EOF {
				return nil, err
			}

			finished = true
		}

		// Skip blank lines.
		if line == "" {
			if dec.AllowMultiLine && previousNode != nil {
				previousNode.RawSimpleNode().value += "\n"
			}

			continue
		}

		node, indent, err := parseLine(line, document, family)
		if err != nil {
			if dec.AllowMultiLine && previousNode != nil {
				previousNode.RawSimpleNode().value += "\n" + line
				continue
			}

			return nil, fmt.Errorf("line %d: %s", lineNumber, err)
		}

		// Families cannot be nested so any children that appear after this node
		// will be attached to the most recently seen family. We do not need to
		// set this back to nil after we exit the family node.
		if f, ok := node.(*FamilyNode); ok {
			family = f
		}

		// Add a root node to the document.
		if indent == 0 {
			dec.trimNodeValue(previousNode)
			document.AddNode(node)
			previousNode = node

			// There can be multiple root nodes so make sure we always reset all
			// indent pointers.
			indents = Nodes{node}

			continue
		}

		if indent-1 >= len(indents) {
			// This means the file is not valid. I have seen it in very rare
			// cases. See full explanation in AllowInvalidIndents.
			if dec.AllowInvalidIndents {
				indent = len(indents)
			} else {
				panic(fmt.Sprintf(
					"indent is too large - missing parent? at line %d: %s",
					lineNumber, line))
			}
		}

		i := indents[indent-1]

		switch {
		case indent >= len(indents):
			// Descending one level. It is not valid for a child to have an
			// indent that is more than one greater than the parent. This would
			// be a corrupt GEDCOM and lead to a panic.
			indents = append(indents, node)

		case indent < len(indents)-1:
			// Moving back to a parent. It is possible for this leap to be
			// greater than one so trim the indent levels back as many times as
			// needed to represent the new indent level.
			indents = indents[:indent+1]
			indents[indent] = node

		default:
			// This case would be "indent == len(indents)-1" (the indent does
			// not change from the previous line). However, since it is the only
			// other logical possibility there is no need to evaluate it for the
			// case condition.
			//
			// Make sure we update the current indent with the new node so that
			// children get place on this node and not the previous one.
			indents[indent] = node
		}

		dec.trimNodeValue(previousNode)
		i.AddNode(node)

		previousNode = node
	}

	dec.trimNodeValue(previousNode)

	// Build the cache once.
	document.buildPointerCache()

	return document, nil
}

func (dec *Decoder) trimNodeValue(previousNode Node) {
	// When AllowMultiLine is enabled we have to be careful to trim the
	// surrounding spaces off the value so it can be interpreted correct.
	//
	// Another solution would be to ignore blank lines entirely, but then we
	// would miss the paragraph separators in multiline text.
	if !IsNil(previousNode) {
		newValue := strings.TrimSpace(previousNode.RawSimpleNode().value)
		previousNode.RawSimpleNode().value = newValue
	}
}

func (dec *Decoder) readLine() (string, error) {
	buf := new(bytes.Buffer)

	for {
		b, err := dec.r.ReadByte()
		if err != nil {
			return string(buf.Bytes()), err
		}

		// The line endings in the GEDCOM files can be different. A newline and
		// carriage return are both considered to be the end of the line and
		// empty lines are ignored so we can treat both of these characters as
		// independent line terminators.
		if b == '\n' || b == '\r' {
			break
		}

		buf.WriteByte(b)
	}

	return string(buf.Bytes()), nil
}

var lineRegexp = regexp.MustCompile(`^(\d) +(@[^@]+@ )?(\w+) ?(.*)?$`)

func parseLine(line string, document *Document, family *FamilyNode) (Node, int, error) {
	parts := lineRegexp.FindStringSubmatch(line)

	if len(parts) == 0 {
		return nil, 0, fmt.Errorf("could not parse: %s", line)
	}

	// Indent (required).
	indent, _ := strconv.Atoi(parts[1])

	// Pointer (optional).
	pointer := ""
	if parts[2] != "" {
		// Trim off the surrounding '@'.
		pointer = parts[2][1 : len(parts[2])-2]
	}

	// Tag (required).
	tag := TagFromString(parts[3])

	// Value (optional).
	value := parts[4]

	return newNode(document, family, tag, value, pointer), indent, nil
}

// NewNode creates a node with no children. It is also the correct way to
// create a shallow copy of a node.
//
// If the node tag is recognised as a more specific type, such as *DateNode then
// that will be returned. Otherwise a *SimpleNode will be used.
func NewNode(tag Tag, value, pointer string, children ...Node) Node {
	return newNodeWithChildren(nil, nil, tag, value, pointer, children)
}

func newNode(document *Document, family *FamilyNode, tag Tag, value, pointer string) Node {
	return newNodeWithChildren(document, family, tag, value, pointer, nil)
}

func newNodeWithChildren(document *Document, family *FamilyNode, tag Tag, value, pointer string, children Nodes) Node {
	var node Node

	switch tag {
	case TagBaptism:
		node = NewBaptismNode(value, children...)

	case TagBirth:
		node = NewBirthNode(value, children...)

	case TagBurial:
		node = NewBurialNode(value, children...)

	case TagChild:
		needsFamily(family, tag)

		node = newChildNode(family, value, children...)

	case TagDate:
		node = NewDateNode(value, children...)

	case TagDeath:
		node = NewDeathNode(value, children...)

	case TagEvent:
		node = NewEventNode(value, children...)

	case TagFamily:
		needsDocument(document, tag)

		node = newFamilyNode(document, pointer, children...)

	case UnofficialTagFamilySearchID1, UnofficialTagFamilySearchID2:
		node = NewFamilySearchIDNode(tag, value, children...)

	case TagFormat:
		node = NewFormatNode(value, children...)

	case TagHusband:
		needsFamily(family, tag)

		node = newHusbandNode(family, value, children...)

	case TagIndividual:
		needsDocument(document, tag)

		node = newIndividualNode(document, pointer, children...)

	case TagLatitude:
		node = NewLatitudeNode(value, children...)

	case TagLongitude:
		node = NewLongitudeNode(value, children...)

	case TagMap:
		node = NewMapNode(value, children...)

	case TagName:
		node = NewNameNode(value, children...)

	case TagNickname:
		node = NewNicknameNode(value, children...)

	case TagNote:
		node = NewNoteNode(value, children...)

	case TagPhonetic:
		node = NewPhoneticVariationNode(value, children...)

	case TagPlace:
		node = NewPlaceNode(value, children...)

	case TagResidence:
		node = NewResidenceNode(value, children...)

	case TagRomanized:
		node = NewRomanizedVariationNode(value, children...)

	case TagSex:
		node = NewSexNode(value)

	case TagSource:
		node = NewSourceNode(value, pointer, children...)

	case TagType:
		node = NewTypeNode(value, children...)

	case UnofficialTagUniqueID:
		node = NewUniqueIDNode(value, children...)

	case TagWife:
		needsFamily(family, tag)

		node = newWifeNode(family, value, children...)
	}

	if IsNil(node) {
		node = newSimpleNode(tag, value, pointer, children...)
	} else {
		simpleNode := node.RawSimpleNode()
		simpleNode.pointer = pointer
	}

	return node
}

func needsDocument(document *Document, tag Tag) {
	if document == nil {
		panic(fmt.Sprintf("cannot create %s without a document", tag))
	}
}

func needsFamily(family *FamilyNode, tag Tag) {
	if family == nil {
		panic(fmt.Sprintf("cannot create %s without a family", tag))
	}
}

// consumeOptionalBOM will test and discard the Byte Order Mark at the start of
// the stream.
//
// In order to keep the original stream as intact as possible when encoding the
// BOM will be written back if it existed originally.
//
// Use of a BOM is neither required nor recommended for UTF-8, but may be
// encountered in contexts where UTF-8 data is converted from other encoding
// forms that use a BOM or where the BOM is used as a UTF-8 signature. See the
// “Byte Order Mark” subsection in Section 16.8, Specials, for more information.
// - 2.6 Encoding Schemes, http://www.unicode.org/versions/Unicode5.0.0/ch02.pdf
func (dec *Decoder) consumeOptionalBOM() bool {
	possibleBOM, _ := dec.r.Peek(3)
	hasBOM := bytes.Compare(possibleBOM, byteOrderMark) == 0

	if hasBOM {
		dec.r.Discard(3)
	}

	return hasBOM
}