Narrative Website Import

From Gramps
Jump to: navigation, search
Outdated code

The script below was written for the Gramps 2.2 Narrative Web report and would need to be updated to work with a current version of Gramps.


Please update or expand this section.

Use a GRAMPS v2.2-created Narrative Website report to restore your Gramps database.

The program below works by parsing the HTML website (also called "screen scraping") and places the data into a comma-separated value spreadsheet. The CSV can then be imported into Gramps.

This is an example of code for scraping data from a webpage with a well-defined and predictable format. It is not a general 'adaptable' scraper. The scraper itself was created as an emergency 'last resort' to recover partial data after a catastrophic hardware failure. Only a small fraction of the original database would be recoverable by scraping a website generated by the Narrative Web Report.

The process is dramatically more 'lossy' at each step:

  • The report is normally only used to export a filtered selection of People (also, Living and Private Persons are typically excluded or redacted).
  • The report only writes webpages for a subset of the data that Gramps can collect.
  • The scraper only reads a few pages that the report generates.
  • The CSV Importer only imports certain Gramps objects.

To run the program from the command line, provide the URL of the surname list, like:

 python SCRIPT.py SURNAME_LIST_URL > import.csv

Then, in Gramps you should be able to import the file "import.csv" into an empty database.


Use the following code (the original download link now returns a 404) as a good starting point. The script was written for the Gramps 2.2 NarrativeWeb report and needs to be updated to work with Gramps 3 or later.

# Python script for sucking a GRAMPS Narrative Website back into GRAMPS.

# By Doug Blank <>
# License: GPL
# (c) 2007

import os, sys, urllib, re

count = 0
person = {None: None}
family = {}
family_pair = {}
event = {}

def _linkHandle(value):
    """Return the person handle embedded in a scraped link, or None.

    ``value`` is the text of a table cell, for example
    ``<a href="../../ppl/x/y/HANDLE.html">Name</a>``.  The handle is the
    last path component of the link target, cut at ".html".  None is
    returned when the cell contains no path at all.
    """
    # "</a>" itself contains a "/", so drop it before looking for a path.
    value = value.replace("</a>", "")
    if "/" not in value:
        return None
    junk, tail = value.rsplit("/", 1)
    # partition() tolerates a target without ".html" instead of raising.
    return tail.partition(".html")[0]


def _parentRef(value):
    """Reduce a "Father"/"Mother" cell to a handle, or to its text if unlinked."""
    if not value:
        return value
    linked = _linkHandle(value)
    if linked is None:
        return value.replace("</a>", "")
    return linked


def _personRecord(surname, firstname, suffix, children, pairs):
    """Build a person dictionary from the (class, text) cells of one section."""
    pdata = {"surname": surname,
             "firstname": firstname,
             "children": children,
             "suffix": suffix}
    # A "field" cell is a label; the cell right after it holds the value.
    # Pairing with zip() ignores a trailing label that has no value cell.
    for (kind, label), (nextKind, value) in zip(pairs, pairs[1:]):
        if kind == "field":
            pdata[label] = value
    return pdata


def _storeSection(state, pairs, handle, surname, firstname, children):
    """File the cells gathered under heading ``state`` into the global tables."""
    if state == "Parents":
        father, mother = None, None
        for (kind, label), (nextKind, value) in zip(pairs, pairs[1:]):
            if label == "Father":
                father = value
            if label == "Mother":
                mother = value
        parents = (_parentRef(father), _parentRef(mother))
        # Append to the existing child list, or start one for new parents.
        family.setdefault(parents, []).append(handle)
    elif state == "Families":
        mdata = {"me": handle}
        mhandle = None
        for (key, value) in pairs:
            if key == "category":
                mdata["type"] = value
            elif key == "field":
                mdata["spouse"] = value
            elif key == "data":
                linked = _linkHandle(value)
                if linked is not None:
                    mhandle = linked
        family_pair[(handle, mhandle)] = mdata
    elif state == "Events":
        if pairs:  # an empty section has no first cell to key on
            event[(handle, pairs[0][1])] = pairs
    elif state.strip() == (firstname + " " + surname).strip():
        person[handle] = _personRecord(surname, firstname, "", children, pairs)
    elif state in ["Pedigree", "Ancestors", "Narrative"]:
        pass  # nothing is extracted from these sections
    else:
        # Heading did not match the name exactly: whatever is left after
        # removing surname and first name is taken to be the suffix.
        # NOTE(review): any other, unrecognised heading also lands here and
        # replaces the person record -- confirm against real report pages.
        suffix = state.replace(surname, "").replace(firstname, "").strip()
        person[handle] = _personRecord(surname, firstname, suffix,
                                       children, pairs)


def loadPerson(url, surname, firstname):
    """Download one person page and record what it contains.

    url       -- path of the person page, appended to the global ``gurl``
    surname   -- surname taken from the surname page that linked here
    firstname -- link text of the person on that surname page

    The page is read line by line.  Table cells of class "field", "data"
    and "category" are collected until the next ``<h?>`` heading; at that
    point the cells gathered under the previous heading are stored in the
    global ``person``, ``family``, ``family_pair`` and ``event`` tables.
    Cells after the last heading on the page are therefore never stored.
    """
    global count
    junk, handle = url.rsplit("/", 1)
    handle = handle.replace(".html", "")
    # Progress report on stderr so that stdout stays clean for the CSV.
    sys.stderr.write("    %s %s ,  %s\n" % (count, surname, firstname))
    count += 1
    pfp = urllib.urlopen(gurl + "/" + url)
    try:
        contents = pfp.read()
    finally:
        pfp.close()
    state = None
    pairs = []
    # This list object is shared with the stored person record, so children
    # found in the later "Families" section still show up in that record.
    children = []
    for line in contents.split("\n"):
        for key, data in re.findall(r'<td class="(.*?)">(.*?)</td>', line):
            if key in ("field", "data", "category"):
                pairs.append((key, data))
        if state == "Families" and line.startswith("<a href"):  # child?
            matches = re.match(r'<a href="(.*?)">(.*?)</a>.*', line)
            if matches:
                target = matches.group(1)
                if "/ppl/" in target:
                    junk, chandle = target.rsplit("/", 1)
                    children.append(chandle.replace(".html", ""))
        elif "<h" in line:
            matches = re.match("<h.>(.*?)</h.>", line)
            if matches:
                if state is not None:
                    _storeSection(state, pairs, handle,
                                  surname, firstname, children)
                pairs = []
                state = matches.group(1)

def loadSurname(url, surname):
    """Download one surname page and load every person it links to.

    url     -- path of the surname page, appended to the global ``gurl``
    surname -- the surname that page lists; passed on to loadPerson()

    Every ``<a href>`` whose target ends in ".html" and contains "/ppl/"
    is treated as a person page; its link text is used as the first name.
    """
    sfp = urllib.urlopen(gurl + "/" + url)
    try:
        contents = sfp.read()
    finally:
        sfp.close()
    for line in contents.split("\n"):
        for href, firstname in re.findall(r'<a href="(.*?)">(.*?)</a>', line):
            if href.endswith(".html") and "/ppl/" in href:
                # Keep only the part after the first "/ppl/"; maxsplit=1
                # guarantees exactly two pieces to unpack.
                prefix, purl = href.split("/ppl/", 1)
                loadPerson("/ppl/" + purl, surname, firstname)

# --- Main script: scrape the site, then write the CSV to stdout ----------
if len(sys.argv) < 2:
    sys.exit("usage: python SCRIPT.py SURNAME_LIST_URL > import.csv")

gurl = sys.argv[1] # URL of surnames
fp = urllib.urlopen(gurl)
try:
    contents = fp.read() # read in website
finally:
    fp.close()
for line in contents.split("\n"):
    for url, surname in re.findall(r'<a href="(.*?)">(.*?)</a>', line):
        # Surname pages are the links that start with "srn".
        if url.endswith(".html") and url.startswith("srn"):
            sys.stderr.write("Processing surname %s ...\n" % surname)
            loadSurname(url, surname)

# Person table.  Each print() gets one pre-formatted string, so the
# statement parses the same way under Python 2 and Python 3.
print("person,firstname,lastname,suffix,gender")
for h in person:
    if h:  # skip the {None: None} placeholder entry
        p = person[h]
        # "Gender" comes from the scraped page and may be absent.
        print('"%s","%s","%s","%s","%s"' % (h, p["firstname"], p["surname"],
                                            p["suffix"], p.get("Gender", "")))

# Couples found only through a "Families" section (no child page pointed
# back at them) have no entry in ``family`` yet.  Add one, keyed as
# (father, mother) like the entries created from "Parents" sections, and
# never touch a family that already has children recorded.
for fam in family_pair:
    data = family_pair[fam]
    h1, h2 = fam
    p1, p2 = None, None
    if h1 in person:
        p1 = person[h1]
    if h2 in person:
        p2 = person[h2]
    if p1 and p2:
        g1, g2 = p1.get("Gender"), p2.get("Gender")
        if g1 == "male" and g2 == "female":
            couple = (h1, h2)
        elif g1 == "female" and g2 == "male":
            couple = (h2, h1)
        else:
            couple = None
        if couple is not None and couple not in family:
            # The spouse is only a placeholder member; parents are
            # filtered out again when the child rows are written.
            family[couple] = [data["me"]]

print("marriage,parent1,parent2")
count = 1
marriage = {}
for pair in family:
    marriage[pair] = "F%04d" % count
    print('"%s","%s","%s"' % (marriage[pair], pair[0], pair[1]))
    count += 1

print("family,child")
for pair in family:
    # sorted() makes the row order reproducible between runs.
    for kid in sorted(set(family[pair])):
        # A parent must never be listed as a child of their own family.
        if (kid != pair[0]) and (kid != pair[1]):
            print('"%s","%s"' % (marriage[pair], kid))

See also

Read the following discussion about this (obsolete) code on the Gramps mailing list: