The first part in a blog series leading up to a keynote on Law and Computation at the University of Houston Law Center focuses on indexing and searching the U.S. Code with structured, public domain data and open source software.
Before diving into the technical aspects, some background on the U.S. Code. After a bill is passed and becomes Public Law, it is published in the Statutes at Large — the authoritative, chronological compilation of all enacted law. However, the Statutes at Large is sorted by date of enactment, not by concept; it contains laws that may affect multiple legal concepts, reference other laws, and amend or repeal other laws. This makes exhaustive search impractical.
The U.S. Code, produced by the Office of the Law Revision Counsel, solves this by organizing the law by concept (hierarchically), combining laws that reference one another, removing expired or repealed laws, and providing convenient citations. The LRC distributes copies of the Code in XHTML, which we use to build our index.
To build a legal search engine, the Code is arguably the best place to start. While there are other important sources like the Code of Federal Regulations or the Federal Reporter, the Code is as close to capital-L Law as it gets.
We use the Apache Lucene library to build an index of the Code from the 2009 and 2010 LRC snapshots. Lucene is a high-performance, full-featured text search engine library written entirely in Java. The process took a little over two minutes on a laptop. The buildCodeIndex.java program extracts the XHTML files from the LRC ZIP archives, splits each file into per-section documents, and passes them through Lucene's analyzer (which stems tokens and strips stopwords).
/**
* @author Michael J Bommarito II
* @date Apr 9, 2011
* @license MIT, (C) Michael J Bommarito II 2011
*/
package org.mjb;
// Java standard library imports
import java.io.*;
import java.util.*;
import java.util.regex.*;
import java.util.zip.*;
// Lucene imports
import org.apache.lucene.*;
import org.apache.lucene.analysis.*;
import org.apache.lucene.analysis.standard.*;
import org.apache.lucene.document.*;
import org.apache.lucene.index.*;
import org.apache.lucene.store.*;
import org.apache.lucene.util.Version;
import org.htmlparser.*;
import org.htmlparser.visitors.*;
/**
 * Builds a Lucene full-text index over U.S. Code sections extracted from the
 * LRC XHTML distribution.  Each "&lt;!-- documentid ... --&gt;"-delimited fragment
 * becomes one Lucene Document carrying metadata fields plus analyzed
 * head/statute text.
 */
class CodeIndex {
    // Lucene index writer used for all document additions.
    private IndexWriter indexWriter;

    /**
     * Constructor that initializes the Lucene index.
     *
     * @param indexPath path of the directory in which to create the index
     * @throws IOException if the index directory cannot be opened or created
     */
    public CodeIndex(String indexPath) throws IOException {
        // Create the index directory.
        Directory indexDir = FSDirectory.open(new File(indexPath));
        // Lucene 2.9-style constructor; the 3.1 IndexWriterConfig form is
        // avoided because Mahout requires the older index format.
        indexWriter = new IndexWriter(indexDir, new StandardAnalyzer(
                Version.LUCENE_29), true, IndexWriter.MaxFieldLength.UNLIMITED);
    }

    /**
     * Shuts down the Lucene index, waiting for merges to finish.
     *
     * @throws IOException if closing the writer fails
     */
    public void close() throws IOException {
        indexWriter.close();
    }

    /**
     * Splits an XHTML buffer into document fragments (delimited by
     * "&lt;!-- documentid" comments) and parses each one.
     *
     * @param htmlBuffer full contents of one XHTML file
     */
    public void parseHTML(final String htmlBuffer) {
        // Fix: compare against -1 instead of "> 0" so a document that starts
        // at offset 0 of the buffer is not silently skipped.
        int fragmentStart = htmlBuffer.indexOf("<!-- documentid");
        while (fragmentStart != -1) {
            // Find the next document marker or the end of the file.
            int fragmentEnd = htmlBuffer.indexOf("<!-- documentid",
                    fragmentStart + 1);
            String documentString = (fragmentEnd != -1)
                    ? htmlBuffer.substring(fragmentStart, fragmentEnd)
                    : htmlBuffer.substring(fragmentStart);
            // Now parse the document.
            parseDocument(documentString);
            // Set up the next search.
            fragmentStart = fragmentEnd;
        }
    }

    /**
     * Extracts a metadata value that follows the given key inside a document
     * fragment, terminated by the given delimiter (or end of buffer).
     * Consolidates the previously duplicated extractDocumentID/extractUSCKey/
     * extractCurrentThrough/extractItemPath methods.
     *
     * @param documentBuffer document fragment to search
     * @param key metadata key including the trailing colon, e.g. "usckey:"
     * @param delimiter string terminating the value, e.g. " " or " -->"
     * @return the extracted value, or "" if the key is not present
     */
    private String extractMetadata(final String documentBuffer,
            final String key, final String delimiter) {
        int keyStart = documentBuffer.indexOf(key);
        if (keyStart == -1) {
            // Fix: the original added key.length() to -1 on a missing key and
            // returned an arbitrary substring; fail soft with "" instead
            // (indexDocument already skips documents with an empty ID).
            return "";
        }
        int valueStart = keyStart + key.length();
        int valueEnd = documentBuffer.indexOf(delimiter, valueStart);
        return (valueEnd != -1)
                ? documentBuffer.substring(valueStart, valueEnd)
                : documentBuffer.substring(valueStart);
    }

    /**
     * Parses an individual document fragment from the larger XHTML LRC file:
     * pulls out the comment metadata, splits the body into head/statute
     * fields, and writes the result into the index.
     *
     * @param documentBuffer one document fragment produced by parseHTML
     */
    public void parseDocument(final String documentBuffer) {
        // Get the document-level metadata.  (The original also extracted the
        // "expcite" value but never indexed it, so that call is dropped.)
        String documentID = extractMetadata(documentBuffer, "documentid:", " ");
        String uscKey = extractMetadata(documentBuffer, "usckey:", " ");
        String currentThrough =
                extractMetadata(documentBuffer, "currentthrough:", " ");
        String itemPath = extractMetadata(documentBuffer, "itempath:", " -->");
        // Now split the document into fields delimited by
        // field-start / field-end comments.
        String textHead = "", textStatute = "";
        int fieldStart = documentBuffer.indexOf("<!-- field-start");
        while (fieldStart != -1) {
            int fieldEnd =
                    documentBuffer.indexOf("<!-- field-end", fieldStart + 1);
            String fieldString = (fieldEnd != -1)
                    ? documentBuffer.substring(fieldStart, fieldEnd)
                    : documentBuffer.substring(fieldStart);
            // Parse the fields depending on type.
            if (fieldString.contains("field-start:head")) {
                textHead = extractFieldText(fieldString);
            } else if (fieldString.contains("field-start:statute")) {
                textStatute = extractFieldText(fieldString);
            }
            // Find the next field, or stop at end of buffer.
            fieldStart = (fieldEnd != -1)
                    ? documentBuffer.indexOf("<!-- field-start", fieldEnd)
                    : -1;
        }
        try {
            indexDocument(documentID, uscKey, currentThrough, itemPath,
                    textHead, textStatute);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Strips HTML markup from a field fragment and returns the plain text.
     *
     * @param fieldBuffer HTML fragment of one field
     * @return the extracted, trimmed text, or "" if parsing fails
     */
    private String extractFieldText(final String fieldBuffer) {
        // Create the parser and visitor.
        Parser htmlParser = Parser.createParser(fieldBuffer, "UTF-8");
        TextExtractingVisitor textVisitor = new TextExtractingVisitor();
        try {
            htmlParser.visitAllNodesWith(textVisitor);
            return textVisitor.getExtractedText().trim();
        } catch (Exception e) {
            // Malformed markup: index nothing rather than abort the run.
            return "";
        }
    }

    /**
     * Writes one section into the Lucene index.  Documents with no statute
     * text or no identifier are skipped.
     *
     * @param documentID section identifier (stored, not analyzed)
     * @param uscKey LRC usckey (stored, not analyzed)
     * @param currentThrough currency date (stored, not analyzed)
     * @param itemPath hierarchical path (stored, analyzed)
     * @param head section heading (stored, analyzed, with term vectors)
     * @param text statute text (indexed only, with term vectors)
     * @throws IOException if the writer fails to add the document
     */
    private void indexDocument(String documentID, String uscKey,
            String currentThrough, String itemPath, String head, String text)
            throws IOException {
        if ((text.length() == 0) || (documentID.length() == 0)) {
            return;
        }
        // Create document.
        Document doc = new Document();
        doc.add(new Field("documentid", documentID, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("usckey", uscKey, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("currentthrough", currentThrough, Field.Store.YES,
                Field.Index.NOT_ANALYZED));
        doc.add(new Field("itempath", itemPath, Field.Store.YES,
                Field.Index.ANALYZED));
        doc.add(new Field("head", head, Field.Store.YES, Field.Index.ANALYZED,
                Field.TermVector.YES));
        doc.add(new Field("text", text, Field.Store.NO, Field.Index.ANALYZED,
                Field.TermVector.YES));
        // Write into index.
        indexWriter.addDocument(doc);
    }
}
/**
* This class is the driver that processes ZIP files to construct a CodeIndex.
*/
public class buildCodeIndex {
// CodeIndex object
private static CodeIndex codeIndex;
/**
 * Entry point: validates the command line, builds a CodeIndex in "index/",
 * and processes every ZIP file in the given directory.
 *
 * @param args args[0] is the path to the directory of LRC ZIP files
 */
public static void main(final String[] args) {
    // Check that a proper command line is passed and fail if not.
    if (args.length == 0) {
        System.err.println("Usage: buildCodeIndex <path to ZIP files>");
        System.exit(-1);
    }
    // Now check that the argument is an existing directory.
    // Fix: the original tested (!exists && isDirectory), which can never be
    // true — a nonexistent path is never a directory — so the check was dead.
    File directory = new File(args[0]);
    if (!directory.exists() || !directory.isDirectory()) {
        System.err.println("Usage: buildCodeIndex <path to ZIP files>");
        System.err.println("The specified path to ZIP files does not exist.");
        System.exit(-1);
    }
    try {
        // Create the CodeIndex object.
        codeIndex = new CodeIndex("index/");
        // Start the indexing by processing the directory.
        processDirectory(directory);
        // Shut down the index.
        codeIndex.close();
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(-1);
    }
}
/**
* This method processes the ZIP files in a given directory. These ZIP files
* should be obtained from this URL: http://uscode.house.gov/xhtml/
*
* @param directoryPath
* : path to the directory that contains the U.S. Code ZIP files.
*/
/**
 * Processes the ZIP files in a given directory (sorted by name). These ZIP
 * files should be obtained from: http://uscode.house.gov/xhtml/
 *
 * @param directory directory that contains the U.S. Code ZIP files
 */
private static void processDirectory(final File directory) {
    // Build the sorted list of files.
    // Fix: listFiles() returns null on an I/O error or non-directory path;
    // the original would then NPE inside Arrays.sort.
    File[] fileList = directory.listFiles();
    if (fileList == null) {
        System.err.println("Unable to list files in " + directory.getPath());
        return;
    }
    java.util.Arrays.sort(fileList);
    // Iterate over all files, processing each ZIP archive.
    for (File f : fileList) {
        if (f.getName().toLowerCase().endsWith(".zip")) {
            System.out.println("Processing " + f.getAbsolutePath());
            processZIP(f.getAbsolutePath());
        }
    }
}
/**
* This method process a specific ZIP file that should contain the XHTML
* files from the Law Revision Counsel at the U.S. House of Representatives.
*
* @param fileName
* : File name of the ZIP to be processed.
*/
/**
 * Processes a specific ZIP file containing the XHTML files from the Law
 * Revision Counsel at the U.S. House of Representatives: every entry is read
 * and fed to the CodeIndex parser.
 *
 * @param fileName file name of the ZIP to be processed
 */
private static void processZIP(final String fileName) {
    // Try to open the ZIP file and return if we can't.
    final ZipFile zipFile;
    try {
        zipFile = new ZipFile(fileName);
    } catch (Exception e) {
        e.printStackTrace();
        return;
    }
    try {
        // Iterate over all entries and parse the ones that read cleanly
        // (readZIPEntry returns null on failure).
        for (final Enumeration<? extends ZipEntry> entryList =
                zipFile.entries(); entryList.hasMoreElements();) {
            ZipEntry entry = entryList.nextElement();
            String entryBuffer = readZIPEntry(zipFile, entry);
            if (entryBuffer != null) {
                codeIndex.parseHTML(entryBuffer);
            }
        }
    } finally {
        // Fix: the original never closed the ZipFile, leaking one file
        // handle per archive processed.
        try {
            zipFile.close();
        } catch (IOException ignored) {
            // Best-effort close; nothing further to do.
        }
    }
}
/**
* This method reads a ZIP entry and returns a String representation of the
* buffer.
*
* @param zipFile
* : ZipFile containing the ZipEntry to be read.
* @param zipEntry
* : ZipEntry to be read.
*/
/**
 * Reads a ZIP entry and returns a String representation of its contents.
 *
 * @param zipFile ZipFile containing the ZipEntry to be read
 * @param zipEntry ZipEntry to be read
 * @return the entry contents decoded as UTF-8, or null on failure
 */
private static String readZIPEntry(final ZipFile zipFile,
        final ZipEntry zipEntry) {
    StringWriter stringBuffer = new StringWriter();
    InputStream inputStream = null;
    InputStreamReader inputStreamReader = null;
    try {
        // Create the InputStream objects.  The LRC XHTML declares UTF-8,
        // which is also what the downstream HTML parser assumes; the
        // original used the platform default charset here.
        inputStream = zipFile.getInputStream(zipEntry);
        inputStreamReader = new InputStreamReader(inputStream, "UTF-8");
        // Read buffer-sized chunks into the StringWriter.
        char[] buffer = new char[16384];
        int charsRead;
        while ((charsRead =
                inputStreamReader.read(buffer, 0, buffer.length)) != -1) {
            // Fix: the original appended the ENTIRE buffer on every read,
            // ignoring how many characters were actually filled — corrupting
            // the tail of every entry whose size isn't a multiple of 16384.
            stringBuffer.write(buffer, 0, charsRead);
        }
        return stringBuffer.toString();
    } catch (Exception e) {
        e.printStackTrace();
        return null;
    } finally {
        // Close the streams even when a read fails (original leaked them
        // on any exception).
        try {
            if (inputStreamReader != null) {
                inputStreamReader.close();
            } else if (inputStream != null) {
                inputStream.close();
            }
        } catch (IOException ignored) {
            // Best-effort close.
        }
    }
}
} Once the index is built, a simple single-term search interface (searchCodeIndex.java) allows querying. A search for "swap" across the entire Code returns top results including sections on swap dealer registration, swap recordkeeping, swap execution facilities, and swap data repositories — all highly relevant to post-Dodd-Frank compliance.
$ mvn -q exec:java -Dexec.mainClass="org.mjb.searchCodeIndex" -Dexec.args="text swap"
documentid:7 U.S.C. 6s
currentthrough:20110107
score:2.2053032
itempath:
Title 7
CHAPTER 1
>§6s. Registration and regulation of swap dealers and major swap participants
documentid:7 U.S.C. 6r
currentthrough:20110107
score:2.0396917
itempath:
Title 7
CHAPTER 1
>§6r. Reporting and recordkeeping for uncleared swaps
documentid:7 U.S.C. 7b-3
currentthrough:20110107
score:1.7781076
itempath:
Title 7
CHAPTER 1
>§7b–3. Swap execution facilities
documentid:7 U.S.C. 24a
currentthrough:20110107
score:1.6279716
itempath:
Title 7
CHAPTER 1
>§24a. Swap data repositories
documentid:15 U.S.C. 77b-1
currentthrough:20100201
score:1.5701554
itempath:
Title 15
CHAPTER 2A
>SUBCHAPTER I
>>§77b–1. Swap agreements /**
* @author Michael J Bommarito II
* @date Apr 9, 2011
* @license MIT, (C) Michael J Bommarito II 2011
*/
package org.mjb;
// Java standard library imports
import java.io.*;
// Lucene imports
import org.apache.lucene.document.Document;
import org.apache.lucene.index.*;
import org.apache.lucene.search.*;
import org.apache.lucene.store.*;
/**
 * Command-line single-term search over the CodeIndex Lucene index.
 * Usage: searchCodeIndex &lt;field&gt; &lt;term&gt; — prints the top five matches with
 * their metadata and hierarchical item path.
 */
public class searchCodeIndex {
    // Lucene index searcher, opened by loadIndex() and used in main().
    private static Searcher indexSearcher;

    public static void main(String[] args) {
        // Check if the proper command line was passed.
        if (args.length != 2) {
            System.err.println("Usage: searchCodeIndex <field> <term>");
            System.exit(-1);
        }
        // Load the index; exits the process on failure.
        loadIndex("index/");
        // Set up a single-term query against the requested field.
        Term term = new Term(args[0], args[1]);
        Query termQuery = new TermQuery(term);
        try {
            // Execute the search and iterate over the top five documents.
            TopDocs topDocs = indexSearcher.search(termQuery, 5);
            for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
                // Retrieve the matched document and show relevant details.
                Document doc = indexSearcher.doc(scoreDoc.doc);
                // documentid is stored with "_" separators, e.g. "7_6s";
                // rewrite it as a citation, e.g. "7 U.S.C. 6s".
                String documentID = doc.getField("documentid").stringValue();
                System.out.println("documentid:"
                        + documentID.replace("_", " U.S.C. "));
                System.out.println("currentthrough:"
                        + doc.getField("currentthrough").stringValue());
                System.out.println("score:" + scoreDoc.score);
                // Output the path information for the document, one
                // hierarchy level per line with increasing ">" markers.
                String[] paths = doc.getField("itempath").stringValue()
                        .split("/");
                System.out.println("itempath:");
                System.out.println("Title "
                        + Integer.valueOf(documentID.split("_")[0]));
                String tabBuffer = "";
                for (int i = 2; i < paths.length - 1; i++) {
                    System.out.println(tabBuffer + paths[i]);
                    tabBuffer += ">";
                }
                System.out.println(tabBuffer
                        + doc.getField("head").stringValue());
                System.out.println();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Fix: release the searcher's file handles; the original never
            // closed it.
            try {
                indexSearcher.close();
            } catch (Exception ignored) {
                // Best-effort close.
            }
        }
    }

    /**
     * Opens the Lucene index at the given path.
     * Fix: the original swallowed any failure here and returned, leaving
     * indexSearcher null so main() crashed with an NPE; exit cleanly instead.
     *
     * @param indexPath path to the index directory
     */
    private static void loadIndex(String indexPath) {
        try {
            Directory indexDir = FSDirectory.open(new File(indexPath));
            indexSearcher = new IndexSearcher(indexDir);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(-1);
        }
    }
}