(module gene-ontology mzscheme
  ;; This code shows how one might parse a large xml file progressively
  ;; by taking advantage of the lazy parsing in the sxml module.
  ;;
  ;; Input: an RDF file from the Gene Ontology (http://geneontology.org) that
  ;; conforms to the DTD found at: http://www.geneontology.org/dtd/go.dtd.
  ;;
  ;; Output: a sample parsing of all the terms in the RDF that
  ;; shows accession, name, and definition.
  ;;
  ;; I believe one can always download the latest copy of the Gene Ontology
  ;; by grabbing:
  ;;
  ;;     http://archive.godatabase.org/latest-termdb/go_daily-termdb.rdf-xml.gz

  (require (planet "sxml.ss" ("lizorkin" "sxml.plt" 1 4))
           (lib "list.ss")
           (lib "file.ss")
           (lib "pretty.ss"))

  (provide (all-defined))


  ;; test: -> void
  ;; Opens the file named by the first command-line argument and folds over
  ;; every term element found in it.  The per-term action is currently a
  ;; no-op: the forms that would convert each element to a Term and
  ;; pretty-print it are disabled with #; -- remove the #; markers to see
  ;; the parsed terms.
  ;; Raises an error if no command-line argument was supplied (vector-ref
  ;; on an empty argument vector).
  (define (test)
    (call-with-input-file* (vector-ref (current-command-line-arguments) 0)
      (lambda (ip)
        (fold-term-elts (lambda (term-elt acc)
                          (void)
                          #;(pretty-print (term-elt->Term term-elt))
                          #;(newline))
                        (void)
                        ip))))


  ;; We'll say that a Term is a
  ;;
  ;;     (make-Term i n d)
  ;;
  ;; where i, n are strings.  d is either a string or void.  This is a
  ;; simplification, since the real Gene Ontology provides a lot of
  ;; other interesting attributes (including hierarchical data).
  ;;
  ;; The trailing #f is the inspector argument to define-struct.
  (define-struct Term (id name definition) #f)


  ;; fold-lazy-sxpath-list: (X Y -> Y) Y (lazy-sxpath-listof X) -> Y
  ;; Given the particular kind of lazy-list produced by an sxml lazy:sxpath
  ;; query, does a fold across its structure: f is applied to each element
  ;; and the accumulator so far, starting from acc.
  ;;
  ;; NOTE(review): this assumes the lazy list is either empty or has its
  ;; current element in the first position and a promise for the rest of
  ;; the list in the second position -- confirm against the result shape
  ;; of lazy:sxpath.
  (define (fold-lazy-sxpath-list f acc lazy-list)
    (cond
      [(empty? lazy-list) acc]
      [else
       (fold-lazy-sxpath-list f
                              (f (first lazy-list) acc)
                              ;; Forcing the promise yields the next chunk.
                              (force (second lazy-list)))]))


  ;; make-namespaced-tag: string string -> symbol
  ;; Builds up a namespaced tag from the namespace ns and the suffix,
  ;; joined by a colon.
  (define (make-namespaced-tag ns suffix)
    (string->symbol (string-append ns ":" suffix)))


  ;; go-tag: string -> symbol
  ;; Builds a tag symbol in the Gene Ontology namespace.
  (define (go-tag suffix)
    (make-namespaced-tag "http://www.geneontology.org/dtds/go.dtd#" suffix))


  ;; rdf-tag: string -> symbol
  ;; Builds a tag symbol in the RDF syntax namespace.
  (define (rdf-tag suffix)
    (make-namespaced-tag "http://www.w3.org/1999/02/22-rdf-syntax-ns#" suffix))


  ;; term-elt->Term: sxml-fragment -> Term
  ;; Given an sxml fragment element elt, extracts a Term from the text of
  ;; its accession, name, and definition children.
  ;;
  ;; The accession and name are taken with `first', so an element lacking
  ;; either one raises an error; a missing definition becomes void.
  ;;
  ;; The three sxpath queries do not depend on elt, so they are built once
  ;; here (when the module body runs) and shared by every call, instead of
  ;; being rebuilt for each term during a fold over a large file.
  (define term-elt->Term
    (let ([name-query (sxpath (list (go-tag "name") "text()"))]
          [defn-query (sxpath (list (go-tag "definition") "text()"))]
          [id-query (sxpath (list (go-tag "accession") "text()"))]
          [first-or-void
           (lambda (a-list)
             (cond
               [(empty? a-list) (void)]
               [else (first a-list)]))])
      (lambda (elt)
        (make-Term (first (id-query elt))
                   (first (name-query elt))
                   (first-or-void (defn-query elt))))))


  ;; fold-term-elts: (elt Y -> Y) Y input-port -> Y
  ;; Given an input port ip whose contents conform to the gene ontology
  ;; RDF, folds f across every term element we can find in ip, using acc
  ;; as the initial accumulator.  Term elements are located along the
  ;; path go / RDF / term.
  (define (fold-term-elts f acc ip)
    (let ([doc (lazy:xml->sxml ip '())]
          [query (lazy:sxpath (list (go-tag "go")
                                    (rdf-tag "RDF")
                                    (go-tag "term")))])
      (fold-lazy-sxpath-list f acc (query doc))))


  ;; The call that would fire the test code up is disabled with #; --
  ;; remove the marker to run (test) when the module is instantiated.
  #;(test))