Difference between revisions of "TreeBASE Data"
(→Dump Contents) |
(→Example Queries) |
||
Line 31: | Line 31: | ||
==Example Queries== | ==Example Queries== | ||
+ | |||
+ | |||
+ | <source lang="sql"> | ||
+ | |||
+ | -- queries for a list of trees that contain any kind of bird | ||
+ | |||
+ | SELECT DISTINCT ON (tbt.tree_id) tbt.tree_label, tbt.tree_title | ||
+ | FROM trees tbt JOIN nodes tbn ON (tbt.tree_id = tbn.tree_id) | ||
+ | JOIN tb_labels tbl ON (tbn.taxon_id = tbl.tb_labels_id) | ||
+ | JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) | ||
+ | JOIN taxa txa ON (txv.taxon_id = txa.taxon_id) | ||
+ | JOIN ncbi_names nna ON (txa.taxid = nna.tax_id) | ||
+ | JOIN ncbi_nodes nno ON (nna.tax_id = nno.tax_id), | ||
+ | ncbi_names hna NATURAL JOIN ncbi_nodes hno | ||
+ | WHERE nno.left_id >= hno.left_id | ||
+ | AND nna.name_class = 'scientific name' | ||
+ | AND nno.left_id < hno.right_id | ||
+ | AND hna.name_txt = 'Aves'; | ||
+ | |||
+ | -- get top ten list of trees with birds in them | ||
+ | -- each listed by how many birds in each tree | ||
+ | |||
+ | SELECT tbt.tree_id AS "TreeBASE Tree ID", MAX(tbt.tree_label) AS "Tree Label", MAX(tbt.tree_title) AS "Tree Title", COUNT(tbn.tree_id) AS "Number of Birds In Tree" | ||
+ | FROM trees tbt JOIN nodes tbn ON (tbt.tree_id = tbn.tree_id) | ||
+ | JOIN tb_labels tbl ON (tbn.taxon_id = tbl.tb_labels_id) | ||
+ | JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) | ||
+ | JOIN taxa txa ON (txv.taxon_id = txa.taxon_id) | ||
+ | JOIN ncbi_names nna ON (txa.taxid = nna.tax_id) | ||
+ | JOIN ncbi_nodes nno ON (nna.tax_id = nno.tax_id), | ||
+ | ncbi_names hna NATURAL JOIN ncbi_nodes hno | ||
+ | WHERE nno.left_id >= hno.left_id | ||
+ | AND nna.name_class = 'scientific name' | ||
+ | AND nno.left_id < hno.right_id | ||
+ | AND hna.name_txt = 'Aves' | ||
+ | GROUP BY tbt.tree_id | ||
+ | ORDER BY COUNT(tbn.tree_id) DESC | ||
+ | LIMIT 10; | ||
+ | |||
+ | -- list the trees that contain any kind of bird, | ||
+ | -- any kind of crocodyle, but not any kind of mammal | ||
+ | |||
+ | SELECT t.tree_id, t.tree_label, t.tree_title | ||
+ | FROM trees t | ||
+ | WHERE EXISTS ( | ||
+ | SELECT 1 | ||
+ | FROM nodes tbn JOIN tb_labels tbl ON (tbn.taxon_id = tbl.tb_labels_id) | ||
+ | JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) | ||
+ | JOIN taxa txa ON (txv.taxon_id = txa.taxon_id) | ||
+ | JOIN ncbi_names nna ON (txa.taxid = nna.tax_id) | ||
+ | JOIN ncbi_nodes nno ON (nna.tax_id = nno.tax_id) | ||
+ | JOIN ncbi_nodes inco ON (nno.left_id BETWEEN inco.left_id AND inco.right_id), | ||
+ | ncbi_names inca | ||
+ | WHERE inco.tax_id = inca.tax_id | ||
+ | AND inca.name_class = 'scientific name' | ||
+ | AND inca.name_txt = 'Aves' | ||
+ | AND tbn.tree_id = t.tree_id | ||
+ | ) | ||
+ | AND EXISTS ( | ||
+ | SELECT 1 | ||
+ | FROM nodes tbn JOIN tb_labels tbl ON (tbn.taxon_id = tbl.tb_labels_id) | ||
+ | JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) | ||
+ | JOIN taxa txa ON (txv.taxon_id = txa.taxon_id) | ||
+ | JOIN ncbi_names nna ON (txa.taxid = nna.tax_id) | ||
+ | JOIN ncbi_nodes nno ON (nna.tax_id = nno.tax_id) | ||
+ | JOIN ncbi_nodes inco ON (nno.left_id BETWEEN inco.left_id AND inco.right_id), | ||
+ | ncbi_names inca | ||
+ | WHERE inco.tax_id = inca.tax_id | ||
+ | AND inca.name_class = 'scientific name' | ||
+ | AND inca.name_txt = 'Crocodylidae' | ||
+ | AND tbn.tree_id = t.tree_id | ||
+ | ) | ||
+ | AND NOT EXISTS ( | ||
+ | SELECT 1 | ||
+ | FROM nodes tbn JOIN tb_labels tbl ON (tbn.taxon_id = tbl.tb_labels_id) | ||
+ | JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) | ||
+ | JOIN taxa txa ON (txv.taxon_id = txa.taxon_id) | ||
+ | JOIN ncbi_names nna ON (txa.taxid = nna.tax_id) | ||
+ | JOIN ncbi_nodes nno ON (nna.tax_id = nno.tax_id) | ||
+ | JOIN ncbi_nodes inco ON (nno.left_id BETWEEN inco.left_id AND inco.right_id), | ||
+ | ncbi_names inca | ||
+ | WHERE inco.tax_id = inca.tax_id | ||
+ | AND inca.name_class = 'scientific name' | ||
+ | AND inca.name_txt = 'Mammalia' | ||
+ | AND tbn.tree_id = t.tree_id | ||
+ | ) | ||
+ | ; | ||
+ | |||
+ | |||
+ | -- here is an example of a topological query that takes advantage of the transitive closure | ||
+ | -- (in the node_path table) for all the trees in the database. In this case we are asking | ||
+ | -- for all trees that match the pattern ((Hippopotamus_amphibius, Bos_taurus), Sus_scrofa). | ||
+ | -- Meaning that the hippo and the cow are more closely related than either is to the pig | ||
+ | |||
+ | SELECT t.tree_id, t.tree_label, t.tree_title | ||
+ | FROM trees t | ||
+ | INNER JOIN | ||
+ | (SELECT DISTINCT ON (inN.tree_id) inP.parent_node_id, inN.tree_id | ||
+ | FROM nodes inN JOIN tb_labels tbl ON (inN.taxon_id = tbl.tb_labels_id) | ||
+ | JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) | ||
+ | JOIN taxa txa ON (txv.taxon_id = txa.taxon_id), node_path inP | ||
+ | WHERE txa.namestring IN ('Hippopotamus amphibius','Bos taurus') | ||
+ | AND inP.child_node_id = inN.node_id | ||
+ | GROUP BY inN.tree_id, inP.parent_node_id | ||
+ | HAVING COUNT(inP.child_node_id) = 2 | ||
+ | ORDER BY inN.tree_id, inP.parent_node_id DESC) AS lca | ||
+ | USING (tree_id) | ||
+ | WHERE NOT EXISTS ( | ||
+ | -- make sure that inP.parent_node_id is not a parent to Sus scrofa | ||
+ | SELECT 1 | ||
+ | FROM nodes outN JOIN tb_labels tbl ON (outN.taxon_id = tbl.tb_labels_id) | ||
+ | JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) | ||
+ | JOIN taxa txa ON (txv.taxon_id = txa.taxon_id), node_path outP | ||
+ | WHERE txa.namestring IN ('Sus scrofa') | ||
+ | AND outP.child_node_id = outN.node_id | ||
+ | AND outP.parent_node_id = lca.parent_node_id) | ||
+ | AND EXISTS ( | ||
+ | -- but still make sure that the tree contains Sus scrofa | ||
+ | SELECT c.tree_id | ||
+ | FROM trees c, nodes q JOIN tb_labels tbl ON (q.taxon_id = tbl.tb_labels_id) | ||
+ | JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) | ||
+ | JOIN taxa txa ON (txv.taxon_id = txa.taxon_id) | ||
+ | WHERE txa.namestring IN ('Sus scrofa') | ||
+ | AND q.tree_id = c.tree_id | ||
+ | AND c.tree_id = t.tree_id | ||
+ | GROUP BY c.tree_id | ||
+ | HAVING COUNT(c.tree_id) = 1); | ||
+ | |||
+ | |||
+ | </source> |
Revision as of 11:20, 10 March 2009
TreeBASE Dump
A Postgres dump for TreeBASE can be obtained [here].
Dump Contents
List of relations Schema | Name | Type | Owner --------+----------------------+----------+------- public | edges | table | piel public | ncbi_names | table | piel public | ncbi_nodes | table | piel public | node_path | table | piel public | nodes | table | piel public | nodes_node_id | sequence | piel public | study | table | piel public | study_id_seq | sequence | piel public | taxa | table | piel public | taxon_id_seq | sequence | piel public | taxon_variant_id_seq | sequence | piel public | taxon_variants | table | piel public | tb_labels | table | piel public | tb_labels_id | sequence | piel public | trees | table | piel public | trees_tree_id | sequence | piel (16 rows)
For each "study" record, there are many "trees" records. Each "trees" record has many "nodes" records, which are wired to each other via the "edges" table and a transitive closure index is in the "node_path" table. Each "tb_labels" record can point to many "nodes" records -- "tb_labels" is a table of unique taxon labels that appear in all trees. Each taxon_variant record maps to zero or more tb_labels; each "taxa" record maps to one or more taxon_variant records. Each "taxa" record represents a single, normalized taxon, usually a species, but could be a subspecies or a higher taxon. Wherever possible, each "taxa" record has an ncbi_taxid -- that is, the IDs used by ncbi in their Genbank distribution. Consequently, these taxids connect the "ncbi_names" table, which in turn uses the "ncbi_nodes" table as a hierarchical classification. This classification has been pre-indexed with left and right IDs so that hierarchical searching is possible.
Example Queries
<source lang="sql">
-- queries for a list of trees that contain any kind of bird
SELECT DISTINCT ON (tbt.tree_id) tbt.tree_label, tbt.tree_title FROM trees tbt JOIN nodes tbn ON (tbt.tree_id = tbn.tree_id) JOIN tb_labels tbl ON (tbn.taxon_id = tbl.tb_labels_id) JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) JOIN taxa txa ON (txv.taxon_id = txa.taxon_id) JOIN ncbi_names nna ON (txa.taxid = nna.tax_id) JOIN ncbi_nodes nno ON (nna.tax_id = nno.tax_id), ncbi_names hna NATURAL JOIN ncbi_nodes hno WHERE nno.left_id >= hno.left_id AND nna.name_class = 'scientific name' AND nno.left_id < hno.right_id AND hna.name_txt = 'Aves';
-- get top ten list of trees with birds in them -- each listed by how many birds in each tree
SELECT tbt.tree_id AS "TreeBASE Tree ID", MAX(tbt.tree_label) AS "Tree Label", MAX(tbt.tree_title) AS "Tree Title", COUNT(tbn.tree_id) AS "Number of Birds In Tree" FROM trees tbt JOIN nodes tbn ON (tbt.tree_id = tbn.tree_id) JOIN tb_labels tbl ON (tbn.taxon_id = tbl.tb_labels_id) JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) JOIN taxa txa ON (txv.taxon_id = txa.taxon_id) JOIN ncbi_names nna ON (txa.taxid = nna.tax_id) JOIN ncbi_nodes nno ON (nna.tax_id = nno.tax_id), ncbi_names hna NATURAL JOIN ncbi_nodes hno WHERE nno.left_id >= hno.left_id AND nna.name_class = 'scientific name' AND nno.left_id < hno.right_id AND hna.name_txt = 'Aves' GROUP BY tbt.tree_id ORDER BY COUNT(tbn.tree_id) DESC LIMIT 10;
-- list the trees that contain any kind of bird, -- any kind of crocodyle, but not any kind of mammal
SELECT t.tree_id, t.tree_label, t.tree_title FROM trees t WHERE EXISTS (
SELECT 1 FROM nodes tbn JOIN tb_labels tbl ON (tbn.taxon_id = tbl.tb_labels_id) JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) JOIN taxa txa ON (txv.taxon_id = txa.taxon_id) JOIN ncbi_names nna ON (txa.taxid = nna.tax_id) JOIN ncbi_nodes nno ON (nna.tax_id = nno.tax_id) JOIN ncbi_nodes inco ON (nno.left_id BETWEEN inco.left_id AND inco.right_id), ncbi_names inca WHERE inco.tax_id = inca.tax_id AND inca.name_class = 'scientific name' AND inca.name_txt = 'Aves' AND tbn.tree_id = t.tree_id )
AND EXISTS (
SELECT 1 FROM nodes tbn JOIN tb_labels tbl ON (tbn.taxon_id = tbl.tb_labels_id) JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) JOIN taxa txa ON (txv.taxon_id = txa.taxon_id) JOIN ncbi_names nna ON (txa.taxid = nna.tax_id) JOIN ncbi_nodes nno ON (nna.tax_id = nno.tax_id) JOIN ncbi_nodes inco ON (nno.left_id BETWEEN inco.left_id AND inco.right_id), ncbi_names inca WHERE inco.tax_id = inca.tax_id AND inca.name_class = 'scientific name' AND inca.name_txt = 'Crocodylidae' AND tbn.tree_id = t.tree_id )
AND NOT EXISTS (
SELECT 1 FROM nodes tbn JOIN tb_labels tbl ON (tbn.taxon_id = tbl.tb_labels_id) JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) JOIN taxa txa ON (txv.taxon_id = txa.taxon_id) JOIN ncbi_names nna ON (txa.taxid = nna.tax_id) JOIN ncbi_nodes nno ON (nna.tax_id = nno.tax_id) JOIN ncbi_nodes inco ON (nno.left_id BETWEEN inco.left_id AND inco.right_id), ncbi_names inca WHERE inco.tax_id = inca.tax_id AND inca.name_class = 'scientific name' AND inca.name_txt = 'Mammalia' AND tbn.tree_id = t.tree_id )
-- here is an example of a topological query that takes advantage of the transitive closure
-- (in the node_path table) for all the trees in the database. In this case we are asking
-- for all trees that match the pattern ((Hippopotamus_amphibius, Bos_taurus), Sus_scrofa).
-- Meaning that the hippo and the cow are more closely related than either is to the pig
SELECT t.tree_id, t.tree_label, t.tree_title
FROM trees t INNER JOIN (SELECT DISTINCT ON (inN.tree_id) inP.parent_node_id, inN.tree_id FROM nodes inN JOIN tb_labels tbl ON (inN.taxon_id = tbl.tb_labels_id) JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) JOIN taxa txa ON (txv.taxon_id = txa.taxon_id), node_path inP WHERE txa.namestring IN ('Hippopotamus amphibius','Bos taurus') AND inP.child_node_id = inN.node_id GROUP BY inN.tree_id, inP.parent_node_id HAVING COUNT(inP.child_node_id) = 2 ORDER BY inN.tree_id, inP.parent_node_id DESC) AS lca USING (tree_id) WHERE NOT EXISTS ( -- make sure that inP.parent_node_id is not a parent to Sus scrofa SELECT 1 FROM nodes outN JOIN tb_labels tbl ON (outN.taxon_id = tbl.tb_labels_id) JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) JOIN taxa txa ON (txv.taxon_id = txa.taxon_id), node_path outP WHERE txa.namestring IN ('Sus scrofa') AND outP.child_node_id = outN.node_id AND outP.parent_node_id = lca.parent_node_id) AND EXISTS ( -- but still make sure that the tree contains Sus scrofa SELECT c.tree_id FROM trees c, nodes q JOIN tb_labels tbl ON (q.taxon_id = tbl.tb_labels_id) JOIN taxon_variants txv ON (tbl.taxon_variant_id = txv.taxon_variant_id) JOIN taxa txa ON (txv.taxon_id = txa.taxon_id) WHERE txa.namestring IN ('Sus scrofa') AND q.tree_id = c.tree_id AND c.tree_id = t.tree_id GROUP BY c.tree_id HAVING COUNT(c.tree_id) = 1);
</source>