%% This BibTeX bibliography file was created using BibDesk.
%% http://bibdesk.sourceforge.net/


%% Created for pereira at 2006-11-26 19:37:35 -0500 


%% Saved with string encoding Western (ASCII) 



%% Paper from the 3rd Conference on Email and Anti-Spam (2006). The quoted phrase in the
%% title uses TeX quote ligatures and is brace-protected so styles do not recase it.
@inproceedings{Dredze+al-06:attachment,
	Address = {Stanford, CA},
	Author = {Mark Dredze and John Blitzer and Fernando Pereira},
	Booktitle = {3rd Conference on Email and Anti-Spam},
	Date-Added = {2006-11-23 15:18:50 -0500},
	Date-Modified = {2006-11-23 15:27:56 -0500},
	Title = {{``Sorry I forgot the attachment''}: Email Attachment Prediction},
	Url = {PDF/ceas06.pdf},
	Year = {2006}}

%% Paper from the 2nd Conference on Email and Anti-Spam (2005); local PDF in Url.
@inproceedings{Dredze+al-06:reply,
  author        = {Mark Dredze and John Blitzer and Fernando Pereira},
  title         = {Reply Expectation Prediction for Email Management},
  booktitle     = {2nd Conference on Email and Anti-Spam},
  address       = {Stanford, CA},
  year          = 2005,
  url           = {PDF/ceas05.pdf},
  date-added    = {2006-11-23 15:16:15 -0500},
  date-modified = {2006-11-23 15:28:00 -0500},
}

%% NIPS 2006 paper (see Url), printed by MIT Press in 2007. That is volume 19 of the
%% proceedings: volume 17 corresponds to the December 2004 conference elsewhere in this file.
@inproceedings{BenDavid+al-06:analysis,
	Address = {Cambridge, MA},
	Author = {Shai Ben-David and John Blitzer and Koby Crammer and Fernando Pereira},
	Booktitle = {Advances in Neural Information Processing Systems 19},
	Date-Added = {2006-11-23 15:12:42 -0500},
	Date-Modified = {2006-11-23 15:27:55 -0500},
	Publisher = {MIT Press},
	Title = {Analysis of Representations for Domain Adaptation},
	Url = {PDF/nips06.pdf},
	Year = {2007}}

%% NIPS 17 paper. Month is the standard three-letter macro; the accented editor name is
%% written as a BibTeX special character so sorting and abbreviation work.
@inproceedings{Blitzer+al-04:hierarchical,
	Address = {Cambridge, MA},
	Author = {John Blitzer and Kilian Weinberger and Lawrence Saul and Fernando Pereira},
	Booktitle = {Advances in Neural Information Processing Systems 17},
	Date-Added = {2006-11-22 21:27:23 -0500},
	Date-Modified = {2006-11-23 15:28:01 -0500},
	Editor = {Lawrence K. Saul and Yair Weiss and L{\'e}on Bottou},
	Month = dec,
	Publisher = {MIT Press},
	Title = {Hierarchical Distributed Representations for Statistical Language Modeling},
	Url = {PDF/nips04.pdf},
	Year = {2004}}

%% NIPS 17 paper. Address is spelled the same way as the other MIT Press entries in this
%% file; month is the standard macro; the editor accent is a BibTeX special character.
@inproceedings{Globerson+al-2005:embedding,
	Address = {Cambridge, MA},
	Author = {Amir Globerson and Gal Chechik and Fernando Pereira and Naftali Tishby},
	Booktitle = {Advances in Neural Information Processing Systems 17},
	Date-Added = {2006-11-22 21:27:23 -0500},
	Date-Modified = {2006-11-23 15:28:01 -0500},
	Editor = {Lawrence K. Saul and Yair Weiss and L{\'e}on Bottou},
	Month = dec,
	Publisher = {MIT Press},
	Title = {Euclidean Embedding of Co-occurrence Data},
	Url = {PDF/CODE_final.pdf},
	Year = {2004}}

%% AISTATS workshop paper (January 2005). Month is the standard macro so styles can
%% abbreviate or localise it.
@inproceedings{Blitzer+al-05:latent,
	Author = {John Blitzer and Amir Globerson and Fernando Pereira},
	Booktitle = {Tenth International Workshop on Artificial Intelligence and Statistics},
	Date-Added = {2006-11-22 21:27:23 -0500},
	Date-Modified = {2006-11-23 15:28:00 -0500},
	Month = jan,
	Title = {Distributed Latent Variable Models of Lexical Co-occurrences},
	Url = {PDF/aistats.pdf},
	Year = {2005}}

%% ICML 2000 paper on maximum entropy Markov models. Page range uses the en-dash form.
@inproceedings{McCallum+al-00:memm,
	Address = {Stanford, California},
	Author = {Andrew McCallum and Dayne Freitag and Fernando Pereira},
	Booktitle = {Machine Learning: Proceedings of the Seventeenth International Conference ({ICML} 2000)},
	Date-Added = {2006-11-22 20:09:37 -0500},
	Date-Modified = {2006-11-23 15:28:02 -0500},
	Pages = {591--598},
	Title = {Maximum Entropy {Markov} Models for Information Extraction and Segmentation},
	Url = {PDF/memm-icml2000.pdf},
	Year = {2000}}

%% ICML-01 paper introducing conditional random fields. Page range uses the en-dash form.
@inproceedings{Lafferty+al-01:crf,
	Author = {John Lafferty and Andrew McCallum and Fernando Pereira},
	Booktitle = {Proceedings of {ICML-01}},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-23 15:28:02 -0500},
	Pages = {282--289},
	Title = {Conditional Random Fields: Probabilistic Models for Segmenting and Labeling Sequence Data},
	Url = {PDF/crf.pdf},
	Year = {2001}}

%% HLT-NAACL 2003 paper on shallow parsing with CRFs. Page range uses the en-dash form.
@inproceedings{Sha+Pereira-03:shallow,
	Author = {Fei Sha and Fernando Pereira},
	Booktitle = {Proceedings of {HLT-NAACL} 2003},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-23 15:28:01 -0500},
	Pages = {213--220},
	Publisher = {Association for Computational Linguistics},
	Title = {Shallow Parsing with Conditional Random Fields},
	Url = {PDF/shallow.pdf},
	Year = {2003}}

%% Bioinformatics article. Issue number 17 is taken from the reprint URL (volume/issue/page);
%% the page range uses the en-dash form and the month is the standard macro.
@article{McDonald+al-04:variation,
	Author = {Ryan T. McDonald and Winters, R. Scott and Mark Mandel and Yang Jin and Peter S. White and Fernando Pereira},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-22 20:08:59 -0500},
	Journal = {Bioinformatics},
	Month = jun,
	Number = {17},
	Pages = {3249--3251},
	Title = {An entity tagger for recognizing acquired genomic variations in cancer literature},
	Url = {http://bioinformatics.oupjournals.org/cgi/reprint/20/17/3249},
	Volume = {20},
	Year = {2004}}

%% Paper from the 20th Conference on Uncertainty in Artificial Intelligence (July 2004).
%% Month is the standard macro.
@inproceedings{McAllester+al-04:casefactor,
	Author = {David McAllester and Michael Collins and Fernando Pereira},
	Booktitle = {Proceedings of the 20th Conference on Uncertainty in Artificial Intelligence},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-23 15:28:01 -0500},
	Month = jul,
	Title = {Case-Factor Diagrams for Structured Probabilistic Modeling},
	Url = {PDF/casefactor.pdf},
	Year = {2004}}

%% BMC Bioinformatics supplement article; Pages holds the article identifier S13.
%% Month is the standard macro.
@article{Crim+al-2005:normalization,
	Author = {Jeremiah Crim and Ryan McDonald and Fernando Pereira},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-22 20:18:13 -0500},
	Journal = {{BMC} Bioinformatics},
	Month = may,
	Number = {Suppl 1},
	Pages = {S13},
	Title = {Automatically annotating documents with normalized gene lists},
	Url = {http://www.biomedcentral.com/1471-2105/6/S1/S13},
	Volume = {6},
	Year = {2005}}

%% BMC Bioinformatics supplement article; Pages holds the article identifier S6.
%% Month is the standard macro.
@article{McDonald+Pereira-05:genes,
	Author = {Ryan McDonald and Fernando Pereira},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-23 15:26:56 -0500},
	Journal = {{BMC} Bioinformatics},
	Month = may,
	Number = {Suppl 1},
	Pages = {S6},
	Title = {Identifying Gene and Protein Mentions in Text Using Conditional Random Fields},
	Url = {http://www.biomedcentral.com/1471-2105/6/S1/S6},
	Volume = {6},
	Year = {2005}}

%% UAI 2005 paper on a CRF for string edit distance. Month is the standard macro.
@inproceedings{McCallum+al-2005:stredit,
	Author = {Andrew McCallum and Kedar Bellare and Fernando Pereira},
	Booktitle = {Proceedings of the 21st Conference on Uncertainty in Artificial Intelligence ({UAI 2005})},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-23 15:27:59 -0500},
	Month = jul,
	Title = {A Conditional Random Field for Discriminatively-trained Finite-state String Edit Distance},
	Url = {PDF/crfstredit.pdf},
	Year = {2005}}

%% ACL 2005 paper on complex relation extraction. Month is the standard macro.
@inproceedings{McDonald+al-2005:relation,
	Author = {Ryan McDonald and Fernando Pereira and Seth Kulick and Scott Winters and Yang Jin and Pete White},
	Booktitle = {43rd Annual Meeting of the Association for Computational Linguistics ({ACL} 2005)},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-23 15:27:59 -0500},
	Month = jul,
	Title = {Simple Algorithms for Complex Relation Extraction with Applications to Biomedical {IE}},
	Url = {PDF/relationACL2005.pdf},
	Year = {2005}}

%% ACL 2005 paper on large-margin training of dependency parsers. Month is the standard macro.
@inproceedings{McDonald+al-2005:dependency,
	Author = {Ryan McDonald and Koby Crammer and Fernando Pereira},
	Booktitle = {43rd Annual Meeting of the Association for Computational Linguistics ({ACL} 2005)},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-23 15:27:59 -0500},
	Month = jul,
	Title = {Online Large-Margin Training of Dependency Parsers},
	Url = {PDF/dependencyACL2005.pdf},
	Year = {2005}}

%% HLT/EMNLP 2005 paper on text segmentation. Month is the standard macro.
@inproceedings{McDonald+al-05:segmentation,
	Author = {Ryan McDonald and Koby Crammer and Fernando Pereira},
	Booktitle = {Proceedings of {HLT/EMNLP} 2005},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-23 15:27:58 -0500},
	Month = oct,
	Title = {Flexible Text Segmentation with Structured Multilabel Classification},
	Url = {PDF/segmentationHLT-EMNLP2005.pdf},
	Year = {2005}}

%% HLT/EMNLP 2005 paper on non-projective dependency parsing. The accented letter in the
%% last author's surname is wrapped as a BibTeX special character so it sorts and labels
%% as a single letter; month is the standard macro.
@inproceedings{McDonald+al-05:spanning,
	Author = {Ryan McDonald and Fernando Pereira and Kiril Ribarov and Jan Haji{\v{c}}},
	Booktitle = {Proceedings of {HLT/EMNLP} 2005},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-23 15:27:59 -0500},
	Month = oct,
	Title = {Non-projective Dependency Parsing using Spanning Tree Algorithms},
	Url = {PDF/nonprojectiveHLT-EMNLP2005.pdf},
	Year = {2005}}

%% BMC Bioinformatics article imported from PubMed; the two-letter fields (Da, Dep, Jid, ...)
%% are PubMed export metadata. Percent signs in the abstract are escaped for LaTeX, the
%% title carries no trailing period (styles add their own), and Doi repeats the DOI from Aid.
@article{Jin+al-06:malignancy,
	Abstract = {ABSTRACT: BACKGROUND: The rapid proliferation of biomedical text makes it increasingly difficult for researchers to identify, synthesize, and utilize developed knowledge in their fields of interest. Automated information extraction procedures can assist in the acquisition and management of this knowledge. Previous efforts in biomedical text mining have focused primarily upon named entity recognition of well-defined molecular objects such as genes, but less work has been performed to identify disease-related objects and concepts. Furthermore, promise has been tempered by an inability to efficiently scale approaches in ways that minimize manual efforts and still perform with high accuracy. Here, we have applied a machine-learning approach previously successful for identifying molecular entities to a disease concept to determine if the underlying probabilistic model effectively generalizes to unrelated concepts with minimal manual intervention for model retraining. RESULTS: We developed a named entity recognizer (MTag), an entity tagger for recognizing clinical descriptions of malignancy presented in text. The application uses the machine-learning technique Conditional Random Fields with additional domain-specific features. MTag was tested with 1,010 training and 432 evaluation documents pertaining to cancer genomics. Overall, our experiments resulted in 0.85 precision, 0.83 recall, and 0.84 F-measure on the evaluation set. Compared with a baseline system using string matching of text with a neoplasm term list, MTag performed with a much higher recall rate (92.1\% vs. 42.1\% recall) and demonstrated the ability to learn new patterns. Application of MTag to all MEDLINE abstracts yielded the identification of 580,002 unique and 9,153,340 overall mentions of malignancy. Significantly, addition of an extensive lexicon of malignancy mentions as a feature set for extraction had minimal impact in performance. 
CONCLUSIONS: Together, these results suggest that the identification of disparate biomedical entity classes in free text may be extractable with high accuracy and only moderate additional effort for each new application domain.},
	Aid = {1471-2105-7-492 {$[$}pii{$]$}, 10.1186/1471-2105-7-492 {$[$}doi{$]$}},
	Author = {Jin, Yang and McDonald, Ryan T. and Lerman, Kevin and Mandel, Mark A. and Carroll, Steven and Liberman, Mark Y. and Pereira, Fernando C. and Winters, Raymond S. and White, Peter S.},
	Da = {20061108},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-22 20:08:59 -0500},
	Dep = {20061107},
	Doi = {10.1186/1471-2105-7-492},
	Edat = {2006/11/09 09:00},
	Issn = {1471-2105 (Electronic)},
	Jid = {100965194},
	Journal = {{BMC} Bioinformatics},
	Language = {ENG},
	Mhda = {2006/11/09 09:00},
	Number = {1},
	Own = {NLM},
	Pages = {492},
	Phst = {2006/07/24 {$[$}received{$]$}, 2006/11/07 {$[$}accepted{$]$}, 2006/11/07 {$[$}aheadofprint{$]$}},
	Pmid = {17090325},
	Pst = {aheadofprint},
	Pt = {JOURNAL ARTICLE},
	Pubm = {Print-Electronic},
	So = {BMC Bioinformatics. 2006 Nov 7;7(1):492.},
	Stat = {Publisher},
	Title = {Automated recognition of malignancy mentions in biomedical literature},
	Url = {http://www.biomedcentral.com/1471-2105/7/492},
	Volume = {7},
	Year = {2006}}

%% Human Mutation article imported from PubMed; the two-letter fields (Au, Ci, Da, Gr, ...)
%% are PubMed export metadata and are kept verbatim. In Author the second name is written
%% surname-first as Winters, R. Scott (matching the other entries in this file) so that
%% Scott is not parsed as part of the surname. The page range is given in full, LaTeX
%% specials in the abstract are escaped, and Doi repeats the DOI from Aid.
@article{McDonald+al-2006:variants,
	Abstract = {The proliferation of biomedical literature makes it increasingly difficult for researchers to find and manage relevant information. However, identifying research articles containing mutation data, a requisite first step in integrating large and complex mutation data sets, is currently tedious, time-consuming and imprecise. More effective mechanisms for identifying articles containing mutation information would be beneficial both for the curation of mutation databases and for individual researchers. We developed an automated method that uses information extraction, classifier, and relevance ranking techniques to determine the likelihood of MEDLINE abstracts containing information regarding genomic variation data suitable for inclusion in mutation databases. We targeted the CDKN2A (p16) gene and the procedure for document identification currently used by CDKN2A Database curators as a measure of feasibility. A set of abstracts was manually identified from a MEDLINE search as potentially containing specific CDKN2A mutation events. A subset of these abstracts was used as a training set for a maximum entropy classifier to identify text features distinguishing ``relevant'' from ``not relevant'' abstracts. Each document was represented as a set of indicative word, word pair, and entity tagger-derived genomic variation features. When applied to a test set of 200 candidate abstracts, the classifier predicted 88 articles as being relevant; of these, 29 of 32 manuscripts in which manual curation found CDKN2A sequence variants were positively predicted. Thus, the set of potentially useful articles that a manual curator would have to review was reduced by 56\%, maintaining 91\% recall (sensitivity) and more than doubling precision (positive predictive value). 
Subsequent expansion of the training set to 494 articles yielded similar precision and recall rates, and comparison of the original and expanded trials demonstrated that the average precision improved with the larger data set. Our results show that automated systems can effectively identify article subsets relevant to a given task and may prove to be powerful tools for the broader research community. This procedure can be readily adapted to any or all genes, organisms, or sets of documents.},
	Address = {Department of Computer and Information Science, University of Pennsylvania, Philadelphia, USA.},
	Aid = {10.1002/humu.20363 {$[$}doi{$]$}},
	Au = {McDonald R and Scott Winters R and Ankuda CK and Murphy JA and Rogers AE and Pereira F and Greenblatt MS and White PS},
	Author = {McDonald, Ryan and Winters, R. Scott and Ankuda, Claire K. and Murphy, Joan A. and Rogers, Amy E. and Pereira, Fernando and Greenblatt, Marc S. and White, Peter S.},
	Ci = {Published 2006 Wiley-Liss, Inc.},
	Da = {20060830},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-22 20:08:59 -0500},
	Doi = {10.1002/humu.20363},
	Edat = {2006/07/26 09:00},
	Gr = {CA 96536/CA/NCI},
	Issn = {1098-1004 (Electronic)},
	Jid = {9215429},
	Journal = {Human Mutation},
	Jt = {Human mutation.},
	Language = {eng},
	Mhda = {2006/07/26 09:00},
	Number = {9},
	Own = {NLM},
	Pages = {957--964},
	Pl = {United States},
	Pmid = {16865690},
	Pst = {ppublish},
	Pt = {Journal Article},
	Pubm = {Print},
	Sb = {IM},
	So = {Hum Mutat. 2006 Sep;27(9):957-64.},
	Stat = {In-Process},
	Title = {An automated procedure to identify biomedical articles that contain cancer-associated gene variants},
	Url = {http://www3.interscience.wiley.com/cgi-bin/abstract/112729153/ABSTRACT},
	Volume = {27},
	Year = {2006}}

%% CoNLL-X (2006) paper on context pattern induction. The venue acronym is brace-protected,
%% matching the other CoNLL-X entry in this file.
@inproceedings{Talukdar+al-2006:pattern,
	Author = {Partha Pratim Talukdar and Thorsten Brants and Mark Liberman and Fernando Pereira},
	Booktitle = {Tenth Conference on Computational Natural Language Learning ({CoNLL-X})},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-23 15:27:56 -0500},
	Title = {A Context Pattern Induction Method for Named Entity Extraction},
	Url = {PDF/cpi_conll2006_camera.pdf},
	Year = {2006}}

%% CoNLL-X (2006) paper on multilingual dependency parsing; local PDF in Url.
@inproceedings{McDonald+al-2006:multilingual,
  author        = {Ryan McDonald and Kevin Lerman and Fernando Pereira},
  title         = {Multilingual Dependency Parsing with a Two-Stage Discriminative Parser},
  booktitle     = {Tenth Conference on Computational Natural Language Learning ({CoNLL-X})},
  year          = 2006,
  url           = {PDF/conll2006.pdf},
  date-added    = {2006-11-22 20:08:59 -0500},
  date-modified = {2006-11-23 15:27:57 -0500},
}

%% EMNLP 2006 paper on structural correspondence learning. Page range uses the en-dash form.
@inproceedings{Blitzer+al-2006:scl,
	Author = {John Blitzer and Ryan McDonald and Fernando Pereira},
	Booktitle = {{EMNLP} 2006: 2006 Conference on Empirical Methods in Natural Language Processing},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-23 15:27:57 -0500},
	Organization = {Association for Computational Linguistics},
	Pages = {120--128},
	Title = {Domain Adaptation with Structural Correspondence Learning},
	Url = {PDF/transferEMNLP06.pdf},
	Year = {2006}}

%% EACL 2006 paper on approximate dependency parsing. Page range uses the en-dash form.
@inproceedings{McDonald+Pereira-2006:approx,
	Author = {Ryan McDonald and Fernando Pereira},
	Booktitle = {11th Conference of the European Chapter of the Association for Computational Linguistics: {EACL} 2006},
	Date-Added = {2006-11-22 20:08:59 -0500},
	Date-Modified = {2006-11-23 15:27:58 -0500},
	Organization = {Association for Computational Linguistics},
	Pages = {81--88},
	Title = {Online Learning of Approximate Dependency Parsing Algorithms},
	Url = {PDF/approxDepEACL2006.pdf},
	Year = {2006}}

%% BibDesk static groups, stored as an XML property list inside a BibTeX comment entry.
%% Each dict pairs a group name (BioTagger, CALO, MSTParser) with a comma-separated list
%% of citation keys from this file. If a citation key is renamed, update it here as well.
@comment{BibDesk Static Groups{
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE plist PUBLIC "-//Apple Computer//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
<plist version="1.0">
<array>
	<dict>
		<key>group name</key>
		<string>BioTagger</string>
		<key>keys</key>
		<string>McDonald+al-04:variation,Jin+al-06:malignancy,McDonald+Pereira-05:genes</string>
	</dict>
	<dict>
		<key>group name</key>
		<string>CALO</string>
		<key>keys</key>
		<string>Dredze+al-06:attachment,BenDavid+al-06:analysis,Blitzer+al-2006:scl,Blitzer+al-05:latent,Dredze+al-06:reply</string>
	</dict>
	<dict>
		<key>group name</key>
		<string>MSTParser</string>
		<key>keys</key>
		<string>McDonald+al-2006:multilingual,McDonald+Pereira-2006:approx,McDonald+al-05:spanning,McDonald+al-2005:dependency</string>
	</dict>
</array>
</plist>
}}
