Skip to content

Commit

Permalink
Data Ingestion Tool v1.0
Browse files Browse the repository at this point in the history
  • Loading branch information
joel-a committed Jun 16, 2014
1 parent 695bb45 commit 13f7c6d
Show file tree
Hide file tree
Showing 101 changed files with 4,870 additions and 1 deletion.
10 changes: 10 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Ignore rules for generated and scratch files.
# Backup files ending in "~" (presumably editor backups).
*~
# Log output. NOTE(review): log_*.txt is also matched by the broader *.txt rule below.
log_*.txt
# dist/ and build/ are the output directories written by build.xml.
dist/
build/
logs/
work/
*.log
*.graphml
# NOTE(review): this ignores every .txt file, so text data files must be force-added to be tracked.
*.txt
# Includes the vizlinc-ingester.zip archive produced by the build's zip target.
*.zip
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
coref
=====

coref
Process a graph containing named entities and documents to group like-named entities and find coreferences within
and across documents.

Mostly Blueprints/Gremlin code written in Groovy, plus a few scripts.
167 changes: 167 additions & 0 deletions build.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
<project name="vizlinc-ingester" default="all" basedir=".">

<description>
Build Groovy vizlinc-ingester jar
</description>

<!-- Windows-only settings: every condition below fires only for the "windows" OS family. -->
<!-- Account name spliced into the Windows paths; "yourid" looks like a placeholder to replace (TODO confirm). -->
<property name="username" value="yourid"/>

<!-- Target directory of the vizlincdb NetBeans project. -->
<condition property="vizlincdb.dir.string"
           value="C:/Users/${username}/Documents/NetBeansProjects/vizlincdb/target">
  <os family="windows"/>
</condition>

<!-- Groovy 1.8.9 installation directory. -->
<condition property="groovy.dir.string"
           value="C:/Users/${username}/Desktop/Java/groovy-1.8.9">
  <os family="windows"/>
</condition>

<!-- gremlin-groovy 2.4.0 installation directory. -->
<condition property="gremlin-groovy.dir.string"
           value="C:/Users/${username}/Desktop/Java/gremlin-groovy-2.4.0">
  <os family="windows"/>
</condition>

<!-- Flag property checked by the launch4j target; it is set only on Windows. -->
<condition property="launch4j.available">
  <os family="windows"/>
</condition>

<!-- Launch4j installation directory. -->
<condition property="launch4j.dir.string"
           value="C:/Program Files (x86)/Launch4j">
  <os family="windows"/>
</condition>

<!-- Unix-only settings: every condition below fires only for the "unix" OS family. -->
<!-- Account name spliced into the /home paths; "yourid" looks like a placeholder to replace (TODO confirm). -->
<property name="userid" value="yourid"/>

<!-- Target directory of the vizlincdb NetBeans project. -->
<condition property="vizlincdb.dir.string"
           value="/home/${userid}/NetBeansProjects/vizlincdb/target">
  <os family="unix"/>
</condition>

<!-- Groovy 1.8.9 installation directory. -->
<condition property="groovy.dir.string"
           value="/home/${userid}/code/groovy-1.8.9">
  <os family="unix"/>
</condition>

<!-- gremlin-groovy 2.4.0 installation directory. -->
<condition property="gremlin-groovy.dir.string"
           value="/home/${userid}/code/gremlin-groovy-2.4.0">
  <os family="unix"/>
</condition>

<!-- Launch4j is not used on Unix; a dummy path is still assigned so Ant does not complain
     when launch4j.dir is derived from this property. -->
<condition property="launch4j.dir.string"
           value="/not/available">
  <os family="unix"/>
</condition>


<!-- Directory properties shared by the targets in this build file. -->

<!-- Inputs that live inside this project. -->
<property name="src" location="src"/>
<property name="lib" location="lib"/>
<property name="data" location="data"/>

<!-- Output directories: compiled classes and the assembled distribution. -->
<property name="build" location="build"/>
<property name="dist" location="dist"/>

<!-- External installations, taken from the OS-specific *.dir.string values. -->
<property name="vizlincdb.dir" location="${vizlincdb.dir.string}"/>
<property name="groovy.dir" location="${groovy.dir.string}"/>
<property name="gremlin-groovy.dir" location="${gremlin-groovy.dir.string}"/>
<property name="launch4j.dir" location="${launch4j.dir.string}"/>


<!-- Groovy runtime jars. The embeddable (groovy-all) jar is used to avoid version
     conflicts with jars required by other ingester pieces. -->
<fileset id="groovy-all.jars" dir="${groovy.dir}/embeddable" includes="*.jar"/>

<!-- Gremlin jars, leaving out the groovy jars found in the Gremlin lib directory. -->
<fileset id="gremlin-groovy.jars"
         dir="${gremlin-groovy.dir}/lib"
         includes="*.jar"
         excludes="groovy*.jar"/>

<!-- Third-party jars kept in this project's lib directory. -->
<fileset id="ingester.jars" dir="${lib}" includes="*.jar"/>

<!-- vizlincdb: the main jar (e.g. vizlincdb-x.0.SNAPSHOT.jar) plus the jars it
     depends on, which sit in the lib subdirectory. -->
<fileset id="vizlincdb.jars" dir="${vizlincdb.dir}" includes="*.jar,lib/*.jar"/>

<!-- Every jar needed to compile and to ship the ingester. -->
<union id="all-needed-jars">
  <resources refid="groovy-all.jars"/>
  <resources refid="gremlin-groovy.jars"/>
  <resources refid="ingester.jars"/>
  <resources refid="vizlincdb.jars"/>
</union>

<!-- Classpath used to load the groovyc task: the Groovy sources, the compiled
     output directory, and all of the dependency jars. -->
<path id="classpath.groovyc">
  <pathelement path="${src}"/>
  <pathelement path="${build}"/>
  <resources refid="all-needed-jars"/>
</path>

<!-- Register the Groovy compiler as the Ant task named "groovyc". -->
<taskdef name="groovyc" classname="org.codehaus.groovy.ant.Groovyc" classpathref="classpath.groovyc"/>


<!-- Prepare the output directory that the compile target writes into. -->
<target name="init">
  <mkdir dir="${build}"/>
</target>


<!-- Compile the Groovy sources under ${src} into ${build}; runs init first. -->
<target name="compile"
        depends="init"
        description="compile the source ">
  <!-- To print the classpath while debugging, enable these two lines:
  <property name="classpath.groovyc" refid="classpath.groovyc"/>
  <echo message="***classpath.groovyc=${classpath.groovyc}"/>
  -->
  <groovyc srcdir="${src}" destdir="${build}"/>
</target>


<!-- Assemble the distribution under ${dist}: the ingester jar and all dependency
     jars go into lib, and the data files go into data. -->
<target name="dist"
        depends="compile"
        description="generate the distribution">
  <!-- Layout of the distribution directory. -->
  <mkdir dir="${dist}/lib"/>
  <mkdir dir="${dist}/data"/>

  <!-- Package the whole contents of ${build} as vizlinc-ingester.jar. -->
  <jar jarfile="${dist}/lib/vizlinc-ingester.jar" basedir="${build}"/>

  <!-- Place every dependency jar directly in lib; flatten drops the source subdirectories. -->
  <copy todir="${dist}/lib" flatten="true">
    <resources refid="all-needed-jars"/>
  </copy>

  <!-- Ship the data directory alongside the jars. -->
  <copy todir="${dist}/data">
    <fileset dir="${data}"/>
  </copy>
</target>

<!-- Run Launch4j with the vizlinc-ingester-launch4j.xml configuration. The target is
     skipped unless the launch4j.available property is set. -->
<target name="launch4j" depends="dist" if="launch4j.available">
  <!-- The Launch4j Ant task is loaded from the Launch4j installation directory. -->
  <taskdef name="launch4j"
           classname="net.sf.launch4j.ant.Launch4jTask"
           classpath="${launch4j.dir}/launch4j.jar:${launch4j.dir}/lib/xstream.jar"/>
  <launch4j configFile="vizlinc-ingester-launch4j.xml"/>
</target>

<!-- Aggregate target with no steps of its own: it only triggers dist and launch4j. -->
<target name="all" depends="dist,launch4j"/>

<!-- Package the distribution directory as vizlinc-ingester.zip, with every entry
     placed under a top-level vizlinc-ingester/ folder inside the archive. -->
<target name="zip" depends="all">
  <zip destfile="vizlinc-ingester.zip">
    <!-- Use the ${dist} property, as the dist and clean targets do, so the archive
         always contains the directory the build actually wrote to. -->
    <zipfileset prefix="vizlinc-ingester" dir="${dist}"/>
  </zip>
</target>

<!-- Remove the generated output: the ${build} and ${dist} directory trees. -->
<target name="clean" description="clean up">
  <delete dir="${build}"/>
  <delete dir="${dist}"/>
</target>

</project>
Binary file added data/ner-model.ser.gz
Binary file not shown.
Binary file added data/splash-ingester.bmp
Binary file not shown.
Binary file added documentation/Building the VizLinc Ingester.docx
Binary file not shown.
Binary file added documentation/Building the VizLinc Ingester.pdf
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file added lib/commons-io-2.4.jar
Binary file not shown.
Binary file added lib/postgresql-9.2-1002.jdbc4.jar
Binary file not shown.
Binary file added lib/secondstring-20120620.jar
Binary file not shown.
Binary file added lib/stanford-ner-2014-01-04.jar
Binary file not shown.
Binary file added lib/stax2-api-3.1.1.jar
Binary file not shown.
Binary file added lib/super-csv-2.1.0.jar
Binary file not shown.
Binary file added lib/tika-app-1.4.jar
Binary file not shown.
Binary file added lib/woodstox-core-asl-4.2.0.jar
Binary file not shown.
32 changes: 32 additions & 0 deletions nyt-2007_small/1815916.xml.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
Jeb Bush Ponders Future, Not Knowing What It Holds

Jeb Bush Ponders Future, Not Knowing What It Holds

When the same old irksome question popped up recently at one of his final public events here, Gov. Jeb Bush, addressing Spanish-speaking reporters, gave an atypically dramatic answer: ''Yo no tengo futuro,'' or ''I have no future.''
His words set off round-the-world buzz, with The Daily Telegraph of London going so far as to call them ''a recognition by the Bush family that their dynastic reign in American politics is drawing to a close.''
But in fact, the question lives on. Mr. Bush's spokeswoman said last week that he made the comment jokingly, and when asked about it later in an e-mail message, Mr. Bush himself replied, ''I was misunderstood by a reporter.''
He did not elaborate, leaving the world to know only this much: Half his life after he arrived in Miami as a 27-year-old real estate salesman, Governor Bush returns here this week without the title before his name and, he insists, without knowing what his future holds.
''We're in the preface of the new book in my life and I just don't know yet,'' he told reporters last month in Tallahassee, a day after his official portrait, with a Bible and a BlackBerry in the background, was unveiled at the Governor's Mansion. ''I'm going to take some time off, hopefully do a little fishing, golfing, resting, reading, exercising. And I've got to make a living, so I'll figure it out probably in January.''
Florida, too, has some readjusting to do. After eight years in office, Mr. Bush, 53, is leaving as one of the most popular and prominent governors in state history, not least because of his relationship to President Bush (brother) and former President George Bush (son). Succeeding him is Attorney General Charlie Crist, who is Republican like Mr. Bush but otherwise starkly different.
Despite the wishful prodding of admirers, Mr. Bush has adamantly ruled out a presidential campaign of his own next year, saying that he wants only to return to Miami with his wife, Columba, and their cat, Sugar. Yet rumors about his future have burst forth as regularly as exotic species in the Everglades -- among them that he would be the next commissioner of the National Football League, run for Senate or become Senator John McCain's running mate if Mr. McCain won the Republican nomination for president in 2008.
''The presidency is out of the question at this point because of Bush fatigue,'' said Peter Schweizer, a fellow at the Hoover Institution at Stanford who wrote ''The Bushes: A Dynasty'' with his wife, Rochelle. ''But the vice presidential slot is something that's very much in play. He's a successful governor of an important state, he helps shore up relations with the social conservatives and he has the Bush money machine.''
One of Mr. Bush's former chiefs of staff has gone to work for Mr. McCain's exploratory committee, but several other former aides have signed up with Gov. Mitt Romney of Massachusetts, another probable Republican contender.
''Jeb is a policy-driven guy,'' Mr. Schweizer said. ''If he can be a vice president that plays some kind of a policy role as Cheney has, as Gore did in the Clinton administration, then Jeb Bush will be interested.''
Many assume that for now -- at least partly at the urging of his wife, described as shy and eager to be out of the public eye -- Mr. Bush will return to the private sector. He reported a net worth of $1.4 million in 2005, down from $2.4 million in 1998.
He was a partner in a major real estate development firm here until his first, unsuccessful run for governor in 1994, but Mr. Schweizer predicted that Mr. Bush might now seek out work involving the bioscience industry or the Latin American economy, both of which ''he seems particularly animated by.''
All indications notwithstanding, ardent admirers like Grover Norquist, the president of Americans for Tax Reform, are not giving up on the prospect of Mr. Bush jumping into the presidential race next year, especially if Senator Hillary Rodham Clinton of New York becomes the Democratic candidate.
''He could step in later than anybody else,'' Mr. Norquist said. ''You can run for president with the last name of Bush, even though there is and will be Bush fatigue, in a year that you're likely to be running against someone whose last name is Clinton.''
For the time being, Mr. Bush bought a car, a Chrysler 300C, and rented a $5,500-per-month, 3,949-square-foot condominium in Segovia Tower, a luxury building overlooking a golf course in lush Coral Gables.
''I have no idea what I will be doing next,'' he wrote by e-mail from Boca Chica, Fla., where he was vacationing with his parents. ''My priorities are to hang out with my beloved wife (until she can't take it anymore! :)), work out every day and figure out what I will do next with my life.''
As for the continued speculation, he wrote: ''I am flattered that all sorts of people are interested in what I am going to do and many have offered advice as well. That will all subside soon.''
Small signs suggest, however, that he will have a hard time giving up executive powers. He told reporters that while buying furniture recently, he had to stifle the urge to tell the store owner a better way of doing business -- a trait his adversaries say they will not miss.
''Bush was the type that if you did not agree with him, he really didn't have time for you,'' said State Senator Frederica Wilson, Democrat of Miami. ''He wanted you to rubber stamp every idea he had, and he wouldn't listen to reason.''
While Mr. Bush is internationally famous, Mr. Crist, who will be sworn in as governor on Tuesday, is a stranger to all outside Florida and, but for his native Tampa Bay region, not particularly well known within the state either. While Mr. Bush was ideologically driven, often making enemies in pursuit of ''big, hairy, audacious goals'' and divisive social policies, Mr. Crist seems above all a pleaser, avoiding firm opinions and promising to be ''the people's governor.''
Yet despite Mr. Bush's abrasiveness and the plunging popularity of his brother the president, he has remained well liked -- or at least respected -- to the end, a feat in a state as ethnically and politically divided as Florida. A poll last month by Quinnipiac University found that 57 percent of Floridians feel he did a ''good'' or ''great'' job as governor, compared with only 10 percent who said he had done a ''bad'' job.
Howard Simon, executive director of the American Civil Liberties Union of Florida, said the poll results reflected approval of Mr. Bush's persona more than of his policies. Mr. Simon pointed out that two major education initiatives during the governor's tenure -- a costly effort to lower class size and another to provide universal prekindergarten classes -- were passed by public referendum, over the governor's objections.
''It needs to be said that the personal appeal and likeability of Jeb Bush has led the press and the public to overlook the extremism of many of his policies,'' he said.
Several of Mr. Bush's pet initiatives in fact failed, including a school voucher program that the Florida Supreme Court found unconstitutional.
But Mr. Bush pushed through $19.3 billion in tax cuts, put an unprecedented emphasis on standardized testing in public schools, privatized thousands of government jobs and ended affirmative action in public university admissions. He also persuaded the Scripps Research Institute and other bioscience research groups to open laboratories in Florida, which he says will make the state economy less dependent on tourism and create more high-paying jobs.
And he has appointed more than a third of the state's judges, assuring that his socially and fiscally conservative beliefs will continue to hold some sway.
While others have emoted about Mr. Bush's departure -- including his father, who wept as he described his second son's ''decency'' and ''honor'' in a speech in Tallahassee last month -- he has characteristically avoided introspection. Asked last month what he would miss most about the Governor's Mansion, he cited its beauty, its staff -- and its towels.
''Fresh towels -- all you want,'' he said. ''Here, although I've been trained to do otherwise, it's just any time I want I can have many towels.''
Loading

0 comments on commit 13f7c6d

Please sign in to comment.