Skip to content

Commit

Permalink
Switch code from Sergio's version to pmahoney's version. New code doe…
Browse files Browse the repository at this point in the history
…sn't have a two-layer design and differs significantly in that respect. The new code uses nekodtd, so nekodtd.jar has been added.
  • Loading branch information
yokolet committed Apr 1, 2010
1 parent 32f2328 commit 96df126
Show file tree
Hide file tree
Showing 68 changed files with 4,964 additions and 2,693 deletions.
4 changes: 2 additions & 2 deletions Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ java = RUBY_PLATFORM =~ /java/
GENERATED_PARSER = "lib/nokogiri/css/generated_parser.rb"
GENERATED_TOKENIZER = "lib/nokogiri/css/generated_tokenizer.rb"

EXTERNAL_JAVA_LIBRARIES = %w{isorelax jing nekohtml xercesImpl}.map{|x| "lib/#{x}.jar"}
EXTERNAL_JAVA_LIBRARIES = %w{isorelax jing nekohtml nekodtd xercesImpl}.map{|x| "lib/#{x}.jar"}
JAVA_EXT = "lib/nokogiri/nokogiri.jar"
JRUBY_HOME = Config::CONFIG['prefix']

Expand Down Expand Up @@ -138,7 +138,7 @@ namespace :java do
task :build_external do
Dir.chdir('ext/java') do
LIB_DIR = '../../lib'
CLASSPATH = "#{JRUBY_HOME}/lib/jruby.jar:#{LIB_DIR}/nekohtml.jar:#{LIB_DIR}/xercesImpl.jar:#{LIB_DIR}/isorelax.jar:#{LIB_DIR}/jing.jar"
CLASSPATH = "#{JRUBY_HOME}/lib/jruby.jar:#{LIB_DIR}/nekohtml.jar:#{LIB_DIR}/nekodtd.jar:#{LIB_DIR}/xercesImpl.jar:#{LIB_DIR}/isorelax.jar:#{LIB_DIR}/jing.jar"
sh "javac -g -cp #{CLASSPATH} nokogiri/*.java nokogiri/internals/*.java"
sh "jar cf ../../#{JAVA_EXT} nokogiri/*.class nokogiri/internals/*.class"
end
Expand Down
91 changes: 91 additions & 0 deletions ext/java/nokogiri/EncodingHandler.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
package nokogiri;

import java.util.HashMap;

import org.jruby.Ruby;
import org.jruby.RubyArray;
import org.jruby.RubyClass;
import org.jruby.RubyObject;
import org.jruby.anno.JRubyMethod;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.w3c.dom.Element;
import org.w3c.dom.Node;

/**
 * Small registry of encoding aliases, looked up from Ruby under the class
 * path <code>Nokogiri::EncodingHandler</code>.
 *
 * The aliases are kept in one static table that always starts out (and is
 * reset to) a single "UTF-8" entry. Per the original author, this is a stub
 * that exists to satisfy the unit tests: it does not interact with other
 * classes and has no effect on the character encodings reported by parsed
 * documents.
 *
 * @author Patrick Mahoney <[email protected]>
 */
public class EncodingHandler extends RubyObject {
    /** Alias table: lookup key mapped to the encoding name stored for it. */
    protected static HashMap<String,String> map = new HashMap<String,String>();
    static {
        addInitial();
    }

    /** Encoding name this handler instance reports through {@link #name}. */
    protected String name;

    /** Seeds the alias table with its built-in "UTF-8" entry. */
    protected static void addInitial() {
        map.put("UTF-8", "UTF-8");
    }

    /**
     * @param ruby  the owning runtime
     * @param klass the Ruby class of the new object
     * @param value the encoding name this handler stands for
     */
    public EncodingHandler(Ruby ruby, RubyClass klass, String value) {
        super(ruby, klass);
        this.name = value;
    }

    /**
     * Ruby <code>EncodingHandler[key]</code>: looks the key up in the alias
     * table.
     *
     * @return a new handler wrapping the stored encoding name, or nil when
     *         the key is not registered
     */
    @JRubyMethod(name="[]", meta=true)
    public static IRubyObject get(ThreadContext context,
                                  IRubyObject _klass,
                                  IRubyObject keyObj) {
        Ruby runtime = context.getRuntime();
        String encoding = map.get(keyObj.toString());

        if (encoding != null) {
            // The handler class is resolved by path; _klass is not consulted.
            RubyClass handlerClass =
                (RubyClass) runtime.getClassFromPath("Nokogiri::EncodingHandler");
            return new EncodingHandler(runtime, handlerClass, encoding);
        }

        return runtime.getNil();
    }

    /**
     * Ruby <code>EncodingHandler.delete(key)</code>: removes the key from
     * the alias table.
     *
     * @return the encoding name that was stored under the key as a Ruby
     *         string, or nil when nothing was stored
     */
    @JRubyMethod(meta=true)
    public static IRubyObject delete(ThreadContext context,
                                     IRubyObject _klass,
                                     IRubyObject keyObj) {
        String removed = map.remove(keyObj.toString());
        Ruby runtime = context.getRuntime();
        return removed == null ? runtime.getNil() : runtime.newString(removed);
    }

    /**
     * Ruby <code>EncodingHandler.clear_aliases!</code>: empties the alias
     * table and then restores the built-in entry.
     *
     * @return nil
     */
    @JRubyMethod(name="clear_aliases!", meta=true)
    public static IRubyObject clear_aliases(ThreadContext context,
                                            IRubyObject _klass) {
        map.clear();
        addInitial();
        return context.getRuntime().getNil();
    }

    /**
     * Ruby <code>EncodingHandler.alias(orig, alias)</code>: registers
     * <code>alias</code> for whatever encoding name is stored under
     * <code>orig</code>. Nothing is registered when <code>orig</code> is
     * unknown.
     *
     * @return nil in either case
     */
    @JRubyMethod(meta=true)
    public static IRubyObject alias(ThreadContext context,
                                    IRubyObject _klass,
                                    IRubyObject orig,
                                    IRubyObject alias) {
        String target = map.get(orig.toString());
        if (target != null) {
            map.put(alias.toString(), target);
        }

        return context.getRuntime().getNil();
    }

    /**
     * Ruby <code>#name</code>.
     *
     * @return this handler's encoding name as a new Ruby string
     */
    @JRubyMethod
    public IRubyObject name(ThreadContext context) {
        Ruby runtime = context.getRuntime();
        return runtime.newString(name);
    }
}
102 changes: 56 additions & 46 deletions ext/java/nokogiri/HtmlDocument.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,8 @@

import java.io.IOException;
import javax.xml.parsers.ParserConfigurationException;
import nokogiri.internals.HtmlDocumentImpl;
import nokogiri.internals.HtmlEmptyDocumentImpl;
import nokogiri.internals.HtmlParseOptions;
import nokogiri.internals.ParseOptions;
import nokogiri.internals.HtmlDomParserContext;
import nokogiri.internals.SaveContext;
import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.anno.JRubyMethod;
Expand All @@ -14,76 +12,88 @@
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.w3c.dom.Document;
import org.w3c.dom.DocumentType;
import org.xml.sax.SAXException;

public class HtmlDocument extends XmlDocument {

public HtmlDocument(Ruby ruby, RubyClass klazz, Document doc) {
super(ruby, klazz, doc);
this.document = doc;
this.internalNode = new HtmlDocumentImpl(ruby, doc);
}

@JRubyMethod(name="new", meta = true, rest = true, required=0)
public static IRubyObject rbNew(ThreadContext context, IRubyObject cls, IRubyObject[] args) {
public static IRubyObject rbNew(ThreadContext context, IRubyObject cls,
IRubyObject[] args) {
HtmlDocument doc = null;
try {

/*
* A little explanation:
* I'm using an XmlDocument instead of a HTMLDocumentImpl in order
* not to have capitalized node names.
*/

Document docNode = (new ParseOptions(0)).getDocumentBuilder().newDocument();

Document docNode = createNewDocument();
doc = new HtmlDocument(context.getRuntime(), (RubyClass) cls,
docNode);
doc.internalNode = new HtmlEmptyDocumentImpl(context.getRuntime(),
docNode);
docNode);
} catch (Exception ex) {
throw context.getRuntime().newRuntimeError("couldn't create document: "+ex.toString());
throw context.getRuntime()
.newRuntimeError("couldn't create document: "+ex.toString());
}

RuntimeHelpers.invoke(context, doc, "initialize", args);

return doc;
}

@JRubyMethod(meta = true, rest = true)
public static IRubyObject read_io(ThreadContext context, IRubyObject cls, IRubyObject[] args) {
public static IRubyObject do_parse(ThreadContext context,
                                   IRubyObject klass,
                                   IRubyObject[] args) {
    // Common parse path: read_io and read_memory both forward here.
    Ruby runtime = context.getRuntime();

    // Exactly four arguments are accepted; anything else is rejected
    // before any of them is read.
    Arity.checkArgumentCount(runtime, args, 4, 4);

    // args[3] configures the parser context, args[0] is the input to
    // parse and args[1] is passed through to parse(); args[2] is unused.
    HtmlDomParserContext parserContext =
        new HtmlDomParserContext(runtime, args[3]);
    parserContext.setInputSource(context, args[0]);

    IRubyObject parsed = parserContext.parse(context, klass, args[1]);
    return parsed;
}

IRubyObject content = RuntimeHelpers.invoke(context, args[0], "read");
args[0] = content;

return read_memory(context, cls, args);
/**
 * Class-level Ruby method <code>read_io</code>. Forwards the class and
 * the argument array to {@link #do_parse} without modification and returns
 * whatever it returns.
 */
@JRubyMethod(meta = true, rest = true)
public static IRubyObject read_io(ThreadContext context,
IRubyObject cls,
IRubyObject[] args) {
return do_parse(context, cls, args);
}

@JRubyMethod(meta = true, rest = true)
public static IRubyObject read_memory(ThreadContext context, IRubyObject cls, IRubyObject[] args) {

Ruby ruby = context.getRuntime();
Arity.checkArgumentCount(ruby, args, 4, 4);
ParseOptions options = new HtmlParseOptions(args[3]);
try {
Document document;
document = options.parse(args[0].convertToString().asJavaString());
HtmlDocument doc = new HtmlDocument(ruby, (RubyClass)cls, document);
doc.setUrl(args[1]);
options.addErrorsIfNecessary(context, doc);
return doc;
} catch (ParserConfigurationException pce) {
return options.getDocumentWithErrorsOrRaiseException(context, pce);
} catch (SAXException saxe) {
return options.getDocumentWithErrorsOrRaiseException(context, saxe);
} catch (IOException ioe) {
return options.getDocumentWithErrorsOrRaiseException(context, ioe);
}
public static IRubyObject read_memory(ThreadContext context,
IRubyObject cls,
IRubyObject[] args) {
// Thin wrapper: the class and argument array go to do_parse unchanged,
// and its result is returned as-is.
return do_parse(context, cls, args);
}


/**
 * Placeholder for serialization. Always raises a not-implemented error
 * with the message "not implemented"; neither argument is inspected
 * beyond obtaining the runtime from the context.
 */
@JRubyMethod
public static IRubyObject serialize(ThreadContext context, IRubyObject htmlDoc) {
    Ruby runtime = context.getRuntime();
    throw runtime.newNotImplementedError("not implemented");
}
}

/**
 * Appends this document's serialized form to <code>ctx</code>: a DOCTYPE
 * declaration when the document has a doctype, then the content of the
 * document's children, then a trailing newline.
 */
@Override
public void saveContent(ThreadContext context, SaveContext ctx) {
    DocumentType doctype = getDocument().getDoctype();

    if (doctype != null) {
        ctx.append("<!DOCTYPE ");
        ctx.append(doctype.getName());

        String publicId = doctype.getPublicId();
        String systemId = doctype.getSystemId();

        if (publicId == null) {
            // No public id: only a system id, if present, is written.
            if (systemId != null) {
                ctx.append(" SYSTEM ");
                ctx.appendQuoted(systemId);
            }
        } else {
            // Public id first, optionally followed by the system id.
            ctx.append(" PUBLIC ");
            ctx.appendQuoted(publicId);
            if (systemId != null) {
                ctx.append(" ");
                ctx.appendQuoted(systemId);
            }
        }

        ctx.append(">\n");
    }

    XmlNodeSet childNodes = (XmlNodeSet) this.children(context);
    this.saveNodeListContent(context, childNodes, ctx);
    ctx.append("\n");
}
}
113 changes: 113 additions & 0 deletions ext/java/nokogiri/HtmlElementDescription.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
package nokogiri;

import org.jruby.Ruby;
import org.jruby.RubyClass;
import org.jruby.RubyObject;
import org.jruby.anno.JRubyMethod;
import org.jruby.exceptions.RaiseException;
import org.jruby.runtime.ThreadContext;
import org.jruby.runtime.builtin.IRubyObject;
import org.cyberneko.html.HTMLElements;
import org.cyberneko.html.HTMLElements.Element;

import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;

import static org.jruby.javasupport.util.RuntimeHelpers.invoke;

/**
 * Ruby-visible description of one HTML element, backed by an entry of
 * NekoHTML's {@link HTMLElements} table. Instances are obtained from Ruby
 * through the class-level <code>[]</code> method and expose the element's
 * name, whether it is inline or empty, and the names of the elements that
 * may appear inside it.
 *
 * @author Patrick Mahoney <[email protected]>
 */
public class HtmlElementDescription extends RubyObject {

    /**
     * Stores memoized hash of element code -> list of valid subelement
     * names. Wrapped in a synchronized map because it is shared by every
     * instance.
     */
    static protected Map<Short, List<String>> subElements;
    static {
        Map<Short, List<String>> _subElements =
            new HashMap<Short, List<String>>();
        subElements = Collections.synchronizedMap(_subElements);
    }

    /** The NekoHTML element this object describes; assigned by {@link #get}. */
    protected HTMLElements.Element element;

    public HtmlElementDescription(Ruby runtime, RubyClass rubyClass) {
        super(runtime, rubyClass);
    }

    /**
     * Lookup the list of sub elements of <code>elem</code>. If not
     * already stored, iterate through all elements to find valid
     * subelements; save this list and return it.
     *
     * @param elem the element whose permitted children are wanted
     * @return names of every element for which <code>isParent(elem)</code>
     *         holds, exactly as NekoHTML spells them
     */
    protected static List<String> findSubElements(HTMLElements.Element elem) {
        List<String> subs = subElements.get(elem.code);

        if (subs == null) {
            subs = new ArrayList<String>();

            /*
             * A bit of a hack. NekoHtml source code shows that
             * UNKNOWN is the highest value element. We cannot access
             * the list of elements directly because it's protected.
             */
            for (short c = 0; c < HTMLElements.UNKNOWN; c++) {
                HTMLElements.Element maybe_sub =
                    HTMLElements.getElement(c);
                // Every code below UNKNOWN is probed, so guard against a
                // code that has no element registered (presumably
                // getElement then yields null -- confirm against the
                // NekoHTML version in use) instead of failing with a
                // NullPointerException.
                if (maybe_sub != null && maybe_sub.isParent(elem)) {
                    subs.add(maybe_sub.name);
                }
            }

            subElements.put(elem.code, subs);
        }

        return subs;
    }

    /**
     * Ruby <code>ElementDescription[name]</code>.
     *
     * @return a description wrapping the element NekoHTML reports for
     *         <code>name</code>, or nil when NekoHTML reports
     *         <code>NO_SUCH_ELEMENT</code>
     */
    @JRubyMethod(name="[]", meta=true)
    public static IRubyObject get(ThreadContext context,
                                  IRubyObject klazz, IRubyObject name) {

        HTMLElements.Element elem = HTMLElements.getElement(name.toString());
        if (elem == HTMLElements.NO_SUCH_ELEMENT) {
            return context.getRuntime().getNil();
        }

        HtmlElementDescription desc =
            new HtmlElementDescription(context.getRuntime(), (RubyClass)klazz);
        desc.element = elem;
        return desc;
    }

    /**
     * Ruby <code>#name</code>.
     *
     * @return the element name in lower case
     */
    @JRubyMethod()
    public IRubyObject name(ThreadContext context) {
        // Lower-case with a fixed locale: the default-locale overload
        // would map 'I' to a dotless i under e.g. a Turkish locale and
        // corrupt names such as "DIV" or "IMG".
        return context.getRuntime().newString(
            element.name.toLowerCase(Locale.ENGLISH));
    }

    /** Ruby <code>#inline?</code>: whether NekoHTML treats the element as inline. */
    @JRubyMethod(name="inline?")
    public IRubyObject inline_eh(ThreadContext context) {
        return context.getRuntime().newBoolean(element.isInline());
    }

    /** Ruby <code>#empty?</code>: whether NekoHTML treats the element as empty. */
    @JRubyMethod(name="empty?")
    public IRubyObject empty_eh(ThreadContext context) {
        return context.getRuntime().newBoolean(element.isEmpty());
    }

    /**
     * Ruby <code>#sub_elements</code>.
     *
     * @return a new Ruby array of strings, one per name returned by
     *         {@link #findSubElements}
     */
    @JRubyMethod()
    public IRubyObject sub_elements(ThreadContext context) {
        Ruby ruby = context.getRuntime();
        List<String> subs = findSubElements(element);
        IRubyObject[] ary = new IRubyObject[subs.size()];
        for (int i = 0; i < subs.size(); ++i) {
            ary[i] = ruby.newString(subs.get(i));
        }

        return ruby.newArray(ary);
    }

}
Loading

0 comments on commit 96df126

Please sign in to comment.