From 9c1c8cb20a67692ed8460c4b4f8b458d3cbde23f Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 31 Oct 2023 13:54:56 +0100 Subject: [PATCH 01/14] #164 - Introduce checkstyle - Added checkstyle profile using the rules from DKPro Core --- dkpro-jwpl-build/LICENSE.txt | 268 ++++++++++++++++++ dkpro-jwpl-build/pom.xml | 33 +++ .../main/resources/dkpro-jwpl/checkstyle.xml | 116 ++++++++ .../resources/dkpro-jwpl/version-rules.xml | 74 +++++ pom.xml | 86 ++++++ 5 files changed, 577 insertions(+) create mode 100644 dkpro-jwpl-build/LICENSE.txt create mode 100644 dkpro-jwpl-build/pom.xml create mode 100644 dkpro-jwpl-build/src/main/resources/dkpro-jwpl/checkstyle.xml create mode 100644 dkpro-jwpl-build/src/main/resources/dkpro-jwpl/version-rules.xml diff --git a/dkpro-jwpl-build/LICENSE.txt b/dkpro-jwpl-build/LICENSE.txt new file mode 100644 index 00000000..9ea00377 --- /dev/null +++ b/dkpro-jwpl-build/LICENSE.txt @@ -0,0 +1,268 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +=== brat === + +Copyright (C) 2010-2012 The brat contributors, all rights reserved. + +Permission is hereby granted, free of charge, to any person obtaining a copy of +this software and associated documentation files (the "Software"), to deal in +the Software without restriction, including without limitation the rights to +use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies +of the Software, and to permit persons to whom the Software is furnished to do +so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. + +=== JQuery SVG === + +Copyright 2007 - 2014 Keith Wood + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software +and associated documentation files (the "Software"), to deal in the Software without restriction, +including without limitation the rights to use, copy, modify, merge, publish, distribute, +sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all copies or +substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT +NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, +DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +=== JQuery JSON === + +Copyright 2009-2011 Brantley Harris +Copyright 2010–2014 Timo Tijhof + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + \ No newline at end of file diff --git a/dkpro-jwpl-build/pom.xml b/dkpro-jwpl-build/pom.xml new file mode 100644 index 00000000..2537c9ae --- /dev/null +++ b/dkpro-jwpl-build/pom.xml @@ -0,0 +1,33 @@ + + + 4.0.0 + + + org.dkpro.jwpl + dkpro-jwpl + 2.0.0-SNAPSHOT + + + dkpro-jwpl-build + DKPro JWPL - Build resources + + + true + + \ No newline at end of file diff --git a/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/checkstyle.xml b/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/checkstyle.xml new file mode 100644 index 00000000..64e0fcbc --- /dev/null +++ b/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/checkstyle.xml @@ -0,0 +1,116 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/version-rules.xml b/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/version-rules.xml new file mode 100644 index 00000000..4ea251f7 --- /dev/null +++ b/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/version-rules.xml @@ -0,0 +1,74 @@ + + + .*-RC[0-9]* + .*-alpha[0-9]* + .*-Alpha[0-9]* + .*-ALPHA[0-9]* + .*-beta[0-9]* + .*\.rc[0-9]* + .*-M[0-9]* + .*-b[0-9]+ + .*-b[0-9]+\.[0-9]+ + .*-atlassian.* + + + + + [0-9]{8}(\.[0-9]{6})? + + + + + + [0-9]{8}(\.[0-9]{6})? + + + + + + [0-9]{8}(\.[0-9]{6})? + + + + + + [0-9]{8}(\.[0-9]{6})? + + + + + + [0-9]{8}(\.[0-9]{6})? 
+ + + + + + 20040902.021138 + + + + + + + ^5.* + + + + + + + .* + + + + + + + ^4.* + + + + \ No newline at end of file diff --git a/pom.xml b/pom.xml index f213f2dd..8c7be9fc 100644 --- a/pom.xml +++ b/pom.xml @@ -246,6 +246,7 @@ + dkpro-jwpl-build dkpro-jwpl-api dkpro-jwpl-datamachine dkpro-jwpl-timemachine @@ -263,7 +264,92 @@ https://github.com/dkpro/dkpro-jwpl/issues + + + + + org.codehaus.mojo + versions-maven-plugin + 2.14.2 + + file:${session.executionRootDirectory}/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/version-rules.xml + + + + org.dkpro.jwpl + dkpro-jwpl-build + ${project.version} + + + + + + + + + checkstyle + + + src + + + + + + org.apache.maven.plugins + maven-checkstyle-plugin + + + + + + org.apache.maven.plugins + maven-checkstyle-plugin + 3.3.0 + true + + + org.dkpro.jwpl + dkpro-jwpl-build + ${project.version} + + + com.puppycrawl.tools + checkstyle + 10.12.3 + + + + + ${project.compileSourceRoots} + ${project.testCompileSourceRoots} + dkpro-jwpl/checkstyle.xml + basedir=${project.basedir} + true + true + true + false + false + true + 0 + error + + + + checkstyle-check + verify + + check + + + + + + + + + rat-check From ebb2d7334364caac69e88bf38fbb0f53cb0f5dec Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 31 Oct 2023 14:01:53 +0100 Subject: [PATCH 02/14] #164 - Introduce checkstyle - Add style config template and script to install it --- .../dkpro-jwpl/DKProStyle_latest.xml | 382 +++++++++++++++++ .../eclipse/org.eclipse.jdt.core.prefs | 390 ++++++++++++++++++ .../eclipse/org.eclipse.jdt.ui.prefs | 76 ++++ installEclipseSettings.sh | 24 ++ 4 files changed, 872 insertions(+) create mode 100644 dkpro-jwpl-build/src/main/resources/dkpro-jwpl/DKProStyle_latest.xml create mode 100644 dkpro-jwpl-build/src/main/resources/dkpro-jwpl/eclipse/org.eclipse.jdt.core.prefs create mode 100644 dkpro-jwpl-build/src/main/resources/dkpro-jwpl/eclipse/org.eclipse.jdt.ui.prefs create mode 100755 installEclipseSettings.sh diff --git a/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/DKProStyle_latest.xml b/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/DKProStyle_latest.xml new file mode 100644 index 00000000..b73afde4 --- /dev/null +++ b/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/DKProStyle_latest.xml @@ -0,0 +1,382 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/eclipse/org.eclipse.jdt.core.prefs b/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/eclipse/org.eclipse.jdt.core.prefs new file mode 100644 index 00000000..9b729532 --- /dev/null +++ b/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/eclipse/org.eclipse.jdt.core.prefs @@ -0,0 +1,390 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=11 +org.eclipse.jdt.core.compiler.compliance=11 +org.eclipse.jdt.core.compiler.problem.assertIdentifier=error +org.eclipse.jdt.core.compiler.problem.enablePreviewFeatures=disabled +org.eclipse.jdt.core.compiler.problem.enumIdentifier=error +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning +org.eclipse.jdt.core.compiler.problem.reportPreviewFeatures=warning +org.eclipse.jdt.core.compiler.processAnnotations=enabled +org.eclipse.jdt.core.compiler.release=disabled +org.eclipse.jdt.core.compiler.source=11 +org.eclipse.jdt.core.formatter.align_assignment_statements_on_columns=false +org.eclipse.jdt.core.formatter.align_fields_grouping_blank_lines=2147483647 +org.eclipse.jdt.core.formatter.align_type_members_on_columns=false +org.eclipse.jdt.core.formatter.align_variable_declarations_on_columns=false +org.eclipse.jdt.core.formatter.align_with_spaces=false +org.eclipse.jdt.core.formatter.alignment_for_additive_operator=16 +org.eclipse.jdt.core.formatter.alignment_for_arguments_in_allocation_expression=16 +org.eclipse.jdt.core.formatter.alignment_for_arguments_in_annotation=0 +org.eclipse.jdt.core.formatter.alignment_for_arguments_in_enum_constant=16 +org.eclipse.jdt.core.formatter.alignment_for_arguments_in_explicit_constructor_call=16 +org.eclipse.jdt.core.formatter.alignment_for_arguments_in_method_invocation=16 +org.eclipse.jdt.core.formatter.alignment_for_arguments_in_qualified_allocation_expression=16 +org.eclipse.jdt.core.formatter.alignment_for_assertion_message=0 +org.eclipse.jdt.core.formatter.alignment_for_assignment=0 +org.eclipse.jdt.core.formatter.alignment_for_bitwise_operator=16 +org.eclipse.jdt.core.formatter.alignment_for_compact_if=16 +org.eclipse.jdt.core.formatter.alignment_for_compact_loops=16 +org.eclipse.jdt.core.formatter.alignment_for_conditional_expression=80 +org.eclipse.jdt.core.formatter.alignment_for_conditional_expression_chain=0 +org.eclipse.jdt.core.formatter.alignment_for_enum_constants=16 +org.eclipse.jdt.core.formatter.alignment_for_expressions_in_array_initializer=16 +org.eclipse.jdt.core.formatter.alignment_for_expressions_in_for_loop_header=0 +org.eclipse.jdt.core.formatter.alignment_for_logical_operator=16 +org.eclipse.jdt.core.formatter.alignment_for_method_declaration=0 +org.eclipse.jdt.core.formatter.alignment_for_module_statements=16 +org.eclipse.jdt.core.formatter.alignment_for_multiple_fields=16 +org.eclipse.jdt.core.formatter.alignment_for_multiplicative_operator=16 +org.eclipse.jdt.core.formatter.alignment_for_parameterized_type_references=0 +org.eclipse.jdt.core.formatter.alignment_for_parameters_in_constructor_declaration=16 +org.eclipse.jdt.core.formatter.alignment_for_parameters_in_method_declaration=16 +org.eclipse.jdt.core.formatter.alignment_for_record_components=16 +org.eclipse.jdt.core.formatter.alignment_for_relational_operator=0 +org.eclipse.jdt.core.formatter.alignment_for_resources_in_try=80 +org.eclipse.jdt.core.formatter.alignment_for_selector_in_method_invocation=16 +org.eclipse.jdt.core.formatter.alignment_for_shift_operator=0 +org.eclipse.jdt.core.formatter.alignment_for_string_concatenation=16 +org.eclipse.jdt.core.formatter.alignment_for_superclass_in_type_declaration=37 +org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_enum_declaration=37 +org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_record_declaration=37 +org.eclipse.jdt.core.formatter.alignment_for_superinterfaces_in_type_declaration=37 
+org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_constructor_declaration=36 +org.eclipse.jdt.core.formatter.alignment_for_throws_clause_in_method_declaration=36 +org.eclipse.jdt.core.formatter.alignment_for_type_arguments=0 +org.eclipse.jdt.core.formatter.alignment_for_type_parameters=0 +org.eclipse.jdt.core.formatter.alignment_for_union_type_in_multicatch=16 +org.eclipse.jdt.core.formatter.blank_lines_after_imports=1 +org.eclipse.jdt.core.formatter.blank_lines_after_last_class_body_declaration=0 +org.eclipse.jdt.core.formatter.blank_lines_after_package=1 +org.eclipse.jdt.core.formatter.blank_lines_before_abstract_method=1 +org.eclipse.jdt.core.formatter.blank_lines_before_field=0 +org.eclipse.jdt.core.formatter.blank_lines_before_first_class_body_declaration=0 +org.eclipse.jdt.core.formatter.blank_lines_before_imports=1 +org.eclipse.jdt.core.formatter.blank_lines_before_member_type=1 +org.eclipse.jdt.core.formatter.blank_lines_before_method=1 +org.eclipse.jdt.core.formatter.blank_lines_before_new_chunk=1 +org.eclipse.jdt.core.formatter.blank_lines_before_package=0 +org.eclipse.jdt.core.formatter.blank_lines_between_import_groups=1 +org.eclipse.jdt.core.formatter.blank_lines_between_statement_group_in_switch=0 +org.eclipse.jdt.core.formatter.blank_lines_between_type_declarations=1 +org.eclipse.jdt.core.formatter.brace_position_for_annotation_type_declaration=next_line +org.eclipse.jdt.core.formatter.brace_position_for_anonymous_type_declaration=next_line +org.eclipse.jdt.core.formatter.brace_position_for_array_initializer=end_of_line +org.eclipse.jdt.core.formatter.brace_position_for_block=end_of_line +org.eclipse.jdt.core.formatter.brace_position_for_block_in_case=end_of_line +org.eclipse.jdt.core.formatter.brace_position_for_constructor_declaration=next_line +org.eclipse.jdt.core.formatter.brace_position_for_enum_constant=next_line +org.eclipse.jdt.core.formatter.brace_position_for_enum_declaration=next_line +org.eclipse.jdt.core.formatter.brace_position_for_lambda_body=end_of_line +org.eclipse.jdt.core.formatter.brace_position_for_method_declaration=next_line +org.eclipse.jdt.core.formatter.brace_position_for_record_constructor=next_line +org.eclipse.jdt.core.formatter.brace_position_for_record_declaration=next_line +org.eclipse.jdt.core.formatter.brace_position_for_switch=end_of_line +org.eclipse.jdt.core.formatter.brace_position_for_type_declaration=next_line +org.eclipse.jdt.core.formatter.comment.align_tags_descriptions_grouped=false +org.eclipse.jdt.core.formatter.comment.align_tags_names_descriptions=false +org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_block_comment=false +org.eclipse.jdt.core.formatter.comment.clear_blank_lines_in_javadoc_comment=false +org.eclipse.jdt.core.formatter.comment.count_line_length_from_starting_position=false +org.eclipse.jdt.core.formatter.comment.format_block_comments=true +org.eclipse.jdt.core.formatter.comment.format_header=false +org.eclipse.jdt.core.formatter.comment.format_html=true +org.eclipse.jdt.core.formatter.comment.format_javadoc_comments=true +org.eclipse.jdt.core.formatter.comment.format_line_comments=true +org.eclipse.jdt.core.formatter.comment.format_source_code=true +org.eclipse.jdt.core.formatter.comment.indent_parameter_description=true +org.eclipse.jdt.core.formatter.comment.indent_root_tags=true +org.eclipse.jdt.core.formatter.comment.indent_tag_description=false +org.eclipse.jdt.core.formatter.comment.insert_new_line_before_root_tags=insert 
+org.eclipse.jdt.core.formatter.comment.insert_new_line_between_different_tags=do not insert +org.eclipse.jdt.core.formatter.comment.insert_new_line_for_parameter=insert +org.eclipse.jdt.core.formatter.comment.line_length=100 +org.eclipse.jdt.core.formatter.comment.new_lines_at_block_boundaries=true +org.eclipse.jdt.core.formatter.comment.new_lines_at_javadoc_boundaries=true +org.eclipse.jdt.core.formatter.comment.preserve_white_space_between_code_and_line_comments=false +org.eclipse.jdt.core.formatter.compact_else_if=true +org.eclipse.jdt.core.formatter.continuation_indentation=2 +org.eclipse.jdt.core.formatter.continuation_indentation_for_array_initializer=2 +org.eclipse.jdt.core.formatter.disabling_tag=@formatter\:off +org.eclipse.jdt.core.formatter.enabling_tag=@formatter\:on +org.eclipse.jdt.core.formatter.format_guardian_clause_on_one_line=false +org.eclipse.jdt.core.formatter.format_line_comment_starting_on_first_column=true +org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_annotation_declaration_header=true +org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_constant_header=true +org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_enum_declaration_header=true +org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_record_header=true +org.eclipse.jdt.core.formatter.indent_body_declarations_compare_to_type_header=true +org.eclipse.jdt.core.formatter.indent_breaks_compare_to_cases=true +org.eclipse.jdt.core.formatter.indent_empty_lines=false +org.eclipse.jdt.core.formatter.indent_statements_compare_to_block=true +org.eclipse.jdt.core.formatter.indent_statements_compare_to_body=true +org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_cases=true +org.eclipse.jdt.core.formatter.indent_switchstatements_compare_to_switch=false +org.eclipse.jdt.core.formatter.indentation.size=4 +org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_enum_constant=insert +org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_field=insert +org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_local_variable=insert +org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_method=insert +org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_package=insert +org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_parameter=do not insert +org.eclipse.jdt.core.formatter.insert_new_line_after_annotation_on_type=insert +org.eclipse.jdt.core.formatter.insert_new_line_after_label=do not insert +org.eclipse.jdt.core.formatter.insert_new_line_after_opening_brace_in_array_initializer=do not insert +org.eclipse.jdt.core.formatter.insert_new_line_after_type_annotation=do not insert +org.eclipse.jdt.core.formatter.insert_new_line_at_end_of_file_if_missing=do not insert +org.eclipse.jdt.core.formatter.insert_new_line_before_catch_in_try_statement=insert +org.eclipse.jdt.core.formatter.insert_new_line_before_closing_brace_in_array_initializer=do not insert +org.eclipse.jdt.core.formatter.insert_new_line_before_else_in_if_statement=insert +org.eclipse.jdt.core.formatter.insert_new_line_before_finally_in_try_statement=insert +org.eclipse.jdt.core.formatter.insert_new_line_before_while_in_do_statement=insert +org.eclipse.jdt.core.formatter.insert_space_after_additive_operator=insert +org.eclipse.jdt.core.formatter.insert_space_after_and_in_type_parameter=insert +org.eclipse.jdt.core.formatter.insert_space_after_arrow_in_switch_case=insert 
+org.eclipse.jdt.core.formatter.insert_space_after_arrow_in_switch_default=insert +org.eclipse.jdt.core.formatter.insert_space_after_assignment_operator=insert +org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_at_in_annotation_type_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_bitwise_operator=insert +org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_arguments=insert +org.eclipse.jdt.core.formatter.insert_space_after_closing_angle_bracket_in_type_parameters=insert +org.eclipse.jdt.core.formatter.insert_space_after_closing_brace_in_block=insert +org.eclipse.jdt.core.formatter.insert_space_after_closing_paren_in_cast=insert +org.eclipse.jdt.core.formatter.insert_space_after_colon_in_assert=insert +org.eclipse.jdt.core.formatter.insert_space_after_colon_in_case=insert +org.eclipse.jdt.core.formatter.insert_space_after_colon_in_conditional=insert +org.eclipse.jdt.core.formatter.insert_space_after_colon_in_for=insert +org.eclipse.jdt.core.formatter.insert_space_after_colon_in_labeled_statement=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_allocation_expression=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_annotation=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_array_initializer=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_parameters=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_constructor_declaration_throws=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_constant_arguments=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_enum_declarations=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_explicitconstructorcall_arguments=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_increments=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_for_inits=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_parameters=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_declaration_throws=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_method_invocation_arguments=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_field_declarations=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_multiple_local_declarations=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_parameterized_type_reference=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_record_components=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_superinterfaces=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_switch_case_expressions=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_arguments=insert +org.eclipse.jdt.core.formatter.insert_space_after_comma_in_type_parameters=insert +org.eclipse.jdt.core.formatter.insert_space_after_ellipsis=insert +org.eclipse.jdt.core.formatter.insert_space_after_lambda_arrow=insert +org.eclipse.jdt.core.formatter.insert_space_after_logical_operator=insert +org.eclipse.jdt.core.formatter.insert_space_after_multiplicative_operator=insert +org.eclipse.jdt.core.formatter.insert_space_after_not_operator=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_parameterized_type_reference=do not insert 
+org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_arguments=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_angle_bracket_in_type_parameters=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_brace_in_array_initializer=insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_allocation_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_bracket_in_array_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_annotation=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_cast=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_catch=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_constructor_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_enum_constant=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_for=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_if=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_method_invocation=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_parenthesized_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_record_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_switch=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_synchronized=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_try=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_opening_paren_in_while=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_postfix_operator=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_prefix_operator=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_question_in_conditional=insert +org.eclipse.jdt.core.formatter.insert_space_after_question_in_wildcard=do not insert +org.eclipse.jdt.core.formatter.insert_space_after_relational_operator=insert +org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_for=insert +org.eclipse.jdt.core.formatter.insert_space_after_semicolon_in_try_resources=insert +org.eclipse.jdt.core.formatter.insert_space_after_shift_operator=insert +org.eclipse.jdt.core.formatter.insert_space_after_string_concatenation=insert +org.eclipse.jdt.core.formatter.insert_space_after_unary_operator=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_additive_operator=insert +org.eclipse.jdt.core.formatter.insert_space_before_and_in_type_parameter=insert +org.eclipse.jdt.core.formatter.insert_space_before_arrow_in_switch_case=insert +org.eclipse.jdt.core.formatter.insert_space_before_arrow_in_switch_default=insert +org.eclipse.jdt.core.formatter.insert_space_before_assignment_operator=insert +org.eclipse.jdt.core.formatter.insert_space_before_at_in_annotation_type_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_before_bitwise_operator=insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_parameterized_type_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_arguments=do not insert 
+org.eclipse.jdt.core.formatter.insert_space_before_closing_angle_bracket_in_type_parameters=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_brace_in_array_initializer=insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_allocation_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_bracket_in_array_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_annotation=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_cast=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_catch=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_constructor_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_enum_constant=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_for=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_if=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_method_invocation=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_parenthesized_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_record_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_switch=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_synchronized=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_try=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_closing_paren_in_while=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_colon_in_assert=insert +org.eclipse.jdt.core.formatter.insert_space_before_colon_in_case=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_colon_in_conditional=insert +org.eclipse.jdt.core.formatter.insert_space_before_colon_in_default=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_colon_in_for=insert +org.eclipse.jdt.core.formatter.insert_space_before_colon_in_labeled_statement=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_allocation_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_annotation=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_array_initializer=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_parameters=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_constructor_declaration_throws=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_constant_arguments=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_enum_declarations=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_explicitconstructorcall_arguments=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_increments=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_for_inits=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_parameters=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_declaration_throws=do not insert 
+org.eclipse.jdt.core.formatter.insert_space_before_comma_in_method_invocation_arguments=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_field_declarations=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_multiple_local_declarations=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_parameterized_type_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_record_components=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_superinterfaces=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_switch_case_expressions=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_arguments=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_comma_in_type_parameters=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_ellipsis=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_lambda_arrow=insert +org.eclipse.jdt.core.formatter.insert_space_before_logical_operator=insert +org.eclipse.jdt.core.formatter.insert_space_before_multiplicative_operator=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_parameterized_type_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_arguments=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_angle_bracket_in_type_parameters=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_annotation_type_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_anonymous_type_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_array_initializer=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_block=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_constructor_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_constant=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_enum_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_method_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_record_constructor=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_record_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_switch=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_brace_in_type_declaration=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_allocation_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_bracket_in_array_type_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_annotation_type_member_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_catch=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_constructor_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_enum_constant=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_for=insert 
+org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_if=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_method_invocation=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_parenthesized_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_record_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_switch=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_synchronized=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_try=insert +org.eclipse.jdt.core.formatter.insert_space_before_opening_paren_in_while=insert +org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_return=insert +org.eclipse.jdt.core.formatter.insert_space_before_parenthesized_expression_in_throw=insert +org.eclipse.jdt.core.formatter.insert_space_before_postfix_operator=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_prefix_operator=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_question_in_conditional=insert +org.eclipse.jdt.core.formatter.insert_space_before_question_in_wildcard=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_relational_operator=insert +org.eclipse.jdt.core.formatter.insert_space_before_semicolon=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_for=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_semicolon_in_try_resources=do not insert +org.eclipse.jdt.core.formatter.insert_space_before_shift_operator=insert +org.eclipse.jdt.core.formatter.insert_space_before_string_concatenation=insert +org.eclipse.jdt.core.formatter.insert_space_before_unary_operator=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_brackets_in_array_type_reference=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_empty_braces_in_array_initializer=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_empty_brackets_in_array_allocation_expression=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_annotation_type_member_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_constructor_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_enum_constant=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_declaration=do not insert +org.eclipse.jdt.core.formatter.insert_space_between_empty_parens_in_method_invocation=do not insert +org.eclipse.jdt.core.formatter.join_lines_in_comments=true +org.eclipse.jdt.core.formatter.join_wrapped_lines=true +org.eclipse.jdt.core.formatter.keep_annotation_declaration_on_one_line=one_line_never +org.eclipse.jdt.core.formatter.keep_anonymous_type_declaration_on_one_line=one_line_never +org.eclipse.jdt.core.formatter.keep_code_block_on_one_line=one_line_never +org.eclipse.jdt.core.formatter.keep_else_statement_on_same_line=false +org.eclipse.jdt.core.formatter.keep_empty_array_initializer_on_one_line=false +org.eclipse.jdt.core.formatter.keep_enum_constant_declaration_on_one_line=one_line_never +org.eclipse.jdt.core.formatter.keep_enum_declaration_on_one_line=one_line_never +org.eclipse.jdt.core.formatter.keep_if_then_body_block_on_one_line=one_line_never 
+org.eclipse.jdt.core.formatter.keep_imple_if_on_one_line=false +org.eclipse.jdt.core.formatter.keep_lambda_body_block_on_one_line=one_line_never +org.eclipse.jdt.core.formatter.keep_loop_body_block_on_one_line=one_line_never +org.eclipse.jdt.core.formatter.keep_method_body_on_one_line=one_line_never +org.eclipse.jdt.core.formatter.keep_record_constructor_on_one_line=one_line_never +org.eclipse.jdt.core.formatter.keep_record_declaration_on_one_line=one_line_never +org.eclipse.jdt.core.formatter.keep_simple_do_while_body_on_same_line=false +org.eclipse.jdt.core.formatter.keep_simple_for_body_on_same_line=false +org.eclipse.jdt.core.formatter.keep_simple_getter_setter_on_one_line=false +org.eclipse.jdt.core.formatter.keep_simple_while_body_on_same_line=false +org.eclipse.jdt.core.formatter.keep_then_statement_on_same_line=false +org.eclipse.jdt.core.formatter.keep_type_declaration_on_one_line=one_line_never +org.eclipse.jdt.core.formatter.lineSplit=100 +org.eclipse.jdt.core.formatter.never_indent_block_comments_on_first_column=false +org.eclipse.jdt.core.formatter.never_indent_line_comments_on_first_column=false +org.eclipse.jdt.core.formatter.number_of_blank_lines_after_code_block=0 +org.eclipse.jdt.core.formatter.number_of_blank_lines_at_beginning_of_code_block=0 +org.eclipse.jdt.core.formatter.number_of_blank_lines_at_beginning_of_method_body=0 +org.eclipse.jdt.core.formatter.number_of_blank_lines_at_end_of_code_block=0 +org.eclipse.jdt.core.formatter.number_of_blank_lines_at_end_of_method_body=0 +org.eclipse.jdt.core.formatter.number_of_blank_lines_before_code_block=0 +org.eclipse.jdt.core.formatter.number_of_empty_lines_to_preserve=1 +org.eclipse.jdt.core.formatter.parentheses_positions_in_annotation=common_lines +org.eclipse.jdt.core.formatter.parentheses_positions_in_catch_clause=common_lines +org.eclipse.jdt.core.formatter.parentheses_positions_in_enum_constant_declaration=common_lines +org.eclipse.jdt.core.formatter.parentheses_positions_in_for_statment=common_lines +org.eclipse.jdt.core.formatter.parentheses_positions_in_if_while_statement=common_lines +org.eclipse.jdt.core.formatter.parentheses_positions_in_lambda_declaration=common_lines +org.eclipse.jdt.core.formatter.parentheses_positions_in_method_delcaration=common_lines +org.eclipse.jdt.core.formatter.parentheses_positions_in_method_invocation=common_lines +org.eclipse.jdt.core.formatter.parentheses_positions_in_record_declaration=common_lines +org.eclipse.jdt.core.formatter.parentheses_positions_in_switch_statement=common_lines +org.eclipse.jdt.core.formatter.parentheses_positions_in_try_clause=common_lines +org.eclipse.jdt.core.formatter.put_empty_statement_on_new_line=true +org.eclipse.jdt.core.formatter.tabulation.char=space +org.eclipse.jdt.core.formatter.tabulation.size=4 +org.eclipse.jdt.core.formatter.text_block_indentation=0 +org.eclipse.jdt.core.formatter.use_on_off_tags=true +org.eclipse.jdt.core.formatter.use_tabs_only_for_leading_indentations=true +org.eclipse.jdt.core.formatter.wrap_before_additive_operator=true +org.eclipse.jdt.core.formatter.wrap_before_assertion_message_operator=true +org.eclipse.jdt.core.formatter.wrap_before_assignment_operator=false +org.eclipse.jdt.core.formatter.wrap_before_bitwise_operator=true +org.eclipse.jdt.core.formatter.wrap_before_conditional_operator=true +org.eclipse.jdt.core.formatter.wrap_before_logical_operator=true +org.eclipse.jdt.core.formatter.wrap_before_multiplicative_operator=true +org.eclipse.jdt.core.formatter.wrap_before_or_operator_multicatch=true 
+org.eclipse.jdt.core.formatter.wrap_before_relational_operator=true +org.eclipse.jdt.core.formatter.wrap_before_shift_operator=true +org.eclipse.jdt.core.formatter.wrap_before_string_concatenation=true +org.eclipse.jdt.core.formatter.wrap_outer_expressions_when_nested=true +org.eclipse.jdt.core.javaFormatter=org.eclipse.jdt.core.defaultJavaFormatter diff --git a/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/eclipse/org.eclipse.jdt.ui.prefs b/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/eclipse/org.eclipse.jdt.ui.prefs new file mode 100644 index 00000000..5265dda3 --- /dev/null +++ b/dkpro-jwpl-build/src/main/resources/dkpro-jwpl/eclipse/org.eclipse.jdt.ui.prefs @@ -0,0 +1,76 @@ +eclipse.preferences.version=1 +editor_save_participant_org.eclipse.jdt.ui.postsavelistener.cleanup=true +formatter_profile=_DKPro Style +formatter_settings_version=20 +sp_cleanup.add_default_serial_version_id=true +sp_cleanup.add_generated_serial_version_id=false +sp_cleanup.add_missing_annotations=true +sp_cleanup.add_missing_deprecated_annotations=true +sp_cleanup.add_missing_methods=false +sp_cleanup.add_missing_nls_tags=false +sp_cleanup.add_missing_override_annotations=true +sp_cleanup.add_missing_override_annotations_interface_methods=true +sp_cleanup.add_serial_version_id=false +sp_cleanup.always_use_blocks=true +sp_cleanup.always_use_parentheses_in_expressions=false +sp_cleanup.always_use_this_for_non_static_field_access=false +sp_cleanup.always_use_this_for_non_static_method_access=false +sp_cleanup.convert_functional_interfaces=false +sp_cleanup.convert_to_enhanced_for_loop=false +sp_cleanup.convert_to_enhanced_for_loop_if_loop_var_used=false +sp_cleanup.correct_indentation=false +sp_cleanup.format_source_code=true +sp_cleanup.format_source_code_changes_only=false +sp_cleanup.insert_inferred_type_arguments=false +sp_cleanup.lazy_logical_operator=false +sp_cleanup.make_local_variable_final=false +sp_cleanup.make_parameters_final=false +sp_cleanup.make_private_fields_final=true +sp_cleanup.make_type_abstract_if_missing_method=false +sp_cleanup.make_variable_declarations_final=false +sp_cleanup.merge_conditional_blocks=false +sp_cleanup.never_use_blocks=false +sp_cleanup.never_use_parentheses_in_expressions=true +sp_cleanup.number_suffix=false +sp_cleanup.objects_equals=false +sp_cleanup.on_save_use_additional_actions=true +sp_cleanup.organize_imports=true +sp_cleanup.precompile_regex=false +sp_cleanup.push_down_negation=false +sp_cleanup.qualify_static_field_accesses_with_declaring_class=false +sp_cleanup.qualify_static_member_accesses_through_instances_with_declaring_class=true +sp_cleanup.qualify_static_member_accesses_through_subtypes_with_declaring_class=true +sp_cleanup.qualify_static_member_accesses_with_declaring_class=false +sp_cleanup.qualify_static_method_accesses_with_declaring_class=false +sp_cleanup.remove_private_constructors=true +sp_cleanup.remove_redundant_modifiers=false +sp_cleanup.remove_redundant_semicolons=false +sp_cleanup.remove_redundant_type_arguments=false +sp_cleanup.remove_trailing_whitespaces=false +sp_cleanup.remove_trailing_whitespaces_all=true +sp_cleanup.remove_trailing_whitespaces_ignore_empty=false +sp_cleanup.remove_unnecessary_array_creation=false +sp_cleanup.remove_unnecessary_casts=false +sp_cleanup.remove_unnecessary_nls_tags=false +sp_cleanup.remove_unused_imports=true +sp_cleanup.remove_unused_local_variables=false +sp_cleanup.remove_unused_private_fields=true +sp_cleanup.remove_unused_private_members=false +sp_cleanup.remove_unused_private_methods=true 
+sp_cleanup.remove_unused_private_types=true +sp_cleanup.simplify_lambda_expression_and_method_ref=false +sp_cleanup.sort_members=false +sp_cleanup.sort_members_all=false +sp_cleanup.use_anonymous_class_creation=false +sp_cleanup.use_autoboxing=false +sp_cleanup.use_blocks=true +sp_cleanup.use_blocks_only_for_return_and_throw=false +sp_cleanup.use_directly_map_method=false +sp_cleanup.use_lambda=false +sp_cleanup.use_parentheses_in_expressions=false +sp_cleanup.use_this_for_non_static_field_access=false +sp_cleanup.use_this_for_non_static_field_access_only_if_necessary=true +sp_cleanup.use_this_for_non_static_method_access=false +sp_cleanup.use_this_for_non_static_method_access_only_if_necessary=true +sp_cleanup.use_unboxing=false +sp_cleanup.use_var=false diff --git a/installEclipseSettings.sh b/installEclipseSettings.sh new file mode 100755 index 00000000..aa4c7145 --- /dev/null +++ b/installEclipseSettings.sh @@ -0,0 +1,24 @@ +#/bin/sh + +# Formatter settings +JDT_CORE_PREFS="dkpro-jwpl-build/src/main/resources/dkpro-jwpl/eclipse/org.eclipse.jdt.core.prefs" + +# Save actions +JDT_UI_PREFS="dkpro-jwpl-build/src/main/resources/dkpro-jwpl/eclipse/org.eclipse.jdt.ui.prefs" + +function installPrefs { + mkdir -p $1/.settings/ + cp -v $JDT_CORE_PREFS $1/.settings/ + cp -v $JDT_UI_PREFS $1/.settings/ +} + +installPrefs dkpro-jwpl-api +installPrefs dkpro-jwpl-datamachine +installPrefs dkpro-jwpl-deps +installPrefs dkpro-jwpl-mwdumper +installPrefs dkpro-jwpl-parser +installPrefs dkpro-jwpl-revisionmachine +installPrefs dkpro-jwpl-timemachine +installPrefs dkpro-jwpl-tutorial +installPrefs dkpro-jwpl-util +installPrefs dkpro-jwpl-wikimachine From eec53dd2ece2d431274c1231ca58e96e20c17c6a Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 31 Oct 2023 14:05:38 +0100 Subject: [PATCH 03/14] #164 - Introduce checkstyle - Change GH action to build up to the verify stage which includes checkstyle --- .github/workflows/maven.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/maven.yml b/.github/workflows/maven.yml index 821b99dc..b9e759a0 100644 --- a/.github/workflows/maven.yml +++ b/.github/workflows/maven.yml @@ -41,4 +41,4 @@ jobs: distribution: temurin java-version: ${{ matrix.java }} - name: Build with Maven - run: mvn -V clean package --no-transfer-progress \ No newline at end of file + run: mvn -V clean verify --no-transfer-progress From d74ee2675640abf9efde2ea12f12b895c9e9860b Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 31 Oct 2023 14:23:40 +0100 Subject: [PATCH 04/14] #164 - Introduce checkstyle - Organize imports --- .../org/dkpro/jwpl/api/PerformanceIT.java | 10 ++-- .../org/dkpro/jwpl/api/PerformanceTest.java | 10 ++-- .../main/java/org/dkpro/jwpl/api/Page.java | 12 ++--- .../java/org/dkpro/jwpl/api/PageIterator.java | 4 +- .../org/dkpro/jwpl/api/WikiConstants.java | 10 ++-- .../org/dkpro/jwpl/api/WikipediaInfo.java | 3 +- .../org/dkpro/jwpl/api/hibernate/PageDAO.java | 4 +- .../jwpl/api/sweble/PlainTextConverter.java | 53 ++++++++++--------- .../api/sweble/TemplateNameExtractor.java | 7 +-- .../dkpro/jwpl/util/HibernateUtilities.java | 5 +- .../api/CategoryDescendantsIteratorTest.java | 7 ++- .../org/dkpro/jwpl/api/CategoryGraphTest.java | 9 ++-- .../dkpro/jwpl/api/CategoryIteratorTest.java | 6 +-- .../java/org/dkpro/jwpl/api/CategoryTest.java | 17 +++--- .../java/org/dkpro/jwpl/api/MetaDataTest.java | 15 +++--- .../org/dkpro/jwpl/api/PageIteratorTest.java | 8 +-- .../java/org/dkpro/jwpl/api/PageTest.java | 22 
++++---- .../org/dkpro/jwpl/api/TitleIteratorTest.java | 6 +-- .../java/org/dkpro/jwpl/api/TitleTest.java | 7 ++- .../org/dkpro/jwpl/api/WikiConfigTest.java | 5 +- .../org/dkpro/jwpl/api/WikipediaTest.java | 25 ++++----- .../jwpl/api/util/GraphSerializationTest.java | 10 ++-- .../version/SingleDumpVersionOriginal.java | 1 + .../org/dkpro/jwpl/mwdumper/dumper/Tools.java | 6 +-- .../jwpl/mwdumper/importer/TitleTest.java | 8 +-- .../dkpro/jwpl/parser/html/HtmlWriter.java | 2 +- .../parser/selectiveaccess/ConfigLoader.java | 5 +- .../SelectiveAccessHandler.java | 3 +- .../org/dkpro/jwpl/parser/ParsedPageTest.java | 9 ++-- .../api/AbstractRevisionService.java | 8 +-- .../jwpl/revisionmachine/api/Revision.java | 1 - .../jwpl/revisionmachine/api/RevisionApi.java | 7 ++- .../archivers/Bzip2Archiver.java | 6 +-- .../revisionmachine/difftool/DiffTool.java | 3 +- .../difftool/config/ConfigurationReader.java | 13 +++-- .../article/reader/WikipediaXMLReader.java | 13 +++-- .../consumer/dump/codec/DataFileEncoder.java | 2 - .../consumer/dump/codec/TimedSQLEncoder.java | 1 - .../jwpl/revisionmachine/RevisionApiTest.java | 21 ++++---- .../revisionmachine/RevisionIteratorTest.java | 19 ++++--- .../version/DumpVersionFastUtilIntKey.java | 7 +-- .../jwpl/tutorial/parser/T7_HtmlFileDemo.java | 2 +- .../util/templates/parser/ParseUtils.java | 8 +-- .../templates/parser/SectionExtractor.java | 27 ++++++++-- .../decompression/BZip2Decompressor.java | 4 +- .../dump/xml/AbstractXmlDumpReader.java | 18 ++++--- .../wikimachine/factory/SpringFactory.java | 9 ++-- 47 files changed, 232 insertions(+), 226 deletions(-) diff --git a/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceIT.java b/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceIT.java index b2bc34d7..1a621a36 100644 --- a/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceIT.java +++ b/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceIT.java @@ -17,20 +17,18 @@ */ package org.dkpro.jwpl.api; -import org.dkpro.jwpl.api.exception.WikiApiException; - import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.lang.invoke.MethodHandles; import java.util.Properties; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import org.junit.jupiter.api.BeforeEach; +import org.dkpro.jwpl.api.exception.WikiApiException; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class PerformanceIT implements WikiConstants { diff --git a/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceTest.java b/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceTest.java index da7a54cf..ce0d3adc 100644 --- a/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceTest.java +++ b/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceTest.java @@ -17,10 +17,7 @@ */ package org.dkpro.jwpl.api; -import org.dkpro.jwpl.api.exception.WikiApiException; -import org.dkpro.jwpl.util.GraphUtilities; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import static org.junit.jupiter.api.Assertions.assertNotNull; import java.lang.invoke.MethodHandles; import java.util.ArrayList; @@ -28,7 +25,10 @@ import java.util.List; import java.util.Set; -import static org.junit.jupiter.api.Assertions.assertNotNull; +import org.dkpro.jwpl.api.exception.WikiApiException; +import org.dkpro.jwpl.util.GraphUtilities; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; /** * 
Encapsulates the integration test code that stresses a Wikipedia backend to check the performance of it. diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Page.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Page.java index c46be17c..e56c56f7 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Page.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Page.java @@ -20,6 +20,12 @@ import java.util.HashSet; import java.util.Set; +import org.dkpro.jwpl.api.exception.WikiApiException; +import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; +import org.dkpro.jwpl.api.exception.WikiTitleParsingException; +import org.dkpro.jwpl.api.hibernate.PageDAO; +import org.dkpro.jwpl.api.sweble.PlainTextConverter; +import org.dkpro.jwpl.util.UnmodifiableArraySet; import org.hibernate.LockOptions; import org.hibernate.Session; import org.hibernate.type.StandardBasicTypes; @@ -29,12 +35,6 @@ import org.sweble.wikitext.engine.nodes.EngProcessedPage; import de.fau.cs.osr.ptk.common.AstVisitor; -import org.dkpro.jwpl.api.exception.WikiApiException; -import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; -import org.dkpro.jwpl.api.exception.WikiTitleParsingException; -import org.dkpro.jwpl.api.hibernate.PageDAO; -import org.dkpro.jwpl.api.sweble.PlainTextConverter; -import org.dkpro.jwpl.util.UnmodifiableArraySet; /** * Represents a Wikipedia article page. diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterator.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterator.java index 1f985978..41416219 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterator.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterator.java @@ -24,13 +24,13 @@ import java.util.List; import java.util.Set; -import jakarta.persistence.TypedQuery; - import org.dkpro.jwpl.api.exception.WikiApiException; import org.hibernate.Session; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import jakarta.persistence.TypedQuery; + /** * An {@link Iterator} over {@link Page} objects. 
*/ diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikiConstants.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikiConstants.java index b5c2539f..b6ef2f57 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikiConstants.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikiConstants.java @@ -17,15 +17,17 @@ */ package org.dkpro.jwpl.api; -import com.neovisionaries.i18n.LanguageCode; +import java.io.IOException; +import java.util.List; + +import javax.xml.parsers.ParserConfigurationException; + import org.sweble.wikitext.engine.config.WikiConfig; import org.sweble.wikitext.engine.utils.DefaultConfigEnWp; import org.sweble.wikitext.engine.utils.LanguageConfigGenerator; import org.xml.sax.SAXException; -import javax.xml.parsers.ParserConfigurationException; -import java.io.IOException; -import java.util.List; +import com.neovisionaries.i18n.LanguageCode; public interface WikiConstants { /** diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java index 6d3d3cef..27b5dc13 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java @@ -26,11 +26,10 @@ import java.util.Map; import java.util.Set; -import org.hibernate.Session; - import org.dkpro.jwpl.api.exception.WikiApiException; import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; import org.dkpro.jwpl.util.ApiUtilities; +import org.hibernate.Session; import org.slf4j.Logger; import org.slf4j.LoggerFactory; diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageDAO.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageDAO.java index 1e3e86cf..dfaf4d29 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageDAO.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageDAO.java @@ -17,12 +17,12 @@ */ package org.dkpro.jwpl.api.hibernate; +import java.lang.invoke.MethodHandles; + import org.dkpro.jwpl.api.Wikipedia; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import java.lang.invoke.MethodHandles; - /** * Data access object for class {@link Page}. 
* diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/PlainTextConverter.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/PlainTextConverter.java index d77d2c49..38c4222e 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/PlainTextConverter.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/PlainTextConverter.java @@ -17,17 +17,12 @@ */ package org.dkpro.jwpl.api.sweble; -/* - * Derived from the TextConverter class which was published in the - * Sweble example project provided on - * http://http://sweble.org by the Open Source Research Group, - * University of Erlangen-Nürnberg under the Apache License, Version 2.0 - * (http://www.apache.org/licenses/LICENSE-2.0) - */ +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.LinkedList; +import java.util.List; +import java.util.regex.Pattern; -import de.fau.cs.osr.ptk.common.AstVisitor; -import de.fau.cs.osr.ptk.common.ast.AstText; -import de.fau.cs.osr.utils.StringTools; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.sweble.wikitext.engine.PageTitle; @@ -38,41 +33,47 @@ import org.sweble.wikitext.parser.nodes.WtHorizontalRule; import org.sweble.wikitext.parser.nodes.WtIllegalCodePoint; import org.sweble.wikitext.parser.nodes.WtImageLink; +import org.sweble.wikitext.parser.nodes.WtInnerNode2; import org.sweble.wikitext.parser.nodes.WtInternalLink; import org.sweble.wikitext.parser.nodes.WtItalics; -import org.sweble.wikitext.parser.nodes.WtInnerNode2; +import org.sweble.wikitext.parser.nodes.WtLinkTitle; import org.sweble.wikitext.parser.nodes.WtListItem; import org.sweble.wikitext.parser.nodes.WtNode; import org.sweble.wikitext.parser.nodes.WtNodeList; -import org.sweble.wikitext.parser.nodes.WtParagraph; import org.sweble.wikitext.parser.nodes.WtPage; +import org.sweble.wikitext.parser.nodes.WtParagraph; import org.sweble.wikitext.parser.nodes.WtSection; -import org.sweble.wikitext.parser.nodes.WtUrl; -import org.sweble.wikitext.parser.nodes.WtWhitespace; -import org.sweble.wikitext.parser.nodes.WtXmlElement; -import org.sweble.wikitext.parser.nodes.WtTagExtension; import org.sweble.wikitext.parser.nodes.WtTable; import org.sweble.wikitext.parser.nodes.WtTableCaption; -import org.sweble.wikitext.parser.nodes.WtTableHeader; -import org.sweble.wikitext.parser.nodes.WtTableRow; import org.sweble.wikitext.parser.nodes.WtTableCell; +import org.sweble.wikitext.parser.nodes.WtTableHeader; import org.sweble.wikitext.parser.nodes.WtTableImplicitTableBody; +import org.sweble.wikitext.parser.nodes.WtTableRow; +import org.sweble.wikitext.parser.nodes.WtTagExtension; import org.sweble.wikitext.parser.nodes.WtTemplate; import org.sweble.wikitext.parser.nodes.WtTemplateArgument; import org.sweble.wikitext.parser.nodes.WtTemplateParameter; -import org.sweble.wikitext.parser.nodes.WtLinkTitle; +import org.sweble.wikitext.parser.nodes.WtUrl; +import org.sweble.wikitext.parser.nodes.WtWhitespace; import org.sweble.wikitext.parser.nodes.WtXmlAttribute; -import org.sweble.wikitext.parser.nodes.WtXmlComment; import org.sweble.wikitext.parser.nodes.WtXmlCharRef; -import org.sweble.wikitext.parser.nodes.WtXmlEntityRef; +import org.sweble.wikitext.parser.nodes.WtXmlComment; +import org.sweble.wikitext.parser.nodes.WtXmlElement; import org.sweble.wikitext.parser.nodes.WtXmlEndTag; +import org.sweble.wikitext.parser.nodes.WtXmlEntityRef; import org.sweble.wikitext.parser.parser.LinkTargetException; -import java.lang.invoke.MethodHandles; -import 
java.util.ArrayList; -import java.util.LinkedList; -import java.util.List; -import java.util.regex.Pattern; +/* + * Derived from the TextConverter class which was published in the + * Sweble example project provided on + * http://http://sweble.org by the Open Source Research Group, + * University of Erlangen-Nürnberg under the Apache License, Version 2.0 + * (http://www.apache.org/licenses/LICENSE-2.0) + */ + +import de.fau.cs.osr.ptk.common.AstVisitor; +import de.fau.cs.osr.ptk.common.ast.AstText; +import de.fau.cs.osr.utils.StringTools; /** * A visitor to convert an article AST into a plain text representation. To diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/TemplateNameExtractor.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/TemplateNameExtractor.java index b1f1a28e..89881f5b 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/TemplateNameExtractor.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/TemplateNameExtractor.java @@ -29,14 +29,15 @@ import java.util.LinkedList; import java.util.List; -import de.fau.cs.osr.ptk.common.AstVisitor; -import de.fau.cs.osr.ptk.common.ast.AstNode; -import de.fau.cs.osr.ptk.common.ast.AstText; import org.sweble.wikitext.engine.config.WikiConfig; import org.sweble.wikitext.engine.utils.DefaultConfigEnWp; import org.sweble.wikitext.parser.nodes.WtNode; import org.sweble.wikitext.parser.nodes.WtTemplate; +import de.fau.cs.osr.ptk.common.AstVisitor; +import de.fau.cs.osr.ptk.common.ast.AstNode; +import de.fau.cs.osr.ptk.common.ast.AstText; + /** * A visitor that extracts template names (no parameters) from an article AST. */ diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/HibernateUtilities.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/HibernateUtilities.java index 9ef81ae9..0d8fd3ce 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/HibernateUtilities.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/HibernateUtilities.java @@ -20,11 +20,10 @@ import java.util.HashMap; import java.util.Map; -import org.dkpro.jwpl.api.WikiConstants; -import org.hibernate.Session; - import org.dkpro.jwpl.api.DatabaseConfiguration; +import org.dkpro.jwpl.api.WikiConstants; import org.dkpro.jwpl.api.hibernate.WikiHibernateUtil; +import org.hibernate.Session; /** * @deprecated To be removed without replacement. 
diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryDescendantsIteratorTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryDescendantsIteratorTest.java index f46013b0..b315516a 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryDescendantsIteratorTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryDescendantsIteratorTest.java @@ -17,18 +17,17 @@ */ package org.dkpro.jwpl.api; -import org.dkpro.jwpl.api.exception.WikiApiException; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import org.dkpro.jwpl.api.exception.WikiApiException; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.fail; - public class CategoryDescendantsIteratorTest extends BaseJWPLTest{ /** diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryGraphTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryGraphTest.java index 13fa9a65..93c419c2 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryGraphTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryGraphTest.java @@ -17,17 +17,16 @@ */ package org.dkpro.jwpl.api; -import org.dkpro.jwpl.api.exception.WikiApiException; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; import java.util.Map; +import org.dkpro.jwpl.api.exception.WikiApiException; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - public class CategoryGraphTest extends BaseJWPLTest{ private static CategoryGraph catGraph; diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryIteratorTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryIteratorTest.java index bbf6b47e..f47ee279 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryIteratorTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryIteratorTest.java @@ -17,14 +17,14 @@ */ package org.dkpro.jwpl.api; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + import java.util.Iterator; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.fail; - public class CategoryIteratorTest extends BaseJWPLTest { /** diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryTest.java index a12c8528..0e13f7f4 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryTest.java @@ -17,9 +17,11 @@ */ package org.dkpro.jwpl.api; -import org.dkpro.jwpl.api.exception.WikiApiException; -import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; -import org.dkpro.jwpl.api.exception.WikiTitleParsingException; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; 
+import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; import java.util.ArrayList; import java.util.Collections; @@ -28,15 +30,12 @@ import java.util.Set; import java.util.UUID; +import org.dkpro.jwpl.api.exception.WikiApiException; +import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; +import org.dkpro.jwpl.api.exception.WikiTitleParsingException; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - public class CategoryTest extends BaseJWPLTest { private static final String A_FAMOUS_CATEGORY = "People of UKP"; diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/MetaDataTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/MetaDataTest.java index 3c349679..877b4182 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/MetaDataTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/MetaDataTest.java @@ -17,19 +17,18 @@ */ package org.dkpro.jwpl.api; -import org.dkpro.jwpl.api.exception.WikiApiException; - -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.fail; +import org.dkpro.jwpl.api.exception.WikiApiException; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + public class MetaDataTest extends BaseJWPLTest { // The object under test diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageIteratorTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageIteratorTest.java index e2dd8c11..9cf00e96 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageIteratorTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageIteratorTest.java @@ -17,15 +17,15 @@ */ package org.dkpro.jwpl.api; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.fail; + import java.util.Iterator; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.fail; - public class PageIteratorTest extends BaseJWPLTest { /** diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageTest.java index c618fdc5..3f617e2a 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageTest.java @@ -17,28 +17,26 @@ */ package org.dkpro.jwpl.api; -import org.dkpro.jwpl.api.exception.WikiApiException; -import 
org.dkpro.jwpl.api.exception.WikiPageNotFoundException; -import org.dkpro.jwpl.api.exception.WikiTitleParsingException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.List; import java.util.Set; +import org.dkpro.jwpl.api.exception.WikiApiException; +import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; +import org.dkpro.jwpl.api.exception.WikiTitleParsingException; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.fail; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class PageTest extends BaseJWPLTest { diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleIteratorTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleIteratorTest.java index a4cc38b2..ecf3545b 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleIteratorTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleIteratorTest.java @@ -17,13 +17,13 @@ */ package org.dkpro.jwpl.api; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.fail; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + public class TitleIteratorTest extends BaseJWPLTest{ /** diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleTest.java index 4c85f89e..01ea2e99 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleTest.java @@ -17,13 +17,12 @@ */ package org.dkpro.jwpl.api; -import org.dkpro.jwpl.api.exception.WikiTitleParsingException; - -import org.junit.jupiter.api.Test; - import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNull; +import org.dkpro.jwpl.api.exception.WikiTitleParsingException; +import org.junit.jupiter.api.Test; + public class TitleTest { @Test diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikiConfigTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikiConfigTest.java index 48289fab..a921f16c 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikiConfigTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikiConfigTest.java @@ -17,11 +17,10 @@ */ package org.dkpro.jwpl.api; -import org.sweble.wikitext.engine.config.WikiConfig; +import static org.junit.jupiter.api.Assertions.assertSame; import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.assertSame; +import org.sweble.wikitext.engine.config.WikiConfig; public class WikiConfigTest { diff --git 
a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java index be18194c..797dfca2 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java @@ -17,28 +17,25 @@ */ package org.dkpro.jwpl.api; -import org.dkpro.jwpl.api.exception.WikiApiException; -import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; -import org.dkpro.jwpl.api.exception.WikiTitleParsingException; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; import java.lang.invoke.MethodHandles; import java.util.List; import java.util.Set; import java.util.UUID; - +import org.dkpro.jwpl.api.exception.WikiApiException; +import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; +import org.dkpro.jwpl.api.exception.WikiTitleParsingException; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertFalse; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertNull; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class WikipediaTest extends BaseJWPLTest{ diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/util/GraphSerializationTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/util/GraphSerializationTest.java index f927a02d..7a4cc8d4 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/util/GraphSerializationTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/util/GraphSerializationTest.java @@ -17,6 +17,10 @@ */ package org.dkpro.jwpl.api.util; +import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.fail; + import java.io.File; import org.dkpro.jwpl.api.BaseJWPLTest; @@ -27,14 +31,10 @@ import org.jgrapht.graph.DefaultDirectedGraph; import org.jgrapht.graph.DefaultEdge; import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertDoesNotThrow; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.fail; - /** * Tests for the correctness of the Category graph construction and its serialization
* process. diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionOriginal.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionOriginal.java index 27c29383..51c297bf 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionOriginal.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionOriginal.java @@ -34,6 +34,7 @@ import org.dkpro.jwpl.wikimachine.dump.xml.TextParser; import org.dkpro.jwpl.wikimachine.util.Redirects; import org.dkpro.jwpl.wikimachine.util.TxtFileWriter; + import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; import it.unimi.dsi.fastutil.ints.IntArraySet; import it.unimi.dsi.fastutil.ints.IntSet; diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Tools.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Tools.java index ace8ea6f..a36eed96 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Tools.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Tools.java @@ -17,9 +17,6 @@ */ package org.dkpro.jwpl.mwdumper.dumper; -import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; -import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; - import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; @@ -30,6 +27,9 @@ import java.io.OutputStream; import java.util.zip.GZIPInputStream; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; + public class Tools { static final int IN_BUF_SZ = 1024 * 1024; private static final int OUT_BUF_SZ = 1024 * 1024; diff --git a/dkpro-jwpl-mwdumper/src/test/java/org/dkpro/jwpl/mwdumper/importer/TitleTest.java b/dkpro-jwpl-mwdumper/src/test/java/org/dkpro/jwpl/mwdumper/importer/TitleTest.java index 8aa596bd..b8bc6c4f 100644 --- a/dkpro-jwpl-mwdumper/src/test/java/org/dkpro/jwpl/mwdumper/importer/TitleTest.java +++ b/dkpro-jwpl-mwdumper/src/test/java/org/dkpro/jwpl/mwdumper/importer/TitleTest.java @@ -25,16 +25,16 @@ */ -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotEquals; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + public class TitleTest { NamespaceSet namespaces; diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/HtmlWriter.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/HtmlWriter.java index 69460d14..1b08d20c 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/HtmlWriter.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/HtmlWriter.java @@ -40,8 +40,8 @@ import org.dkpro.jwpl.parser.Paragraph; import org.dkpro.jwpl.parser.ParsedPage; import org.dkpro.jwpl.parser.Section; -import org.dkpro.jwpl.parser.SectionContent; import org.dkpro.jwpl.parser.SectionContainer; +import org.dkpro.jwpl.parser.SectionContent; import org.dkpro.jwpl.parser.Span; 
import org.dkpro.jwpl.parser.Table; import org.dkpro.jwpl.parser.TableElement; diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/ConfigLoader.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/ConfigLoader.java index 426cf2c3..939abe0f 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/ConfigLoader.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/ConfigLoader.java @@ -20,11 +20,10 @@ import java.util.EnumMap; import java.util.Map; -import org.xml.sax.Attributes; -import org.xml.sax.helpers.DefaultHandler; - import org.dkpro.jwpl.parser.selectiveaccess.SelectiveAccessHandler.CIT; import org.dkpro.jwpl.parser.selectiveaccess.SelectiveAccessHandler.SIT; +import org.xml.sax.Attributes; +import org.xml.sax.helpers.DefaultHandler; class ConfigLoader extends DefaultHandler { final SelectiveAccessHandler sah; diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/SelectiveAccessHandler.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/SelectiveAccessHandler.java index e8d3d21c..23f54558 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/SelectiveAccessHandler.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/SelectiveAccessHandler.java @@ -28,8 +28,6 @@ import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; -import org.xml.sax.helpers.DefaultHandler; - import org.dkpro.jwpl.parser.Content; import org.dkpro.jwpl.parser.Content.FormatType; import org.dkpro.jwpl.parser.DefinitionList; @@ -42,6 +40,7 @@ import org.dkpro.jwpl.parser.SectionContent; import org.dkpro.jwpl.parser.Span; import org.dkpro.jwpl.parser.Table; +import org.xml.sax.helpers.DefaultHandler; /** * Provides access to a ParsedPage at an abstract Level. diff --git a/dkpro-jwpl-parser/src/test/java/org/dkpro/jwpl/parser/ParsedPageTest.java b/dkpro-jwpl-parser/src/test/java/org/dkpro/jwpl/parser/ParsedPageTest.java index b65f4557..6aa765dc 100644 --- a/dkpro-jwpl-parser/src/test/java/org/dkpro/jwpl/parser/ParsedPageTest.java +++ b/dkpro-jwpl-parser/src/test/java/org/dkpro/jwpl/parser/ParsedPageTest.java @@ -17,6 +17,10 @@ * limitations under the License. 
*/ +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.fail; + import org.dkpro.jwpl.api.DatabaseConfiguration; import org.dkpro.jwpl.api.Page; import org.dkpro.jwpl.api.WikiConstants.Language; @@ -24,14 +28,9 @@ import org.dkpro.jwpl.api.exception.WikiApiException; import org.dkpro.jwpl.parser.mediawiki.MediaWikiParser; import org.dkpro.jwpl.parser.mediawiki.MediaWikiParserFactory; - import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.fail; - public class ParsedPageTest extends BaseJWPLTest{ private static final String LF = "\n"; diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/AbstractRevisionService.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/AbstractRevisionService.java index 4e0932b3..ee562855 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/AbstractRevisionService.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/AbstractRevisionService.java @@ -17,14 +17,14 @@ */ package org.dkpro.jwpl.revisionmachine.api; -import org.dkpro.jwpl.api.exception.WikiApiException; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - import java.sql.Connection; import java.sql.DriverManager; import java.sql.SQLException; +import org.dkpro.jwpl.api.exception.WikiApiException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /** * A common base class that handles the aspect of database connection handling. 
*/ diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Revision.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Revision.java index 90f94b1e..cfa5add3 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Revision.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Revision.java @@ -24,7 +24,6 @@ import java.util.Collection; import org.apache.commons.lang3.StringEscapeUtils; - import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.ISizeable; import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.DiffPart; diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionApi.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionApi.java index 4fa6e07c..29e32d04 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionApi.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionApi.java @@ -18,7 +18,12 @@ package org.dkpro.jwpl.revisionmachine.api; import java.io.IOException; -import java.sql.*; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Timestamp; +import java.sql.Types; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/archivers/Bzip2Archiver.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/archivers/Bzip2Archiver.java index 0c985d70..f03d757c 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/archivers/Bzip2Archiver.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/archivers/Bzip2Archiver.java @@ -17,9 +17,6 @@ */ package org.dkpro.jwpl.revisionmachine.archivers; -import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; -import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; - import java.io.BufferedInputStream; import java.io.BufferedOutputStream; import java.io.File; @@ -29,6 +26,9 @@ import java.io.InputStreamReader; import java.io.OutputStream; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; + /** * Class provides basic bzip2 compression/decompression functionality */ diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffTool.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffTool.java index 3f12f6cd..fbf05942 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffTool.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffTool.java @@ -21,10 +21,9 @@ import javax.xml.parsers.ParserConfigurationException; -import org.xml.sax.SAXException; - import org.dkpro.jwpl.revisionmachine.difftool.config.ConfigurationReader; import org.dkpro.jwpl.revisionmachine.difftool.config.gui.control.ConfigSettings; +import org.xml.sax.SAXException; /** * This class contains the start method for the DiffTool application. 
diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationReader.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationReader.java index a1f4ff55..cd264abd 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationReader.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationReader.java @@ -25,19 +25,18 @@ import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; -import org.slf4j.event.Level; -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.SAXException; - import org.dkpro.jwpl.revisionmachine.difftool.config.gui.control.ConfigSettings; import org.dkpro.jwpl.revisionmachine.difftool.config.gui.data.ConfigEnum; import org.dkpro.jwpl.revisionmachine.difftool.data.OutputType; import org.dkpro.jwpl.revisionmachine.difftool.data.SurrogateModes; import org.dkpro.jwpl.revisionmachine.difftool.data.archive.ArchiveDescription; import org.dkpro.jwpl.revisionmachine.difftool.data.archive.InputType; +import org.slf4j.event.Level; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; /** * This Reader reads the xml-configuration files for the DiffTool. diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/WikipediaXMLReader.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/WikipediaXMLReader.java index 0a1fc9cb..6d39083a 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/WikipediaXMLReader.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/WikipediaXMLReader.java @@ -26,13 +26,6 @@ import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; -import org.w3c.dom.Document; -import org.w3c.dom.NamedNodeMap; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; - import org.dkpro.jwpl.revisionmachine.api.Revision; import org.dkpro.jwpl.revisionmachine.common.exceptions.ArticleReaderException; import org.dkpro.jwpl.revisionmachine.common.exceptions.ConfigurationException; @@ -47,6 +40,12 @@ import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.Task; import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.TaskTypes; import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.info.ArticleInformation; +import org.w3c.dom.Document; +import org.w3c.dom.NamedNodeMap; +import org.w3c.dom.Node; +import org.w3c.dom.NodeList; +import org.xml.sax.InputSource; +import org.xml.sax.SAXException; /** * This class parses the wikipedia xml format. 
diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/DataFileEncoder.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/DataFileEncoder.java index f954e7b2..25b3d3b9 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/DataFileEncoder.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/DataFileEncoder.java @@ -24,8 +24,6 @@ import org.dkpro.jwpl.revisionmachine.common.exceptions.ConfigurationException; import org.dkpro.jwpl.revisionmachine.common.exceptions.DecodingException; import org.dkpro.jwpl.revisionmachine.common.exceptions.EncodingException; -import org.dkpro.jwpl.revisionmachine.common.exceptions.SQLConsumerException; -import org.dkpro.jwpl.revisionmachine.difftool.data.codec.RevisionCodecData; import org.dkpro.jwpl.revisionmachine.difftool.data.codec.RevisionEncoder; import org.dkpro.jwpl.revisionmachine.difftool.data.codec.RevisionEncoderInterface; import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.Task; diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/TimedSQLEncoder.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/TimedSQLEncoder.java index 0575795f..8704a0b8 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/TimedSQLEncoder.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/TimedSQLEncoder.java @@ -22,7 +22,6 @@ import org.dkpro.jwpl.revisionmachine.common.exceptions.ConfigurationException; import org.dkpro.jwpl.revisionmachine.common.exceptions.DecodingException; import org.dkpro.jwpl.revisionmachine.common.exceptions.EncodingException; -import org.dkpro.jwpl.revisionmachine.common.exceptions.LoggingException; import org.dkpro.jwpl.revisionmachine.common.exceptions.SQLConsumerException; import org.dkpro.jwpl.revisionmachine.common.logging.Logger; import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.Task; diff --git a/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionApiTest.java b/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionApiTest.java index d032f033..436e6a5f 100644 --- a/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionApiTest.java +++ b/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionApiTest.java @@ -17,12 +17,9 @@ */ package org.dkpro.jwpl.revisionmachine; -import org.dkpro.jwpl.api.DatabaseConfiguration; -import org.dkpro.jwpl.api.WikiConstants.Language; -import org.dkpro.jwpl.api.Wikipedia; -import org.dkpro.jwpl.api.exception.WikiApiException; -import org.dkpro.jwpl.revisionmachine.api.Revision; -import org.dkpro.jwpl.revisionmachine.api.RevisionApi; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.fail; import java.lang.reflect.Field; import java.sql.SQLException; @@ -31,16 +28,18 @@ import java.time.ZoneOffset; import java.util.Calendar; +import org.dkpro.jwpl.api.DatabaseConfiguration; +import org.dkpro.jwpl.api.WikiConstants.Language; +import org.dkpro.jwpl.api.Wikipedia; +import org.dkpro.jwpl.api.exception.WikiApiException; 
+import org.dkpro.jwpl.revisionmachine.api.Revision; +import org.dkpro.jwpl.revisionmachine.api.RevisionApi; import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.fail; - public class RevisionApiTest extends BaseJWPLTest { private static Wikipedia wiki = null; diff --git a/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionIteratorTest.java b/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionIteratorTest.java index ed43efec..19a2ec40 100644 --- a/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionIteratorTest.java +++ b/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionIteratorTest.java @@ -17,6 +17,14 @@ */ package org.dkpro.jwpl.revisionmachine; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assertions.fail; + +import java.sql.SQLException; +import java.util.ArrayList; + import org.dkpro.jwpl.api.DatabaseConfiguration; import org.dkpro.jwpl.api.WikiConstants.Language; import org.dkpro.jwpl.api.Wikipedia; @@ -24,20 +32,11 @@ import org.dkpro.jwpl.revisionmachine.api.Revision; import org.dkpro.jwpl.revisionmachine.api.RevisionAPIConfiguration; import org.dkpro.jwpl.revisionmachine.api.RevisionIterator; - -import java.sql.SQLException; -import java.util.ArrayList; - import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - public class RevisionIteratorTest extends BaseJWPLTest { // Note: In the stripped HSQLDB data set only 382 revisions exist for the Page 'Car' diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionFastUtilIntKey.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionFastUtilIntKey.java index 2e3f19f1..d77cd08a 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionFastUtilIntKey.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionFastUtilIntKey.java @@ -22,9 +22,6 @@ import java.util.HashMap; import java.util.Map; -import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; -import it.unimi.dsi.fastutil.ints.IntArraySet; -import it.unimi.dsi.fastutil.ints.IntSet; import org.dkpro.jwpl.timemachine.domain.Revision; import org.dkpro.jwpl.wikimachine.dump.sql.CategorylinksParser; import org.dkpro.jwpl.wikimachine.dump.sql.PagelinksParser; @@ -36,6 +33,10 @@ import org.dkpro.jwpl.wikimachine.util.TimestampUtil; import org.dkpro.jwpl.wikimachine.util.TxtFileWriter; +import it.unimi.dsi.fastutil.ints.Int2IntOpenHashMap; +import it.unimi.dsi.fastutil.ints.IntArraySet; +import 
it.unimi.dsi.fastutil.ints.IntSet; + public class DumpVersionFastUtilIntKey extends AbstractDumpVersion { private static final String SQL_NULL = "NULL"; /** diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T7_HtmlFileDemo.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T7_HtmlFileDemo.java index ae16bcc1..8e5bec69 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T7_HtmlFileDemo.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T7_HtmlFileDemo.java @@ -18,9 +18,9 @@ package org.dkpro.jwpl.tutorial.parser; import org.dkpro.jwpl.parser.ParsedPage; +import org.dkpro.jwpl.parser.html.HtmlWriter; import org.dkpro.jwpl.parser.mediawiki.MediaWikiParser; import org.dkpro.jwpl.parser.mediawiki.MediaWikiParserFactory; -import org.dkpro.jwpl.parser.html.HtmlWriter; /** * This class shows how to use the HtmlTools.class...
diff --git a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/parser/ParseUtils.java b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/parser/ParseUtils.java index d6a56b5a..1217adb9 100644 --- a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/parser/ParseUtils.java +++ b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/parser/ParseUtils.java @@ -21,18 +21,18 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.sweble.wikitext.engine.config.WikiConfig; +import org.dkpro.jwpl.api.sweble.TemplateNameExtractor; +import org.dkpro.jwpl.util.templates.parser.SectionExtractor.ExtractedSection; import org.sweble.wikitext.engine.EngineException; -import org.sweble.wikitext.engine.nodes.EngProcessedPage; import org.sweble.wikitext.engine.PageId; import org.sweble.wikitext.engine.PageTitle; import org.sweble.wikitext.engine.WtEngineImpl; +import org.sweble.wikitext.engine.config.WikiConfig; +import org.sweble.wikitext.engine.nodes.EngProcessedPage; import org.sweble.wikitext.engine.utils.DefaultConfigEnWp; import org.sweble.wikitext.parser.parser.LinkTargetException; import de.fau.cs.osr.ptk.common.AstVisitor; -import org.dkpro.jwpl.api.sweble.TemplateNameExtractor; -import org.dkpro.jwpl.util.templates.parser.SectionExtractor.ExtractedSection; public class ParseUtils { diff --git a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/parser/SectionExtractor.java b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/parser/SectionExtractor.java index e6d70fb3..313ba87b 100644 --- a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/parser/SectionExtractor.java +++ b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/parser/SectionExtractor.java @@ -44,15 +44,34 @@ import java.util.ArrayList; import java.util.List; -import de.fau.cs.osr.ptk.common.AstVisitor; -import de.fau.cs.osr.ptk.common.ast.AstNode; -import de.fau.cs.osr.ptk.common.ast.AstText; import org.sweble.wikitext.engine.PageTitle; import org.sweble.wikitext.engine.config.WikiConfig; import org.sweble.wikitext.engine.utils.DefaultConfigEnWp; -import org.sweble.wikitext.parser.nodes.*; +import org.sweble.wikitext.parser.nodes.WtBold; +import org.sweble.wikitext.parser.nodes.WtDefinitionList; +import org.sweble.wikitext.parser.nodes.WtDefinitionListDef; +import org.sweble.wikitext.parser.nodes.WtDefinitionListTerm; +import org.sweble.wikitext.parser.nodes.WtExternalLink; +import org.sweble.wikitext.parser.nodes.WtInternalLink; +import org.sweble.wikitext.parser.nodes.WtItalics; +import org.sweble.wikitext.parser.nodes.WtLinkTarget; +import org.sweble.wikitext.parser.nodes.WtLinkTitle; +import org.sweble.wikitext.parser.nodes.WtNode; +import org.sweble.wikitext.parser.nodes.WtNodeList; +import org.sweble.wikitext.parser.nodes.WtPage; +import org.sweble.wikitext.parser.nodes.WtParagraph; +import org.sweble.wikitext.parser.nodes.WtSection; +import org.sweble.wikitext.parser.nodes.WtTemplate; +import org.sweble.wikitext.parser.nodes.WtWhitespace; +import org.sweble.wikitext.parser.nodes.WtXmlEmptyTag; +import org.sweble.wikitext.parser.nodes.WtXmlEndTag; +import org.sweble.wikitext.parser.nodes.WtXmlStartTag; import org.sweble.wikitext.parser.parser.LinkTargetException; +import de.fau.cs.osr.ptk.common.AstVisitor; +import de.fau.cs.osr.ptk.common.ast.AstNode; +import de.fau.cs.osr.ptk.common.ast.AstText; + /** * A visitor that extracts sections from an article AST. 
*/ diff --git a/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/BZip2Decompressor.java b/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/BZip2Decompressor.java index 51ca2a0e..189c75b6 100644 --- a/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/BZip2Decompressor.java +++ b/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/decompression/BZip2Decompressor.java @@ -17,13 +17,13 @@ */ package org.dkpro.jwpl.wikimachine.decompression; -import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; - import java.io.BufferedInputStream; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; +import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; + /** * BZip2 Decompressor (based on Singleton Design Pattern). Uses getInputStream * to set up the archive path and returns the InputStream to read from diff --git a/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/dump/xml/AbstractXmlDumpReader.java b/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/dump/xml/AbstractXmlDumpReader.java index f18df705..f4ea808a 100644 --- a/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/dump/xml/AbstractXmlDumpReader.java +++ b/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/dump/xml/AbstractXmlDumpReader.java @@ -28,19 +28,18 @@ import java.io.InputStream; import java.lang.invoke.MethodHandles; import java.time.ZoneId; -import java.util.*; +import java.util.Calendar; +import java.util.GregorianCalendar; +import java.util.HashMap; +import java.util.Map; +import java.util.Objects; +import java.util.TimeZone; import javax.xml.XMLConstants; import javax.xml.parsers.ParserConfigurationException; import javax.xml.parsers.SAXParser; import javax.xml.parsers.SAXParserFactory; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.Attributes; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - import org.dkpro.jwpl.mwdumper.importer.Contributor; import org.dkpro.jwpl.mwdumper.importer.DumpWriter; import org.dkpro.jwpl.mwdumper.importer.NamespaceSet; @@ -48,6 +47,11 @@ import org.dkpro.jwpl.mwdumper.importer.Revision; import org.dkpro.jwpl.mwdumper.importer.Siteinfo; import org.dkpro.jwpl.mwdumper.importer.Title; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; /** * Parser of WikiMedia XML dumps. 
Modification of XmlDumpReader with some enhanced error diff --git a/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/factory/SpringFactory.java b/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/factory/SpringFactory.java index 5cde7dbc..bd8e8711 100644 --- a/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/factory/SpringFactory.java +++ b/dkpro-jwpl-wikimachine/src/main/java/org/dkpro/jwpl/wikimachine/factory/SpringFactory.java @@ -19,11 +19,6 @@ import java.io.File; -import org.springframework.beans.factory.BeanFactory; -import org.springframework.context.support.AbstractXmlApplicationContext; -import org.springframework.context.support.ClassPathXmlApplicationContext; -import org.springframework.context.support.FileSystemXmlApplicationContext; - import org.dkpro.jwpl.wikimachine.debug.ILogger; import org.dkpro.jwpl.wikimachine.decompression.IDecompressor; import org.dkpro.jwpl.wikimachine.domain.DumpVersionProcessor; @@ -33,6 +28,10 @@ import org.dkpro.jwpl.wikimachine.dump.xml.PageParser; import org.dkpro.jwpl.wikimachine.dump.xml.RevisionParser; import org.dkpro.jwpl.wikimachine.dump.xml.TextParser; +import org.springframework.beans.factory.BeanFactory; +import org.springframework.context.support.AbstractXmlApplicationContext; +import org.springframework.context.support.ClassPathXmlApplicationContext; +import org.springframework.context.support.FileSystemXmlApplicationContext; public class SpringFactory implements IEnvironmentFactory { From b94153e45ce3e93201774a18b0e2c01cb921210a Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 31 Oct 2023 14:25:28 +0100 Subject: [PATCH 05/14] #164 - Introduce checkstyle - Auto-format dkpro-jwpl-api --- .../org/dkpro/jwpl/api/PerformanceIT.java | 68 +- .../org/dkpro/jwpl/api/PerformanceTest.java | 62 +- .../java/org/dkpro/jwpl/api/Category.java | 750 ++-- .../jwpl/api/CategoryDescendantsIterable.java | 50 +- .../jwpl/api/CategoryDescendantsIterator.java | 267 +- .../org/dkpro/jwpl/api/CategoryGraph.java | 3239 +++++++++-------- .../dkpro/jwpl/api/CategoryGraphManager.java | 161 +- .../org/dkpro/jwpl/api/CategoryIterable.java | 57 +- .../org/dkpro/jwpl/api/CategoryIterator.java | 251 +- .../jwpl/api/CategoryTitleComparator.java | 24 +- .../java/org/dkpro/jwpl/api/CycleHandler.java | 186 +- .../dkpro/jwpl/api/DatabaseConfiguration.java | 362 +- .../java/org/dkpro/jwpl/api/MetaData.java | 246 +- .../main/java/org/dkpro/jwpl/api/Page.java | 1209 +++--- .../java/org/dkpro/jwpl/api/PageIterable.java | 70 +- .../java/org/dkpro/jwpl/api/PageIterator.java | 399 +- .../java/org/dkpro/jwpl/api/PageQuery.java | 605 +-- .../org/dkpro/jwpl/api/PageQueryIterable.java | 262 +- .../org/dkpro/jwpl/api/PageQueryIterator.java | 64 +- .../dkpro/jwpl/api/PageTitleComparator.java | 22 +- .../main/java/org/dkpro/jwpl/api/Title.java | 285 +- .../org/dkpro/jwpl/api/TitleIterable.java | 57 +- .../org/dkpro/jwpl/api/TitleIterator.java | 231 +- .../org/dkpro/jwpl/api/WikiConstants.java | 377 +- .../java/org/dkpro/jwpl/api/Wikipedia.java | 1641 +++++---- .../org/dkpro/jwpl/api/WikipediaInfo.java | 724 ++-- .../jwpl/api/exception/WikiApiException.java | 45 +- .../jwpl/api/exception/WikiException.java | 34 +- .../WikiInitializationException.java | 46 +- .../exception/WikiPageNotFoundException.java | 46 +- .../exception/WikiTitleParsingException.java | 35 +- .../dkpro/jwpl/api/hibernate/Category.java | 139 +- .../dkpro/jwpl/api/hibernate/CategoryDAO.java | 80 +- .../dkpro/jwpl/api/hibernate/GenericDAO.java | 193 +- 
.../dkpro/jwpl/api/hibernate/MetaData.java | 202 +- .../dkpro/jwpl/api/hibernate/MetaDataDAO.java | 81 +- .../org/dkpro/jwpl/api/hibernate/Page.java | 219 +- .../org/dkpro/jwpl/api/hibernate/PageDAO.java | 80 +- .../dkpro/jwpl/api/hibernate/PageMapLine.java | 94 +- .../jwpl/api/hibernate/WikiHibernateUtil.java | 107 +- .../jwpl/api/sweble/PlainTextConverter.java | 985 ++--- .../api/sweble/TemplateNameExtractor.java | 123 +- .../jwpl/api/util/GraphSerialization.java | 171 +- .../api/util/SerializableDirectedGraph.java | 50 +- .../org/dkpro/jwpl/util/ApiUtilities.java | 80 +- .../org/dkpro/jwpl/util/CommonUtilities.java | 80 +- .../java/org/dkpro/jwpl/util/DbUtilities.java | 51 +- .../org/dkpro/jwpl/util/GraphUtilities.java | 102 +- .../dkpro/jwpl/util/HibernateUtilities.java | 87 +- .../src/main/java/org/dkpro/jwpl/util/OS.java | 60 +- .../java/org/dkpro/jwpl/util/StringUtils.java | 150 +- .../dkpro/jwpl/util/UnmodifiableArraySet.java | 197 +- .../distance/LevenshteinStringDistance.java | 116 +- .../jwpl/util/distance/StringDistance.java | 5 +- .../java/org/dkpro/jwpl/api/BaseJWPLTest.java | 33 +- .../api/CategoryDescendantsIteratorTest.java | 58 +- .../org/dkpro/jwpl/api/CategoryGraphTest.java | 80 +- .../dkpro/jwpl/api/CategoryIteratorTest.java | 91 +- .../java/org/dkpro/jwpl/api/CategoryTest.java | 336 +- .../java/org/dkpro/jwpl/api/MetaDataTest.java | 65 +- .../org/dkpro/jwpl/api/PageIteratorTest.java | 116 +- .../java/org/dkpro/jwpl/api/PageTest.java | 883 ++--- .../org/dkpro/jwpl/api/TitleIteratorTest.java | 59 +- .../java/org/dkpro/jwpl/api/TitleTest.java | 183 +- .../org/dkpro/jwpl/api/WikiConfigTest.java | 14 +- .../org/dkpro/jwpl/api/WikipediaTest.java | 875 +++-- .../jwpl/api/util/GraphSerializationTest.java | 168 +- 67 files changed, 9716 insertions(+), 8572 deletions(-) diff --git a/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceIT.java b/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceIT.java index 1a621a36..0c8966bf 100644 --- a/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceIT.java +++ b/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceIT.java @@ -30,9 +30,12 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class PerformanceIT implements WikiConstants { +public class PerformanceIT + implements WikiConstants +{ - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); private static Wikipedia wiki; private static int retrievedNumberOfPages; @@ -41,9 +44,11 @@ public class PerformanceIT implements WikiConstants { private static PerformanceTest pt; @BeforeAll - public static void setupWikipedia() throws WikiApiException { + public static void setupWikipedia() throws WikiApiException + { Properties configuration = loadConfiguration(); - retrievedNumberOfPages = Integer.parseInt(configuration.getProperty("performance.pages.retrieved")); + retrievedNumberOfPages = Integer + .parseInt(configuration.getProperty("performance.pages.retrieved")); DatabaseConfiguration dbConfig = obtainITDBConfiguration(configuration); wiki = new Wikipedia(dbConfig); int maxiCycles = Integer.parseInt(configuration.getProperty("performance.cycles.maxi")); @@ -51,28 +56,31 @@ public static void setupWikipedia() throws WikiApiException { pt = new PerformanceTest(wiki, maxiCycles, pageCycles); } - private static DatabaseConfiguration obtainITDBConfiguration(Properties configuration) { + private static 
DatabaseConfiguration obtainITDBConfiguration(Properties configuration) + { String name = configuration.getProperty("database.name"); String host = configuration.getProperty("database.host"); String user = configuration.getProperty("database.user"); String password = configuration.getProperty("database.password"); // String host, String database, String user, String password, Language language - return new DatabaseConfiguration( - "org.mariadb.jdbc.Driver", - "jdbc:mariadb://" + host + "/" + name + "?serverTimezone=Europe/Berlin&autoReconnect=true&useSSL=false", + return new DatabaseConfiguration("org.mariadb.jdbc.Driver", + "jdbc:mariadb://" + host + "/" + name + + "?serverTimezone=Europe/Berlin&autoReconnect=true&useSSL=false", host, name, user, password, Language.english); } - private static Properties loadConfiguration() { + private static Properties loadConfiguration() + { Properties dbConfigProperties = new Properties(); InputStream resourceStream = checkResourceExists("jwpl-env-configuration.properties"); - if(resourceStream == null) { + if (resourceStream == null) { throw new RuntimeException("Can't find JWPL IT DB configuration in the classpath!"); } else { try (BufferedInputStream stream = new BufferedInputStream(resourceStream)) { dbConfigProperties.load(stream); - } catch(IOException e) { + } + catch (IOException e) { logger.error(e.getLocalizedMessage(), e); throw new RuntimeException("Can't load JWPL IT DB configuration!"); } @@ -80,72 +88,84 @@ private static Properties loadConfiguration() { } } - private static InputStream checkResourceExists(String resourceName) { + private static InputStream checkResourceExists(String resourceName) + { return Thread.currentThread().getContextClassLoader().getResourceAsStream(resourceName); } @BeforeEach - public void setup() throws WikiApiException { + public void setup() throws WikiApiException + { } @Test - public void testPerformanceLoadPagesIntern() throws WikiApiException { + public void testPerformanceLoadPagesIntern() throws WikiApiException + { logger.debug("intern page loading"); pt.loadPagesTest("intern"); } @Test - public void testPerformanceLoadPagesExtern() throws WikiApiException { + public void testPerformanceLoadPagesExtern() throws WikiApiException + { logger.debug("extern page loading"); pt.loadPagesTest("extern"); } @Test - public void testPerformanceLoadPagesAndAccessFieldsIntern() throws WikiApiException { + public void testPerformanceLoadPagesAndAccessFieldsIntern() throws WikiApiException + { logger.debug("intern page loading and field accessing"); pt.loadPagesAndAccessFieldsTest("intern"); } @Test - public void testPerformanceLoadPagesAndAccessFieldsExtern() throws WikiApiException { + public void testPerformanceLoadPagesAndAccessFieldsExtern() throws WikiApiException + { logger.debug("extern page loading and field accessing"); - pt.loadPagesAndAccessFieldsTest("extern"); + pt.loadPagesAndAccessFieldsTest("extern"); } @Test - public void testPerformancePageIteratorBuffer1() throws WikiApiException { + public void testPerformancePageIteratorBuffer1() throws WikiApiException + { logger.debug("Test: retrieve 4000 pages - buffer = '{}' ...", retrievedNumberOfPages, 1); pt.loadPageAndIterate(retrievedNumberOfPages, 1, wiki); } @Test - public void testPerformancePageIteratorBuffer10() throws WikiApiException { + public void testPerformancePageIteratorBuffer10() throws WikiApiException + { logger.debug("Test: retrieve 4000 pages - buffer = '{}' ...", retrievedNumberOfPages, 10); 
pt.loadPageAndIterate(retrievedNumberOfPages, 10, wiki); } @Test - public void testPerformancePageIteratorBuffer50() throws WikiApiException { + public void testPerformancePageIteratorBuffer50() throws WikiApiException + { logger.debug("Test: retrieve 4000 pages - buffer = '{}' ...", retrievedNumberOfPages, 50); pt.loadPageAndIterate(retrievedNumberOfPages, 50, wiki); } @Test - public void testPerformancePageIteratorBuffer100() throws WikiApiException { + public void testPerformancePageIteratorBuffer100() throws WikiApiException + { logger.debug("Test: retrieve 4000 pages - buffer = '{}' ...", retrievedNumberOfPages, 100); pt.loadPageAndIterate(retrievedNumberOfPages, 100, wiki); } @Test - public void testPerformancePageIteratorBuffer200() throws WikiApiException { + public void testPerformancePageIteratorBuffer200() throws WikiApiException + { logger.debug("Test: retrieve 4000 pages - buffer = '{}' ...", retrievedNumberOfPages, 200); pt.loadPageAndIterate(retrievedNumberOfPages, 200, wiki); } @Test - public void testPerformancePageIteratorBuffer500() throws WikiApiException { + public void testPerformancePageIteratorBuffer500() throws WikiApiException + { logger.debug("Test: retrieve 4000 pages - buffer = '{}' ...", retrievedNumberOfPages, 500); pt.loadPageAndIterate(retrievedNumberOfPages, 500, wiki); } diff --git a/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceTest.java b/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceTest.java index ce0d3adc..57fa366d 100644 --- a/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceTest.java +++ b/dkpro-jwpl-api/src/it/java/org/dkpro/jwpl/api/PerformanceTest.java @@ -31,11 +31,14 @@ import org.slf4j.LoggerFactory; /** - * Encapsulates the integration test code that stresses a Wikipedia backend to check the performance of it. + * Encapsulates the integration test code that stresses a Wikipedia backend to check the performance + * of it. 
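As a minimal sketch (not taken from this patch), this is roughly how the Wikipedia backend exercised by these tests is opened from a DatabaseConfiguration, mirroring obtainITDBConfiguration above. Host, schema and credentials are placeholder values, the class name WikipediaSetupSketch is invented for illustration, and the import locations are assumed from the file listing.

    import org.dkpro.jwpl.api.DatabaseConfiguration;
    import org.dkpro.jwpl.api.Wikipedia;
    import org.dkpro.jwpl.api.WikiConstants.Language;

    public class WikipediaSetupSketch {
        public static void main(String[] args) throws Exception {
            // Placeholder connection values; the integration test reads them from
            // jwpl-env-configuration.properties on the classpath instead.
            String host = "localhost";
            String database = "wikiapi_test";
            DatabaseConfiguration dbConfig = new DatabaseConfiguration(
                    "org.mariadb.jdbc.Driver",
                    "jdbc:mariadb://" + host + "/" + database
                            + "?serverTimezone=Europe/Berlin&autoReconnect=true&useSSL=false",
                    host, database, "user", "password", Language.english);
            // A Wikipedia instance wraps the Hibernate session handling used by the API.
            Wikipedia wiki = new Wikipedia(dbConfig);
            System.out.println(wiki.getPage("Wikipedia").getTitle());
        }
    }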
*/ -class PerformanceTest { +class PerformanceTest +{ - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); private final Wikipedia wiki; @@ -47,7 +50,8 @@ class PerformanceTest { private final int maxiCycles; private final int pageCycles; - PerformanceTest(Wikipedia pWiki, int maxiCycles, int pageCycles) throws WikiApiException { + PerformanceTest(Wikipedia pWiki, int maxiCycles, int pageCycles) throws WikiApiException + { this.wiki = pWiki; this.maxiCycles = maxiCycles; this.pageCycles = pageCycles; @@ -55,13 +59,14 @@ class PerformanceTest { initializeLists(pageIDs); } - private void initializeLists(Set allPageIDs) throws WikiApiException { + private void initializeLists(Set allPageIDs) throws WikiApiException + { randomIdList = new ArrayList<>(); randomTitleList = new ArrayList<>(); - for (int j=0; j randomPageIds = GraphUtilities.getRandomPageSubset(allPageIDs, pageCycles); - List randomPageIdList = new ArrayList<>( randomPageIds ); + List randomPageIdList = new ArrayList<>(randomPageIds); randomIdList.add(randomPageIdList); List randomPageTitles = new ArrayList<>(); @@ -74,13 +79,14 @@ private void initializeLists(Set allPageIDs) throws WikiApiException { } - void loadPagesTest(String mode) throws WikiApiException { + void loadPagesTest(String mode) throws WikiApiException + { double averageThroughput = 0; - for (int j=0; j page = GraphUtilities.getRandomPageSubset(pageIDs, 1); Iterator it = page.iterator(); @@ -134,20 +141,24 @@ void loadPagesAndAccessFieldsTest(String mode) throws WikiApiException { logger.debug("-----------------"); logger.debug("average throughput: {} pages/ms", averageThroughput); - logger.debug("average throughput: {} pages/s", averageThroughput*1000); logger.debug("-----------------"); + logger.debug("average throughput: {} pages/s", averageThroughput * 1000); + logger.debug("-----------------"); } - private void loadPage(long id) throws WikiApiException { + private void loadPage(long id) throws WikiApiException + { Page page = new Page(this.wiki, id); assertNotNull(page); } - private void loadPage(String title) throws WikiApiException { + private void loadPage(String title) throws WikiApiException + { Page page = wiki.getPage(title); assertNotNull(page); } - private void loadPageAndAccessFields_intern(long id) throws WikiApiException { + private void loadPageAndAccessFields_intern(long id) throws WikiApiException + { Page page = new Page(this.wiki, id); Set inLinks = page.getInlinkIDs(); assertNotNull(inLinks); @@ -157,7 +168,8 @@ private void loadPageAndAccessFields_intern(long id) throws WikiApiException { assertNotNull(text); } - private void loadPageAndAccessFields_extern(long id) throws WikiApiException { + private void loadPageAndAccessFields_extern(long id) throws WikiApiException + { Page page = new Page(this.wiki, id); Set inLinks = page.getInlinks(); assertNotNull(inLinks); @@ -168,14 +180,14 @@ private void loadPageAndAccessFields_extern(long id) throws WikiApiException { } /** - * This is a test class for the version of PageIterator, that buffers a - * certain number of pages in order to gain efficiency. - * We get the same number of pages from a Wikipedia using - * different buffer sizes and return the performance. + * This is a test class for the version of PageIterator, that buffers a certain number of pages + * in order to gain efficiency. 
We get the same number of pages from a Wikipedia using different + * buffer sizes and return the performance. *

* For an unbuffered iterator set bufferSize to 1. */ - void loadPageAndIterate(int numberOfPages, int bufferSize, Wikipedia wiki) { + void loadPageAndIterate(int numberOfPages, int bufferSize, Wikipedia wiki) + { long from = System.currentTimeMillis(); Iterator pages = wiki.getPages(bufferSize).iterator(); int counter = 0; diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Category.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Category.java index e4dea46b..2c46398d 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Category.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Category.java @@ -28,377 +28,413 @@ import org.hibernate.Session; import org.hibernate.type.StandardBasicTypes; -public class Category implements WikiConstants { - - private final CategoryDAO catDAO; - private org.dkpro.jwpl.api.hibernate.Category hibernateCategory; - private final Wikipedia wiki; - - - /** - * Creates a category object. - * - * @param wiki The wikipedia object. - * @param id The hibernate id of the category. - * @throws WikiPageNotFoundException If the category does not exist. - */ - protected Category(Wikipedia wiki, long id) throws WikiPageNotFoundException { - this.wiki = wiki; - catDAO = new CategoryDAO(wiki); - createCategory(id); - } - - /** - * Creates a category object. - * - * @param wiki The wikipedia object. - * @param pageID The pageID of the category. - * @throws WikiPageNotFoundException If the category does not exist. - */ - protected Category(Wikipedia wiki, int pageID) throws WikiPageNotFoundException { - this.wiki = wiki; - catDAO = new CategoryDAO(wiki); - createCategory(pageID); - } - - /** - * Creates a category object. - * - * @param wiki The wikipedia object. - * @param pName The name of the category. - * @throws WikiPageNotFoundException If the category does not exist. - */ - public Category(Wikipedia wiki, String pName) throws WikiApiException { - if (pName == null || pName.length() == 0) { - throw new WikiPageNotFoundException(); +public class Category + implements WikiConstants +{ + + private final CategoryDAO catDAO; + private org.dkpro.jwpl.api.hibernate.Category hibernateCategory; + private final Wikipedia wiki; + + /** + * Creates a category object. + * + * @param wiki + * The wikipedia object. + * @param id + * The hibernate id of the category. + * @throws WikiPageNotFoundException + * If the category does not exist. + */ + protected Category(Wikipedia wiki, long id) throws WikiPageNotFoundException + { + this.wiki = wiki; + catDAO = new CategoryDAO(wiki); + createCategory(id); } - this.wiki = wiki; - catDAO = new CategoryDAO(wiki); - Title catTitle = new Title(pName); - createCategory(catTitle); - } - - /** - * @see Category#Category(Wikipedia, long) - */ - private void createCategory(long id) throws WikiPageNotFoundException { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - hibernateCategory = catDAO.findById(id); - session.getTransaction().commit(); - - if (hibernateCategory == null) { - throw new WikiPageNotFoundException("No category with id " + id + " was found."); + + /** + * Creates a category object. + * + * @param wiki + * The wikipedia object. + * @param pageID + * The pageID of the category. + * @throws WikiPageNotFoundException + * If the category does not exist. 
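As an illustrative sketch (not part of the patch) of the constructors being reformatted here: a Category can be resolved by name or by pageId, and WikiPageNotFoundException signals a missing category. The class name and the category names are placeholders.

    import org.dkpro.jwpl.api.Category;
    import org.dkpro.jwpl.api.Wikipedia;
    import org.dkpro.jwpl.api.exception.WikiApiException;
    import org.dkpro.jwpl.api.exception.WikiPageNotFoundException;

    class CategoryLookupSketch {
        static void lookup(Wikipedia wiki) throws WikiApiException {
            // Resolve by name; the constructor maps the title to a pageId internally.
            Category byName = new Category(wiki, "People"); // placeholder name
            // Resolve by pageId through the Wikipedia facade.
            Category byId = wiki.getCategory(byName.getPageId());
            try {
                new Category(wiki, "Surely missing category"); // placeholder name
            }
            catch (WikiPageNotFoundException e) {
                // Thrown when no category with the given name (or id) is stored.
            }
            System.out.println(byId.getNumberOfParents() + " parents, "
                    + byId.getNumberOfChildren() + " children");
        }
    }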
+ */ + protected Category(Wikipedia wiki, int pageID) throws WikiPageNotFoundException + { + this.wiki = wiki; + catDAO = new CategoryDAO(wiki); + createCategory(pageID); } - } - - /** - * @see Category#Category(Wikipedia, int) - */ - private void createCategory(int pageID) throws WikiPageNotFoundException { - createCategory(wiki.__getCategoryHibernateId(pageID)); - } - - /** - * @see Category#Category(Wikipedia, String) - */ - private void createCategory(Title title) throws WikiPageNotFoundException { - String name = title.getWikiStyleTitle(); - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - - Integer returnValue; - - String query = "select cat.pageId from Category as cat where cat.name = :name"; - if (wiki.getDatabaseConfiguration().supportsCollation()) { - query += Wikipedia.SQL_COLLATION; + + /** + * Creates a category object. + * + * @param wiki + * The wikipedia object. + * @param pName + * The name of the category. + * @throws WikiPageNotFoundException + * If the category does not exist. + */ + public Category(Wikipedia wiki, String pName) throws WikiApiException + { + if (pName == null || pName.length() == 0) { + throw new WikiPageNotFoundException(); + } + this.wiki = wiki; + catDAO = new CategoryDAO(wiki); + Title catTitle = new Title(pName); + createCategory(catTitle); } - returnValue = session.createNativeQuery(query, Integer.class) - .setParameter("name", name, StandardBasicTypes.STRING) - .uniqueResult(); - session.getTransaction().commit(); - - // if there is no category with this name, the hibernateCategory is null - if (returnValue == null) { - hibernateCategory = null; - throw new WikiPageNotFoundException("No category with name " + name + " was found."); - } else { - int pageID = returnValue; - createCategory(pageID); + + /** + * @see Category#Category(Wikipedia, long) + */ + private void createCategory(long id) throws WikiPageNotFoundException + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + hibernateCategory = catDAO.findById(id); + session.getTransaction().commit(); + + if (hibernateCategory == null) { + throw new WikiPageNotFoundException("No category with id " + id + " was found."); + } } - } - - /** - * This returns the internal id. Do not confuse this with the pageId. - * - * @return Returns the internal id. - */ - /* - * Note well: - * Access is limited to package-private here intentionally, as the database ID is considered framework-internal use. - */ - long __getId() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - long id = hibernateCategory.getId(); - session.getTransaction().commit(); - return id; - } - - /** - * @return A unique page id. - */ - public int getPageId() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - int pageID = hibernateCategory.getPageId(); - session.getTransaction().commit(); - return pageID; - } - - /** - * @return A set containing parents (super categories) of this category. 
- */ - public Set getParents() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - Set tmpSet = new HashSet<>(hibernateCategory.getInLinks()); - session.getTransaction().commit(); - - Set categories = new HashSet<>(); - for (int pageID : tmpSet) { - categories.add(this.wiki.getCategory(pageID)); + + /** + * @see Category#Category(Wikipedia, int) + */ + private void createCategory(int pageID) throws WikiPageNotFoundException + { + createCategory(wiki.__getCategoryHibernateId(pageID)); } - return categories; - } - - /** - * This is a more efficient shortcut for writing "getParents().size()", as that would require to load all the parents first. - * - * @return The number of parents of this category. - */ - public int getNumberOfParents() { - int nrOfInlinks = 0; - - long id = this.__getId(); - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - String sql = "select count(inLinks) from category_inlinks where id = :id"; - Long returnValue = session.createNativeQuery(sql, Long.class) - .setParameter("id", id, StandardBasicTypes.LONG) - .uniqueResult(); - session.getTransaction().commit(); - - if (returnValue != null) { - nrOfInlinks = returnValue.intValue(); + + /** + * @see Category#Category(Wikipedia, String) + */ + private void createCategory(Title title) throws WikiPageNotFoundException + { + String name = title.getWikiStyleTitle(); + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + + Integer returnValue; + + String query = "select cat.pageId from Category as cat where cat.name = :name"; + if (wiki.getDatabaseConfiguration().supportsCollation()) { + query += Wikipedia.SQL_COLLATION; + } + returnValue = session.createNativeQuery(query, Integer.class) + .setParameter("name", name, StandardBasicTypes.STRING).uniqueResult(); + session.getTransaction().commit(); + + // if there is no category with this name, the hibernateCategory is null + if (returnValue == null) { + hibernateCategory = null; + throw new WikiPageNotFoundException("No category with name " + name + " was found."); + } + else { + int pageID = returnValue; + createCategory(pageID); + } } - return nrOfInlinks; - } - - /** - * @return A set containing the IDs of the parents of this category. - */ - public Set getParentIDs() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - Set tmpSet = new HashSet<>(hibernateCategory.getInLinks()); - session.getTransaction().commit(); - return tmpSet; - } - - /** - * @return A set containing the children (subcategories) of this category. - */ - public Set getChildren() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - Set tmpSet = new HashSet<>(hibernateCategory.getOutLinks()); - session.getTransaction().commit(); - - Set categories = new HashSet<>(); - for (int pageID : tmpSet) { - categories.add(this.wiki.getCategory(pageID)); + + /** + * This returns the internal id. Do not confuse this with the pageId. + * + * @return Returns the internal id. + */ + /* + * Note well: Access is limited to package-private here intentionally, as the database ID is + * considered framework-internal use. 
+ */ + long __getId() + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + long id = hibernateCategory.getId(); + session.getTransaction().commit(); + return id; } - return categories; - } - - /** - * This is a more efficient shortcut for writing "getChildren().size()", as that would require to load all the children first. - * - * @return The number of children of this category. - */ - public int getNumberOfChildren() { - int nrOfOutlinks = 0; - - long id = this.__getId(); - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - String sql = "select count(outLinks) from category_outlinks where id = :id"; - Long returnValue = session.createNativeQuery(sql, Long.class) - .setParameter("id", id, StandardBasicTypes.LONG) - .uniqueResult(); - session.getTransaction().commit(); - - if (returnValue != null) { - nrOfOutlinks = returnValue.intValue(); + + /** + * @return A unique page id. + */ + public int getPageId() + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + int pageID = hibernateCategory.getPageId(); + session.getTransaction().commit(); + return pageID; } - return nrOfOutlinks; - } - - /** - * @return A set containing the IDs of the children of this category. - */ - public Set getChildrenIDs() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - Set tmpSet = new HashSet<>(hibernateCategory.getOutLinks()); - session.getTransaction().commit(); - return tmpSet; - } - - /** - * @return The title of the category. - * @throws WikiTitleParsingException Thrown if errors occurred. - */ - public Title getTitle() throws WikiTitleParsingException { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - String name = hibernateCategory.getName(); - session.getTransaction().commit(); - Title title = new Title(name); - return title; - } - - /** - * @return The set of articles that are categorized under this category. - * @throws WikiApiException Thrown if errors occurred. - */ - public Set getArticles() throws WikiApiException { - Set tmpSet = getArticleIds(); - Set pages = new HashSet<>(); - for (int pageID : tmpSet) { - pages.add(this.wiki.getPage(pageID)); + + /** + * @return A set containing parents (super categories) of this category. + */ + public Set getParents() + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + Set tmpSet = new HashSet<>(hibernateCategory.getInLinks()); + session.getTransaction().commit(); + + Set categories = new HashSet<>(); + for (int pageID : tmpSet) { + categories.add(this.wiki.getCategory(pageID)); + } + return categories; } - return pages; - } - - /** - * @return The set of article ids that are categorized under this category. - */ - public Set getArticleIds() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateCategory, LockMode.NONE); - Set tmpSet = new HashSet<>(hibernateCategory.getPages()); - session.getTransaction().commit(); - - return tmpSet; - } - - /** - * This is a more efficient shortcut for writing "getPages().size()", as that would require to load all the pages first. - * - * @return The number of pages. 
- */ - public int getNumberOfPages() { - int nrOfPages = 0; - - long id = this.__getId(); - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - String sql = "select count(pages) from category_pages where id = :id"; - Long returnValue = session.createNativeQuery(sql, Long.class) - .setParameter("id", id, StandardBasicTypes.LONG) - .uniqueResult(); - session.getTransaction().commit(); - - if (returnValue != null) { - nrOfPages = returnValue.intValue(); + + /** + * This is a more efficient shortcut for writing "getParents().size()", as that would require to + * load all the parents first. + * + * @return The number of parents of this category. + */ + public int getNumberOfParents() + { + int nrOfInlinks = 0; + + long id = this.__getId(); + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + String sql = "select count(inLinks) from category_inlinks where id = :id"; + Long returnValue = session.createNativeQuery(sql, Long.class) + .setParameter("id", id, StandardBasicTypes.LONG).uniqueResult(); + session.getTransaction().commit(); + + if (returnValue != null) { + nrOfInlinks = returnValue.intValue(); + } + return nrOfInlinks; } - return nrOfPages; - } - - /** - * This method exposes implementation details and should not be made public. - * It is used for performance tuning. - * - * @return The set of pages that are categorized under this category. - */ - /* - * Note well: - * Access is limited to package-private here intentionally, as it is API-internal use only. - */ - Set __getPages() { - return getArticleIds(); - } - - /** - * Returns *all* recursively collected descendants (=subcategories) of this category. - * - * @return An iterable of all descendants (=subcategories) of this category. - */ - public Iterable getDescendants() { - return new CategoryDescendantsIterable(wiki, this); - } - - /** - * Returns *all* recursively collected descendants (=subcategories) of this category. - * - * @return An iterable of all descendants (=subcategories) of this category. - */ - protected Iterable getDescendants(int bufferSize) { - return new CategoryDescendantsIterable(wiki, bufferSize, this); - } - - /** - * Returns the siblings of this category. - * - * @return Returns the siblings of this category or null, if there are none. - */ - public Set getSiblings() { - Set siblings = new HashSet<>(); - - // add siblings - for (Category parent : this.getParents()) { - siblings.addAll(parent.getChildren()); + + /** + * @return A set containing the IDs of the parents of this category. + */ + public Set getParentIDs() + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + Set tmpSet = new HashSet<>(hibernateCategory.getInLinks()); + session.getTransaction().commit(); + return tmpSet; + } + + /** + * @return A set containing the children (subcategories) of this category. + */ + public Set getChildren() + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + Set tmpSet = new HashSet<>(hibernateCategory.getOutLinks()); + session.getTransaction().commit(); + + Set categories = new HashSet<>(); + for (int pageID : tmpSet) { + categories.add(this.wiki.getCategory(pageID)); + } + return categories; + } + + /** + * This is a more efficient shortcut for writing "getChildren().size()", as that would require + * to load all the children first. 
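To make the shortcut described above concrete (a sketch, not patch content): the count methods issue a single SQL count instead of materialising every linked category, so the two expressions below return the same number, with the first being much cheaper.

    import org.dkpro.jwpl.api.Category;

    class CategoryCountSketch {
        static void compare(Category cat) {
            // Issues a single "select count(outLinks)" against category_outlinks.
            int cheap = cat.getNumberOfChildren();
            // Loads every child id and instantiates a Category object for each of them.
            int expensive = cat.getChildren().size();
            assert cheap == expensive;
        }
    }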
+ * + * @return The number of children of this category. + */ + public int getNumberOfChildren() + { + int nrOfOutlinks = 0; + + long id = this.__getId(); + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + String sql = "select count(outLinks) from category_outlinks where id = :id"; + Long returnValue = session.createNativeQuery(sql, Long.class) + .setParameter("id", id, StandardBasicTypes.LONG).uniqueResult(); + session.getTransaction().commit(); + + if (returnValue != null) { + nrOfOutlinks = returnValue.intValue(); + } + return nrOfOutlinks; + } + + /** + * @return A set containing the IDs of the children of this category. + */ + public Set getChildrenIDs() + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + Set tmpSet = new HashSet<>(hibernateCategory.getOutLinks()); + session.getTransaction().commit(); + return tmpSet; } - // remove this category from list - siblings.remove(this); - - return siblings; - } - - /** - * @return A string with information about a {@link Category}. - * @throws WikiApiException Thrown if errors occurred. - */ - protected String getCategoryInfo() throws WikiApiException { - StringBuilder sb = new StringBuilder(1000); - - sb.append("ID : ").append(__getId()).append(LF); - sb.append("PageID : ").append(getPageId()).append(LF); - sb.append("Name : ").append(getTitle()).append(LF); - sb.append("In-Links").append(LF); - for (Category parent : getParents()) { - sb.append(" ").append(parent.getTitle()).append(LF); + /** + * @return The title of the category. + * @throws WikiTitleParsingException + * Thrown if errors occurred. + */ + public Title getTitle() throws WikiTitleParsingException + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + String name = hibernateCategory.getName(); + session.getTransaction().commit(); + Title title = new Title(name); + return title; } - sb.append("Out-Links").append(LF); - for (Category child : getChildren()) { - sb.append(" ").append(child.getTitle()).append(LF); + + /** + * @return The set of articles that are categorized under this category. + * @throws WikiApiException + * Thrown if errors occurred. + */ + public Set getArticles() throws WikiApiException + { + Set tmpSet = getArticleIds(); + Set pages = new HashSet<>(); + for (int pageID : tmpSet) { + pages.add(this.wiki.getPage(pageID)); + } + return pages; } - sb.append("Pages").append(LF); - for (Page page : getArticles()) { - sb.append(" ").append(page.getTitle()).append(LF); + + /** + * @return The set of article ids that are categorized under this category. + */ + public Set getArticleIds() + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateCategory, LockMode.NONE); + Set tmpSet = new HashSet<>(hibernateCategory.getPages()); + session.getTransaction().commit(); + + return tmpSet; + } + + /** + * This is a more efficient shortcut for writing "getPages().size()", as that would require to + * load all the pages first. + * + * @return The number of pages. 
+ */ + public int getNumberOfPages() + { + int nrOfPages = 0; + + long id = this.__getId(); + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + String sql = "select count(pages) from category_pages where id = :id"; + Long returnValue = session.createNativeQuery(sql, Long.class) + .setParameter("id", id, StandardBasicTypes.LONG).uniqueResult(); + session.getTransaction().commit(); + + if (returnValue != null) { + nrOfPages = returnValue.intValue(); + } + return nrOfPages; + } + + /** + * This method exposes implementation details and should not be made public. It is used for + * performance tuning. + * + * @return The set of pages that are categorized under this category. + */ + /* + * Note well: Access is limited to package-private here intentionally, as it is API-internal use + * only. + */ + Set __getPages() + { + return getArticleIds(); + } + + /** + * Returns *all* recursively collected descendants (=subcategories) of this category. + * + * @return An iterable of all descendants (=subcategories) of this category. + */ + public Iterable getDescendants() + { + return new CategoryDescendantsIterable(wiki, this); + } + + /** + * Returns *all* recursively collected descendants (=subcategories) of this category. + * + * @return An iterable of all descendants (=subcategories) of this category. + */ + protected Iterable getDescendants(int bufferSize) + { + return new CategoryDescendantsIterable(wiki, bufferSize, this); + } + + /** + * Returns the siblings of this category. + * + * @return Returns the siblings of this category or null, if there are none. + */ + public Set getSiblings() + { + Set siblings = new HashSet<>(); + + // add siblings + for (Category parent : this.getParents()) { + siblings.addAll(parent.getChildren()); + } + + // remove this category from list + siblings.remove(this); + + return siblings; + } + + /** + * @return A string with information about a {@link Category}. + * @throws WikiApiException + * Thrown if errors occurred. + */ + protected String getCategoryInfo() throws WikiApiException + { + StringBuilder sb = new StringBuilder(1000); + + sb.append("ID : ").append(__getId()).append(LF); + sb.append("PageID : ").append(getPageId()).append(LF); + sb.append("Name : ").append(getTitle()).append(LF); + sb.append("In-Links").append(LF); + for (Category parent : getParents()) { + sb.append(" ").append(parent.getTitle()).append(LF); + } + sb.append("Out-Links").append(LF); + for (Category child : getChildren()) { + sb.append(" ").append(child.getTitle()).append(LF); + } + sb.append("Pages").append(LF); + for (Page page : getArticles()) { + sb.append(" ").append(page.getTitle()).append(LF); + } + return sb.toString(); } - return sb.toString(); - } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterable.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterable.java index 347f3087..6ef5ce87 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterable.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterable.java @@ -22,32 +22,36 @@ /** * An {@link Iterable} over category objects retrieved by {@link Category#getDescendants()}. 
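A short usage sketch for the descendants iterable documented below (illustrative only; generic type parameters are elided in the diff rendering and the iterable is assumed to yield Category objects):

    import org.dkpro.jwpl.api.Category;
    import org.dkpro.jwpl.api.exception.WikiTitleParsingException;

    class DescendantsSketch {
        static void printDescendants(Category start) throws WikiTitleParsingException {
            // Queue-based expansion backed by CategoryDescendantsIterator; already expanded
            // category ids are tracked, so cycles in the category links cannot loop forever.
            for (Category descendant : start.getDescendants()) {
                System.out.println(descendant.getTitle().getPlainTitle());
            }
        }
    }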
*/ -public class CategoryDescendantsIterable implements Iterable { +public class CategoryDescendantsIterable + implements Iterable +{ - private final Wikipedia wiki; - private final Category startCategory; + private final Wikipedia wiki; + private final Category startCategory; - /* - * The size of the page buffer. - * With bufferSize = 1, a database connection is needed for retrieving a single article. - * Higher bufferSize gives better performance, but needs memory. - * Initialize it with 25. - */ - private int bufferSize = 25; + /* + * The size of the page buffer. With bufferSize = 1, a database connection is needed for + * retrieving a single article. Higher bufferSize gives better performance, but needs memory. + * Initialize it with 25. + */ + private int bufferSize = 25; - public CategoryDescendantsIterable(Wikipedia wiki, Category startCategory) { - this.wiki = wiki; - this.startCategory = startCategory; - } + public CategoryDescendantsIterable(Wikipedia wiki, Category startCategory) + { + this.wiki = wiki; + this.startCategory = startCategory; + } - public CategoryDescendantsIterable(Wikipedia wiki, int bufferSize, Category startCategory) { - this.wiki = wiki; - this.bufferSize = bufferSize; - this.startCategory = startCategory; - } + public CategoryDescendantsIterable(Wikipedia wiki, int bufferSize, Category startCategory) + { + this.wiki = wiki; + this.bufferSize = bufferSize; + this.startCategory = startCategory; + } - @Override - public Iterator iterator() { - return new CategoryDescendantsIterator(wiki, bufferSize, startCategory); - } + @Override + public Iterator iterator() + { + return new CategoryDescendantsIterator(wiki, bufferSize, startCategory); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterator.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterator.java index 165e1f3c..04f51446 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterator.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryDescendantsIterator.java @@ -31,150 +31,167 @@ /** * An {@link Iterator} over category objects retrieved by {@link Category#getDescendants()}. */ -public class CategoryDescendantsIterator implements Iterator { +public class CategoryDescendantsIterator + implements Iterator +{ - private final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - private final Wikipedia wiki; + private final Wikipedia wiki; - private final CategoryBuffer buffer; - - /** - * Contains all category ids that have not been expanded, yet. - */ - private final Set notExpandedCategories; - - /** - * As we do not inspect the whole graph at once now, we need a way to check whether a node was already expanded, to avoid infinite loops. 
- */ - private final Set expandedCategoryIds; - - public CategoryDescendantsIterator(Wikipedia wiki, int bufferSize, Category startCategory) { - this.wiki = wiki; - buffer = new CategoryBuffer(bufferSize); - notExpandedCategories = new HashSet<>(); - // initialize with children of start category - for (Category catItem : startCategory.getChildren()) { - notExpandedCategories.add(catItem.getPageId()); - } - - expandedCategoryIds = new HashSet<>(); - } - - @Override - public boolean hasNext() { - return buffer.hasNext(); - } - - @Override - public Category next() { - return buffer.next(); - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - - /** - * Buffers categories in a list. - */ - class CategoryBuffer { - - private final List buffer; - private final int maxBufferSize; // the number of pages to be buffered after a query to the database. - private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements - private int bufferOffset; // the offset in the buffer - private int dataOffset; // the overall offset in the data - - public CategoryBuffer(int bufferSize) { - this.maxBufferSize = bufferSize; - this.buffer = new ArrayList<>(); - this.bufferFillSize = 0; - this.bufferOffset = 0; - this.dataOffset = 0; - - //TODO test whether this works when zero pages are retrieved - // we can test this here using a unit test that retrieves no descendants! - } + private final CategoryBuffer buffer; /** - * If there are elements in the buffer left, then return true. - * If the end of the filled buffer is reached, then try to load new buffer. - * - * @return True, if there are pages left. False otherwise. + * Contains all category ids that have not been expanded, yet. */ - public boolean hasNext() { - if (bufferOffset < bufferFillSize) { - return true; - } else { - return this.fillBuffer(); - } - } + private final Set notExpandedCategories; /** - * @return The next Category or null if no more categories are available. + * As we do not inspect the whole graph at once now, we need a way to check whether a node was + * already expanded, to avoid infinite loops. 
*/ - public Category next() { - // if there are still elements in the buffer, just retrieve the next one - if (bufferOffset < bufferFillSize) { - return this.getBufferElement(); - } - // if there are no more elements => try to fill a new buffer - else if (this.fillBuffer()) { - return this.getBufferElement(); - } else { - // if it cannot be filled => return null - return null; - } + private final Set expandedCategoryIds; + + public CategoryDescendantsIterator(Wikipedia wiki, int bufferSize, Category startCategory) + { + this.wiki = wiki; + buffer = new CategoryBuffer(bufferSize); + notExpandedCategories = new HashSet<>(); + // initialize with children of start category + for (Category catItem : startCategory.getChildren()) { + notExpandedCategories.add(catItem.getPageId()); + } + + expandedCategoryIds = new HashSet<>(); } - private Category getBufferElement() { - Category cat = buffer.get(bufferOffset); - bufferOffset++; - dataOffset++; - return cat; + @Override + public boolean hasNext() + { + return buffer.hasNext(); } - private boolean fillBuffer() { + @Override + public Category next() + { + return buffer.next(); + } - // clear the old buffer and all variables regarding the state of the buffer - buffer.clear(); - bufferOffset = 0; - bufferFillSize = 0; + @Override + public void remove() + { + throw new UnsupportedOperationException(); + } - // add not expanded categories to queue - List queue = new LinkedList<>(notExpandedCategories); + /** + * Buffers categories in a list. + */ + class CategoryBuffer + { + + private final List buffer; + private final int maxBufferSize; // the number of pages to be buffered after a query to the + // database. + private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements + private int bufferOffset; // the offset in the buffer + private int dataOffset; // the overall offset in the data + + public CategoryBuffer(int bufferSize) + { + this.maxBufferSize = bufferSize; + this.buffer = new ArrayList<>(); + this.bufferFillSize = 0; + this.bufferOffset = 0; + this.dataOffset = 0; + + // TODO test whether this works when zero pages are retrieved + // we can test this here using a unit test that retrieves no descendants! + } - // expand until buffer size is reached - while (!queue.isEmpty() && buffer.size() < maxBufferSize) { - // remove first element from queue - Category currentCat = wiki.getCategory(queue.get(0)); - queue.remove(0); + /** + * If there are elements in the buffer left, then return true. If the end of the filled + * buffer is reached, then try to load new buffer. + * + * @return True, if there are pages left. False otherwise. + */ + public boolean hasNext() + { + if (bufferOffset < bufferFillSize) { + return true; + } + else { + return this.fillBuffer(); + } + } - // if the node was not previously expanded - if (!expandedCategoryIds.contains(currentCat.getPageId())) { - buffer.add(currentCat); - notExpandedCategories.remove(currentCat.getPageId()); - expandedCategoryIds.add(currentCat.getPageId()); + /** + * @return The next Category or null if no more categories are available. 
+ */ + public Category next() + { + // if there are still elements in the buffer, just retrieve the next one + if (bufferOffset < bufferFillSize) { + return this.getBufferElement(); + } + // if there are no more elements => try to fill a new buffer + else if (this.fillBuffer()) { + return this.getBufferElement(); + } + else { + // if it cannot be filled => return null + return null; + } + } - logger.debug("buf: " + buffer.size()); - logger.debug("notExp: " + notExpandedCategories); - logger.debug("exp: " + expandedCategoryIds); + private Category getBufferElement() + { + Category cat = buffer.get(bufferOffset); + bufferOffset++; + dataOffset++; + return cat; + } - for (Category child : currentCat.getChildren()) { - queue.add(child.getPageId()); - notExpandedCategories.add(child.getPageId()); - } + private boolean fillBuffer() + { + + // clear the old buffer and all variables regarding the state of the buffer + buffer.clear(); + bufferOffset = 0; + bufferFillSize = 0; + + // add not expanded categories to queue + List queue = new LinkedList<>(notExpandedCategories); + + // expand until buffer size is reached + while (!queue.isEmpty() && buffer.size() < maxBufferSize) { + // remove first element from queue + Category currentCat = wiki.getCategory(queue.get(0)); + queue.remove(0); + + // if the node was not previously expanded + if (!expandedCategoryIds.contains(currentCat.getPageId())) { + buffer.add(currentCat); + notExpandedCategories.remove(currentCat.getPageId()); + expandedCategoryIds.add(currentCat.getPageId()); + + logger.debug("buf: " + buffer.size()); + logger.debug("notExp: " + notExpandedCategories); + logger.debug("exp: " + expandedCategoryIds); + + for (Category child : currentCat.getChildren()) { + queue.add(child.getPageId()); + notExpandedCategories.add(child.getPageId()); + } + } + } + + if (buffer.size() > 0) { + bufferFillSize = buffer.size(); + return true; + } + else { + return false; + } } - } - - if (buffer.size() > 0) { - bufferFillSize = buffer.size(); - return true; - } else { - return false; - } } - } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraph.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraph.java index 92edf585..f6029032 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraph.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraph.java @@ -53,1639 +53,1870 @@ import org.slf4j.LoggerFactory; /** - * The category graph is constructed from the links connecting Wikipedia categories. - * It provides various accessors and graph algorithms. + * The category graph is constructed from the links connecting Wikipedia categories. It provides + * various accessors and graph algorithms. */ -public class CategoryGraph implements WikiConstants, Serializable { - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - static final long serialVersionUID = 1L; - - // the wikipedia object - private Wikipedia wiki; - - // the category graph - private DefaultDirectedGraph graph; - // the category graph - private AsUndirectedGraph undirectedGraph; - - // a map holding the degree distribution of the graph - private Map degreeDistribution; - - // number of nodes in the graph - private int numberOfNodes; - - // number of edges in the graph - private int numberOfEdges; - - // A map holding the (recursive) number of hyponyms for each node. - // Recursive means that the hyponyms of hyponyms are also taken into account. 
- private Map hyponymCountMap = null; - private final String hyponymCountMapFilename = "hypoCountMap"; - - // a mapping from all nodes to a list of nodes on the path to the root - private Map> rootPathMap = null; - private final String rootPathMapFilename = "rootPathMap"; - - private double averageShortestPathLength = Double.NEGATIVE_INFINITY; - private double diameter = Double.NEGATIVE_INFINITY; - private double averageDegree = Double.NEGATIVE_INFINITY; - private double clusterCoefficient = Double.NEGATIVE_INFINITY; - private double depth = Double.NEGATIVE_INFINITY; - - - /** - * Creates an empty {@link CategoryGraph}. You cannot do much with such a graph. - * Sometimes an empty category graph can be useful if you just need a CategoryGraph object, but do not care about its content. - */ - public CategoryGraph() throws WikiApiException { - logger.warn("Attention. You created an empty category graph. Intentionally?"); - } - - /** - * Creates an {@link CategoryGraph} using a serialized DirectedGraph object. - * - * @param pWiki A {@link Wikipedia} object. - * @param location The location of the serialized graph - * @throws WikiApiException Thrown if errors occurred. - */ - public CategoryGraph(Wikipedia pWiki, File location) throws WikiApiException { - try { - constructCategoryGraph(pWiki, GraphSerialization.loadGraph(location)); - } catch (IOException | ClassNotFoundException e) { - throw new WikiApiException(e); +public class CategoryGraph + implements WikiConstants, Serializable +{ + + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + static final long serialVersionUID = 1L; + + // the wikipedia object + private Wikipedia wiki; + + // the category graph + private DefaultDirectedGraph graph; + // the category graph + private AsUndirectedGraph undirectedGraph; + + // a map holding the degree distribution of the graph + private Map degreeDistribution; + + // number of nodes in the graph + private int numberOfNodes; + + // number of edges in the graph + private int numberOfEdges; + + // A map holding the (recursive) number of hyponyms for each node. + // Recursive means that the hyponyms of hyponyms are also taken into account. + private Map hyponymCountMap = null; + private final String hyponymCountMapFilename = "hypoCountMap"; + + // a mapping from all nodes to a list of nodes on the path to the root + private Map> rootPathMap = null; + private final String rootPathMapFilename = "rootPathMap"; + + private double averageShortestPathLength = Double.NEGATIVE_INFINITY; + private double diameter = Double.NEGATIVE_INFINITY; + private double averageDegree = Double.NEGATIVE_INFINITY; + private double clusterCoefficient = Double.NEGATIVE_INFINITY; + private double depth = Double.NEGATIVE_INFINITY; + + /** + * Creates an empty {@link CategoryGraph}. You cannot do much with such a graph. Sometimes an + * empty category graph can be useful if you just need a CategoryGraph object, but do not care + * about its content. + */ + public CategoryGraph() throws WikiApiException + { + logger.warn("Attention. You created an empty category graph. Intentionally?"); } - } - - /** - * Creates a {@link CategoryGraph} object using all categories of the given Wikipedia. - * - * @param pWiki A {@link Wikipedia} object. - * @throws WikiApiException Thrown if errors occurred. 
- */ - public CategoryGraph(Wikipedia pWiki) throws WikiApiException { - constructCategoryGraph(pWiki, pWiki.__getCategories(), null); - } - - /** - * Creates a CategoryGraph object using all categories, but filters all categories starting with strings contained in the filterList. - * - * @param pWiki The Wikipedia object. - * @param filterList A list of strings. All categories starting with or matching such a string are not added to the category graph. - * @throws WikiApiException Thrown if errors occurred. - */ - public CategoryGraph(Wikipedia pWiki, List filterList) throws WikiApiException { - constructCategoryGraph(pWiki, pWiki.__getCategories(), filterList); - } - - /** - * Creates a CategoryGraph object using the categories given by the iterable - * - * @param pWiki The Wikipedia object. - * @param categories An iterable of the categories to use for construction of the category graph. - * @throws WikiApiException Thrown if errors occurred. - */ - public CategoryGraph(Wikipedia pWiki, Iterable categories) throws WikiApiException { - Set pageIDs = new HashSet<>(); - while (categories.iterator().hasNext()) { - pageIDs.add(categories.iterator().next().getPageId()); - } - constructCategoryGraph(pWiki, pageIDs, null); - } - - /** - * Creates a CategoryGraph object using the categories given by the iterable, but filters all categories starting with strings contained in the filterList - * - * @param pWiki The Wikipedia object. - * @param categories An iterable of the categories to use for construction of the category graph. - * @param filterList A list of strings. All categories starting with or matching such a string are not added to the category graph. - * @throws WikiApiException Thrown if errors occurred. - */ - public CategoryGraph(Wikipedia pWiki, Iterable categories, List filterList) throws WikiApiException { - Set pageIDs = new HashSet<>(); - while (categories.iterator().hasNext()) { - pageIDs.add(categories.iterator().next().getPageId()); - } - constructCategoryGraph(pWiki, pageIDs, filterList); - } - - /** - * Creates a category graph using a subset (that may also be the full set :) of the categories. - * - * @param pWiki The wiki object. - * @param pPageIDs A set of pageIDs of the category pages that should be used to build the category graph. - * @throws WikiApiException Thrown if errors occurred. 
- */ - protected CategoryGraph(Wikipedia pWiki, Set pPageIDs) throws WikiApiException { - constructCategoryGraph(pWiki, pPageIDs, null); - } - - public CategoryGraph(Wikipedia pWiki, DefaultDirectedGraph pGraph) throws WikiApiException { - constructCategoryGraph(pWiki, pGraph); - } - - private void constructCategoryGraph(Wikipedia pWiki, DefaultDirectedGraph pGraph) throws WikiApiException { - this.wiki = pWiki; - this.graph = pGraph; - this.numberOfNodes = this.graph.vertexSet().size(); - this.numberOfEdges = this.graph.edgeSet().size(); - this.undirectedGraph = new AsUndirectedGraph<>(this.graph); - } - - private void constructCategoryGraph(Wikipedia pWiki, Set pPageIDs, List filterList) throws WikiApiException { - // create the graph as a directed Graph - // algorithms that need to be called on a undirected graph or should ignore direction - // can be called on an AsUndirectedGraph view of the directed graph - graph = new DefaultDirectedGraph<>(DefaultEdge.class); - - wiki = pWiki; - - degreeDistribution = new HashMap<>(); - - for (int pageID : pPageIDs) { - if (filterList != null) { - long hibernateID = pWiki.__getCategoryHibernateId(pageID); - if (hibernateID == -1) { - throw new WikiApiException(pageID + " is not a valid pageID"); - } - - Category cat; + + /** + * Creates an {@link CategoryGraph} using a serialized DirectedGraph object. + * + * @param pWiki + * A {@link Wikipedia} object. + * @param location + * The location of the serialized graph + * @throws WikiApiException + * Thrown if errors occurred. + */ + public CategoryGraph(Wikipedia pWiki, File location) throws WikiApiException + { try { - cat = new Category(this.wiki, hibernateID); - } catch (WikiPageNotFoundException e) { - throw new WikiApiException("Category not found"); + constructCategoryGraph(pWiki, GraphSerialization.loadGraph(location)); } - - if (matchesFilter(cat, filterList)) { - continue; + catch (IOException | ClassNotFoundException e) { + throw new WikiApiException(e); } - } + } - graph.addVertex(pageID); + /** + * Creates a {@link CategoryGraph} object using all categories of the given Wikipedia. + * + * @param pWiki + * A {@link Wikipedia} object. + * @throws WikiApiException + * Thrown if errors occurred. + */ + public CategoryGraph(Wikipedia pWiki) throws WikiApiException + { + constructCategoryGraph(pWiki, pWiki.__getCategories(), null); } + /** + * Creates a CategoryGraph object using all categories, but filters all categories starting with + * strings contained in the filterList. + * + * @param pWiki + * The Wikipedia object. + * @param filterList + * A list of strings. All categories starting with or matching such a string are not + * added to the category graph. + * @throws WikiApiException + * Thrown if errors occurred. 
+ */ + public CategoryGraph(Wikipedia pWiki, List filterList) throws WikiApiException + { + constructCategoryGraph(pWiki, pWiki.__getCategories(), filterList); + } - numberOfNodes = graph.vertexSet().size(); - - // add edges - logger.info(OS.getUsedMemory() + " MB memory used."); - int progress = 0; - for (int pageID : graph.vertexSet()) { - progress++; - ApiUtilities.printProgressInfo(progress, pPageIDs.size(), 10, ApiUtilities.ProgressInfoMode.TEXT, "Adding edges"); - - long hibernateID = pWiki.__getCategoryHibernateId(pageID); - if (hibernateID == -1) { - throw new WikiApiException(pageID + " is not a valid pageID"); - } - - // get the category - Category cat; - try { - cat = new Category(this.wiki, hibernateID); - } catch (WikiPageNotFoundException e) { - throw new WikiApiException("Category not found"); - } - - // get parents and children - // if the corresponding nodes are in the graph (it could be a subset) => add them to the graph - Set inLinks = cat.getParentIDs(); - Set outLinks = cat.getChildrenIDs(); - - // add edges - // If an edge already exits, it is silenty ignored by JGraphT. So we do not have to check this. - for (int inLink : inLinks) { - if (graph.vertexSet().contains(inLink)) { - if (inLink == pageID) { - logger.debug("Self-loop for node " + pageID + " (" + cat.getTitle() + ")"); - } else { - graph.addEdge(inLink, pageID); - } - } - } - for (int outLink : outLinks) { - if (graph.vertexSet().contains(outLink)) { - if (outLink == pageID) { - logger.debug("Self-loop for node " + pageID + " (" + cat.getTitle() + ")"); - } else { - graph.addEdge(pageID, outLink); - } - } - } + /** + * Creates a CategoryGraph object using the categories given by the iterable + * + * @param pWiki + * The Wikipedia object. + * @param categories + * An iterable of the categories to use for construction of the category graph. + * @throws WikiApiException + * Thrown if errors occurred. 
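For orientation (again a sketch, not taken from the patch): the constructors reformatted in this hunk can be used as follows; the filter prefixes are placeholder values and exclude categories whose titles start with them.

    import java.util.List;

    import org.dkpro.jwpl.api.CategoryGraph;
    import org.dkpro.jwpl.api.Wikipedia;
    import org.dkpro.jwpl.api.exception.WikiApiException;

    class CategoryGraphSketch {
        static CategoryGraph build(Wikipedia wiki) throws WikiApiException {
            // Placeholder prefixes: matching categories are skipped during construction.
            List<String> filter = List.of("Hidden categories", "Tracking categories");
            // Builds the graph from all categories minus the filtered ones; the constructor
            // also removes cycles so that later graph algorithms can rely on a DAG.
            return new CategoryGraph(wiki, filter);
        }
    }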
+ */ + public CategoryGraph(Wikipedia pWiki, Iterable categories) throws WikiApiException + { + Set pageIDs = new HashSet<>(); + while (categories.iterator().hasNext()) { + pageIDs.add(categories.iterator().next().getPageId()); + } + constructCategoryGraph(pWiki, pageIDs, null); } - numberOfEdges = graph.edgeSet().size(); - - logger.info("Added " + this.getNumberOfNodes() + " nodes."); - logger.info("Added " + this.getNumberOfEdges() + " edges."); - - CycleHandler cycleHandler = new CycleHandler(wiki, this); - logger.info("Graph contains cycles: " + cycleHandler.containsCycle()); - cycleHandler.removeCycles(); - logger.info("Graph contains cycles: " + cycleHandler.containsCycle()); - - this.numberOfEdges = this.graph.edgeSet().size(); - this.undirectedGraph = new AsUndirectedGraph<>(this.graph); - - } - -//// older version without filterList -// private void constructCategoryGraph(Wikipedia pWiki, Set pPageIDs) throws WikiApiException { -// // create the graph as a directed Graph -// // algorithms that need to be called on a undirected graph or should ignore direction -// // can be called on an AsUndirectedGraph view of the directed graph -// graph = new DefaultDirectedGraph(DefaultEdge.class); -// -// wiki = pWiki; -// -// degreeDistribution = new HashMap(); -// -// for (int pageID : pPageIDs) { -// graph.addVertex(pageID); -// } -// -// // add edges -// logger.info(OS.getUsedMemory() + " MB memory used."); -// int progress = 0; -// for (int pageID : pPageIDs) { -// progress++; -// ApiUtilities.printProgressInfo(progress, pPageIDs.size(), 10, ApiUtilities.ProgressInfoMode.TEXT, "Adding edges"); -// -// long hibernateID = pWiki.__getHibernateId(pageID); -// if (hibernateID == -1) { -// throw new WikiApiException(pageID + " is not a valid pageID"); -// } -// -// // get the category -// Category cat; -// try { -// cat = new Category(this.wiki, hibernateID); -// } catch (WikiPageNotFoundException e) { -// throw new WikiApiException("Category not found"); -// } -// -// // get parents and children -// // if the corresponding nodes are in the graph (it could be a subset) => add them to the graph -// Set inLinks = cat.__getInlinkIDs(); -// Set outLinks = cat.__getOutlinkIDs(); -// -// // add edges -// // If an edge already exits, it is silenty ignored by JGraphT. So we do not have to check this. -// for (int inLink : inLinks) { -// if (pPageIDs.contains(inLink)) { -// if (inLink == pageID) { -// logger.warn("Self-loop for node " + pageID + " (" + cat.getTitle() + ")"); -// } -// else { -// graph.addEdge(inLink, pageID); -// } -// } -// } -// for (int outLink : outLinks) { -// if (pPageIDs.contains(outLink)) { -// if (outLink == pageID) { -// logger.warn("Self-loop for node " + pageID + " (" + cat.getTitle() + ")"); -// } -// else { -// graph.addEdge(pageID, outLink); -// } -// } -// } -// } -// -// logger.info("Added " + this.getNumberOfNodes() + " nodes."); -// logger.info("Added " + this.getNumberOfEdges() + " edges."); -// -// CycleHandler cycleHandler = new CycleHandler(wiki, this); -// logger.info("Graph contains cycles: " + cycleHandler.containsCycle()); -// cycleHandler.removeCycles(); -// logger.info("Graph contains cycles: " + cycleHandler.containsCycle()); -// -// this.depth = getDepth(); -// logger.info(this.depth); -// } - - - /** - * Checks whether the category title matches the filter (a filter matches a string, if the string starts with the filter expression). - * - * @param cat A category. - * @param filterList A list of filter strings. 
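    // Illustrative sketch, not taken from the patch: the Iterable-based constructor added above
    // (and its filterList variant below) calls categories.iterator() in both the loop condition
    // and the loop body. For an Iterable backed by a regular collection every such call returns a
    // fresh iterator positioned at the first element, so the loop would never advance. Collecting
    // the page ids with a single enhanced for loop avoids that; the helper name is ours.
    private static Set<Integer> collectPageIds(Iterable<Category> categories)
    {
        Set<Integer> pageIDs = new HashSet<>();
        for (Category category : categories) {
            pageIDs.add(category.getPageId());
        }
        return pageIDs;
    }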
- * @return True, if the category title starts with or is equal to a string in the filter list. False, otherwise. - * @throws WikiTitleParsingException Thrown if errors occurred. - */ - private boolean matchesFilter(Category cat, List filterList) throws WikiTitleParsingException { - String categoryTitle = cat.getTitle().getPlainTitle(); - for (String filter : filterList) { - if (categoryTitle.startsWith(filter)) { - logger.info(categoryTitle + " starts with " + filter + " => removing"); - return true; - } + /** + * Creates a CategoryGraph object using the categories given by the iterable, but filters all + * categories starting with strings contained in the filterList + * + * @param pWiki + * The Wikipedia object. + * @param categories + * An iterable of the categories to use for construction of the category graph. + * @param filterList + * A list of strings. All categories starting with or matching such a string are not + * added to the category graph. + * @throws WikiApiException + * Thrown if errors occurred. + */ + public CategoryGraph(Wikipedia pWiki, Iterable categories, List filterList) + throws WikiApiException + { + Set pageIDs = new HashSet<>(); + while (categories.iterator().hasNext()) { + pageIDs.add(categories.iterator().next().getPageId()); + } + constructCategoryGraph(pWiki, pageIDs, filterList); } - return false; - } - - /** - * Gets the lowest common subsumer (LCS) of two nodes. - * The LCS of two nodes is first node on the path to the root, that has both nodes as sons. - * Nodes that are not in the same connected component as the root node are defined to have no LCS. - * - * @param category1 The first category node. - * @param category2 The second category node. - * @return The lowest common subsumer of the two nodes, or null if there is no LCS. - */ - public Category getLCS(Category category1, Category category2) throws WikiApiException { - return getLCS(category1.getPageId(), category2.getPageId()); - } - - - /** - * Gets the lowest common subsumer (LCS) of two nodes. - * The LCS of two nodes is first node on the path to the root, that has both nodes as sons. - * Nodes that are not in the same connected component as the root node are defined to have no LCS. - * - * @param categoryPageId1 The pageid of the first category node. - * @param categoryPageId2 The pageid of the second category node. - * @return The pageId of the lowest common subsumer of the two nodes, or null if there is no LCS. - */ - public int getLCSId(int categoryPageId1, int categoryPageId2) throws WikiApiException { - -// TODO here might be a problem concerning multiple inheritence in the category graph, if there is more than one path of equal length to the root, the method will only find one, but the other (not found) LCS may have a higher information content -// TODO is the lcs between the same node really defined or should this be handled in the measures (i.e. SR(n1,n1) = 1 per definitionem??) - if (categoryPageId1 == categoryPageId2) { - return categoryPageId1; + + /** + * Creates a category graph using a subset (that may also be the full set :) of the categories. + * + * @param pWiki + * The wiki object. + * @param pPageIDs + * A set of pageIDs of the category pages that should be used to build the category + * graph. + * @throws WikiApiException + * Thrown if errors occurred. 
+ */ + protected CategoryGraph(Wikipedia pWiki, Set pPageIDs) throws WikiApiException + { + constructCategoryGraph(pWiki, pPageIDs, null); } - List nodeList1 = getRootPathMap().get(categoryPageId1); - List nodeList2 = getRootPathMap().get(categoryPageId2); + public CategoryGraph(Wikipedia pWiki, DefaultDirectedGraph pGraph) + throws WikiApiException + { + constructCategoryGraph(pWiki, pGraph); + } - // if one of the paths is null => return -1 - if (nodeList1 == null || nodeList2 == null || nodeList1.size() == 0 || nodeList2.size() == 0) { - logger.debug("One of the node lists is null or empty!"); - return -1; + private void constructCategoryGraph(Wikipedia pWiki, + DefaultDirectedGraph pGraph) + throws WikiApiException + { + this.wiki = pWiki; + this.graph = pGraph; + this.numberOfNodes = this.graph.vertexSet().size(); + this.numberOfEdges = this.graph.edgeSet().size(); + this.undirectedGraph = new AsUndirectedGraph<>(this.graph); } - logger.debug(nodeList1.toString()); - logger.debug(nodeList2.toString()); + private void constructCategoryGraph(Wikipedia pWiki, Set pPageIDs, + List filterList) + throws WikiApiException + { + // create the graph as a directed Graph + // algorithms that need to be called on a undirected graph or should ignore direction + // can be called on an AsUndirectedGraph view of the directed graph + graph = new DefaultDirectedGraph<>(DefaultEdge.class); + + wiki = pWiki; + + degreeDistribution = new HashMap<>(); + + for (int pageID : pPageIDs) { + if (filterList != null) { + long hibernateID = pWiki.__getCategoryHibernateId(pageID); + if (hibernateID == -1) { + throw new WikiApiException(pageID + " is not a valid pageID"); + } + + Category cat; + try { + cat = new Category(this.wiki, hibernateID); + } + catch (WikiPageNotFoundException e) { + throw new WikiApiException("Category not found"); + } + + if (matchesFilter(cat, filterList)) { + continue; + } + } + + graph.addVertex(pageID); + } - // node 1 subsumes node 2 ? - for (int tmpNode2 : nodeList2) { - if (tmpNode2 == categoryPageId1) { - return categoryPageId1; - } - } + numberOfNodes = graph.vertexSet().size(); + + // add edges + logger.info(OS.getUsedMemory() + " MB memory used."); + int progress = 0; + for (int pageID : graph.vertexSet()) { + progress++; + ApiUtilities.printProgressInfo(progress, pPageIDs.size(), 10, + ApiUtilities.ProgressInfoMode.TEXT, "Adding edges"); + + long hibernateID = pWiki.__getCategoryHibernateId(pageID); + if (hibernateID == -1) { + throw new WikiApiException(pageID + " is not a valid pageID"); + } + + // get the category + Category cat; + try { + cat = new Category(this.wiki, hibernateID); + } + catch (WikiPageNotFoundException e) { + throw new WikiApiException("Category not found"); + } + + // get parents and children + // if the corresponding nodes are in the graph (it could be a subset) => add them to the + // graph + Set inLinks = cat.getParentIDs(); + Set outLinks = cat.getChildrenIDs(); + + // add edges + // If an edge already exits, it is silenty ignored by JGraphT. So we do not have to + // check this. 
+ for (int inLink : inLinks) { + if (graph.vertexSet().contains(inLink)) { + if (inLink == pageID) { + logger.debug("Self-loop for node " + pageID + " (" + cat.getTitle() + ")"); + } + else { + graph.addEdge(inLink, pageID); + } + } + } + for (int outLink : outLinks) { + if (graph.vertexSet().contains(outLink)) { + if (outLink == pageID) { + logger.debug("Self-loop for node " + pageID + " (" + cat.getTitle() + ")"); + } + else { + graph.addEdge(pageID, outLink); + } + } + } + } + + numberOfEdges = graph.edgeSet().size(); + + logger.info("Added " + this.getNumberOfNodes() + " nodes."); + logger.info("Added " + this.getNumberOfEdges() + " edges."); + + CycleHandler cycleHandler = new CycleHandler(wiki, this); + logger.info("Graph contains cycles: " + cycleHandler.containsCycle()); + cycleHandler.removeCycles(); + logger.info("Graph contains cycles: " + cycleHandler.containsCycle()); + + this.numberOfEdges = this.graph.edgeSet().size(); + this.undirectedGraph = new AsUndirectedGraph<>(this.graph); - // node 2 subsumes node 1 ? - for (int tmpNode1 : nodeList1) { - if (tmpNode1 == categoryPageId2) { - return categoryPageId2; - } } - // they have a lcs ? - for (int tmpNode1 : nodeList1) { - for (int tmpNode2 : nodeList2) { - if (tmpNode1 == tmpNode2) { - return tmpNode1; + + //// older version without filterList + // private void constructCategoryGraph(Wikipedia pWiki, Set pPageIDs) throws + //// WikiApiException { + // // create the graph as a directed Graph + // // algorithms that need to be called on a undirected graph or should ignore direction + // // can be called on an AsUndirectedGraph view of the directed graph + // graph = new DefaultDirectedGraph(DefaultEdge.class); + // + // wiki = pWiki; + // + // degreeDistribution = new HashMap(); + // + // for (int pageID : pPageIDs) { + // graph.addVertex(pageID); + // } + // + // // add edges + // logger.info(OS.getUsedMemory() + " MB memory used."); + // int progress = 0; + // for (int pageID : pPageIDs) { + // progress++; + // ApiUtilities.printProgressInfo(progress, pPageIDs.size(), 10, + //// ApiUtilities.ProgressInfoMode.TEXT, "Adding edges"); + // + // long hibernateID = pWiki.__getHibernateId(pageID); + // if (hibernateID == -1) { + // throw new WikiApiException(pageID + " is not a valid pageID"); + // } + // + // // get the category + // Category cat; + // try { + // cat = new Category(this.wiki, hibernateID); + // } catch (WikiPageNotFoundException e) { + // throw new WikiApiException("Category not found"); + // } + // + // // get parents and children + // // if the corresponding nodes are in the graph (it could be a subset) => add them to the + //// graph + // Set inLinks = cat.__getInlinkIDs(); + // Set outLinks = cat.__getOutlinkIDs(); + // + // // add edges + // // If an edge already exits, it is silenty ignored by JGraphT. So we do not have to check + //// this. 
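// Illustrative, self-contained sketch (example class name and ids are ours) of the graph layout
// built by constructCategoryGraph() above: vertices are page ids, each parent->child link becomes
// a directed edge, a duplicate addEdge() call is silently ignored by DefaultDirectedGraph, and
// AsUndirectedGraph provides the direction-agnostic view used by the path computations.
import org.jgrapht.Graph;
import org.jgrapht.graph.AsUndirectedGraph;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;

class CategoryGraphLayoutExample
{
    public static void main(String[] args)
    {
        DefaultDirectedGraph<Integer, DefaultEdge> graph =
                new DefaultDirectedGraph<>(DefaultEdge.class);

        // page ids of three example categories: 1 is the parent of 2 and 3
        graph.addVertex(1);
        graph.addVertex(2);
        graph.addVertex(3);
        graph.addEdge(1, 2);
        graph.addEdge(1, 3);
        graph.addEdge(1, 2); // duplicate edge: ignored, the edge count stays at 2

        Graph<Integer, DefaultEdge> undirected = new AsUndirectedGraph<>(graph);

        System.out.println(graph.edgeSet().size());      // 2
        System.out.println(undirected.edgeSet().size()); // 2, but direction is ignored
    }
}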
+ // for (int inLink : inLinks) { + // if (pPageIDs.contains(inLink)) { + // if (inLink == pageID) { + // logger.warn("Self-loop for node " + pageID + " (" + cat.getTitle() + ")"); + // } + // else { + // graph.addEdge(inLink, pageID); + // } + // } + // } + // for (int outLink : outLinks) { + // if (pPageIDs.contains(outLink)) { + // if (outLink == pageID) { + // logger.warn("Self-loop for node " + pageID + " (" + cat.getTitle() + ")"); + // } + // else { + // graph.addEdge(pageID, outLink); + // } + // } + // } + // } + // + // logger.info("Added " + this.getNumberOfNodes() + " nodes."); + // logger.info("Added " + this.getNumberOfEdges() + " edges."); + // + // CycleHandler cycleHandler = new CycleHandler(wiki, this); + // logger.info("Graph contains cycles: " + cycleHandler.containsCycle()); + // cycleHandler.removeCycles(); + // logger.info("Graph contains cycles: " + cycleHandler.containsCycle()); + // + // this.depth = getDepth(); + // logger.info(this.depth); + // } + + /** + * Checks whether the category title matches the filter (a filter matches a string, if the + * string starts with the filter expression). + * + * @param cat + * A category. + * @param filterList + * A list of filter strings. + * @return True, if the category title starts with or is equal to a string in the filter list. + * False, otherwise. + * @throws WikiTitleParsingException + * Thrown if errors occurred. + */ + private boolean matchesFilter(Category cat, List filterList) + throws WikiTitleParsingException + { + String categoryTitle = cat.getTitle().getPlainTitle(); + for (String filter : filterList) { + if (categoryTitle.startsWith(filter)) { + logger.info(categoryTitle + " starts with " + filter + " => removing"); + return true; + } } - } + return false; } - logger.debug("No lcs found."); - - return -1; - } - - /** - * Gets the lowest common subsumer (LCS) of two nodes. - * The LCS of two nodes is first node on the path to the root, that has both nodes as sons. - * Nodes that are not in the same connected component as the root node are defined to have no LCS. - * - * @param categoryPageId1 The pageid of the first category node. - * @param categoryPageId2 The pageid of the second category node. - * @return The lowest common subsumer of the two nodes, or null if there is no LCS. - */ - public Category getLCS(int categoryPageId1, int categoryPageId2) throws WikiApiException { - int lcsid = getLCSId(categoryPageId1, categoryPageId2); - return lcsid > -1 ? wiki.getCategory(getLCSId(categoryPageId1, categoryPageId2)) : null; - } - - -// /** -// * Gets the lowest common subsumer (LCS) of two nodes. -// * The LCS of two nodes is first node on their paths to the root that is shared between the nodes. -// * Nodes that are not in the same connected component as the root node are defined to have no LCS. -// * @param rootCategory The root node of the category hierarchy. -// * @param category1 The first category node. -// * @param category2 The second category node. -// * @return The lowest common subsumer of the two nodes, or null if there is no LCS. 
-// */ -// public Category getLCS(Category rootCategory, Category category1, Category category2) throws WikiApiException { -// -// int root = rootCategory.getPageId(); -// int node1 = category1.getPageId(); -// int node2 = category2.getPageId(); -// -//// TODO here might be a problem concerning multiple inheritence in the category graph, if there is more than one path of equal length to the root, the method will only find one, but may be the other (not found) LCS has a higher information content -// -// logger.debug("root: " + root); -// logger.debug("n1: " + node1); -// logger.debug("n2: " + node2); -// -// // if one of the nodes is not in the same connected component as the root node, we cannot get the LCS -// if (!undirectedGraph.containsVertex(node1) || !undirectedGraph.containsVertex(node2)) { -// logger.warn("Cannot get lowest common subsumer because the nodes are not in the same connected component."); -// return null; -// } -// -////TODO due to multiple inheritance there may be a non-shortest path that leads to a lcs below the root -//// this should be considered here!! -// // get the path from root node to node 1 -// List edgeList1 = DijkstraShortestPath.findPathBetween(undirectedGraph, node1, root); -// -// // get the path from root node to node 2 -// List edgeList2 = DijkstraShortestPath.findPathBetween(undirectedGraph, node2, root); -// -// // if one of the nodes is not in the same connected component as the root node, there is no path -// // return -1 in this case -// if (edgeList1 == null || edgeList2 == null) { -// return null; -// } -// -// // convert the edge lists to node sets -// List nodeList1 = edgeList2nodeList(edgeList1, root, node1); -// List nodeList2 = edgeList2nodeList(edgeList2, root, node2); -// -// logger.debug(edgeList1); -// logger.debug(edgeList2); -// logger.debug(nodeList1); -// logger.debug(nodeList2); -// -// // node 1 subsumes node 2 ? -// for (int tmpNode2 : nodeList2) { -// if (tmpNode2 == node1) { -// return wiki.__getCategory(node1); -// } -// } -// -// // node 2 subsumes node 1 ? -// for (int tmpNode1 : nodeList1) { -// if (tmpNode1 == node2) { -// return wiki.__getCategory(node2); -// } -// } -// // they have a lcs ? -// for (int tmpNode1 : nodeList1) { -// for (int tmpNode2 : nodeList2) { -// if (tmpNode1 == tmpNode2) { -// return wiki.__getCategory(tmpNode1); -// } -// } -// } -// -// return null; -// } - -// /** -// * Converts an edgeList as returned by the Dijkstra-Shortest-Path algorithm into a list of nodes on this path. -// * @param edgeList The list of edges of this path running from the searched node to the root node. -// * @return The corresponding list of nodes on the path running from the searched node to the root node. 
-// */ -// private List edgeList2nodeList(List edgeList, int root, int node) throws WikiApiException { -// Iterator it = edgeList.iterator(); -// -// List nodeList = new ArrayList(); -// // init with start node -// nodeList.add(node); -// int currentNode = node; -// -// while(it.hasNext()) { -// DefaultEdge currentEdge = it.next(); -// if (graph.getEdgeSource(currentEdge) != currentNode) { -// nodeList.add(graph.getEdgeSource(currentEdge)); -// currentNode = graph.getEdgeSource(currentEdge); -// } -// else if (graph.getEdgeTarget(currentEdge) != currentNode) { -// nodeList.add(graph.getEdgeTarget(currentEdge)); -// currentNode = graph.getEdgeTarget(currentEdge); -// } -// else { -// throw new WikiApiException("Path is broken"); -// } -// } -// return nodeList; -// } - - - /** - * Returns the shortest path from node to root as a list of pageIds of the nodes on the path. Node and root are included in the path node list. - * - * @param root The root node of the graph. - * @param node A node of the graph. - * @return The shortest path from node to root as a list of pagIs of the nodes on the path; or null if no path exists - * @throws WikiApiException Thrown if errors occurred. - */ - private List getPathToRoot(int root, int node) throws WikiApiException { - List pathToRoot = new LinkedList<>(); - List shortestPath = new ArrayList<>(); - - expandPath(root, node, pathToRoot, shortestPath); - - if (shortestPath.size() == 0) { - return null; - } else { - return shortestPath; + /** + * Gets the lowest common subsumer (LCS) of two nodes. The LCS of two nodes is first node on the + * path to the root, that has both nodes as sons. Nodes that are not in the same connected + * component as the root node are defined to have no LCS. + * + * @param category1 + * The first category node. + * @param category2 + * The second category node. + * @return The lowest common subsumer of the two nodes, or null if there is no LCS. + */ + public Category getLCS(Category category1, Category category2) throws WikiApiException + { + return getLCS(category1.getPageId(), category2.getPageId()); } - } - private void expandPath(int root, int currentNode, List currentPath, List shortestPath) { + /** + * Gets the lowest common subsumer (LCS) of two nodes. The LCS of two nodes is first node on the + * path to the root, that has both nodes as sons. Nodes that are not in the same connected + * component as the root node are defined to have no LCS. + * + * @param categoryPageId1 + * The pageid of the first category node. + * @param categoryPageId2 + * The pageid of the second category node. + * @return The pageId of the lowest common subsumer of the two nodes, or null if there is no + * LCS. + */ + public int getLCSId(int categoryPageId1, int categoryPageId2) throws WikiApiException + { + + // TODO here might be a problem concerning multiple inheritence in the category graph, if + // there is more than one path of equal length to the root, the method will only find one, + // but the other (not found) LCS may have a higher information content + // TODO is the lcs between the same node really defined or should this be handled in the + // measures (i.e. SR(n1,n1) = 1 per definitionem??) 
+ if (categoryPageId1 == categoryPageId2) { + return categoryPageId1; + } - // add the current node to the path - currentPath.add(currentNode); + List nodeList1 = getRootPathMap().get(categoryPageId1); + List nodeList2 = getRootPathMap().get(categoryPageId2); - // if root node reached, check whether it is a shortest path - if (currentNode == root) { - logger.debug("found root"); + // if one of the paths is null => return -1 + if (nodeList1 == null || nodeList2 == null || nodeList1.size() == 0 + || nodeList2.size() == 0) { + logger.debug("One of the node lists is null or empty!"); + return -1; + } - if (shortestPath.size() != 0) { - if (currentPath.size() < shortestPath.size()) { - logger.debug("setting new shortest path"); - shortestPath.clear(); - shortestPath.addAll(currentPath); + logger.debug(nodeList1.toString()); + logger.debug(nodeList2.toString()); + + // node 1 subsumes node 2 ? + for (int tmpNode2 : nodeList2) { + if (tmpNode2 == categoryPageId1) { + return categoryPageId1; + } } - } else { - logger.debug("initializing shortest path"); - shortestPath.addAll(currentPath); - } - } - // do not expand paths that are longer or equal than the current shortest path - // this is a runtime efficiency optimization! - if (shortestPath.size() != 0 && currentPath.size() >= shortestPath.size()) { - return; - } + // node 2 subsumes node 1 ? + for (int tmpNode1 : nodeList1) { + if (tmpNode1 == categoryPageId2) { + return categoryPageId2; + } + } + // they have a lcs ? + for (int tmpNode1 : nodeList1) { + for (int tmpNode2 : nodeList2) { + if (tmpNode1 == tmpNode2) { + return tmpNode1; + } + } + } - Set incomingEdges = this.graph.incomingEdgesOf(currentNode); + logger.debug("No lcs found."); - // no incoming edges => return path without adding this node - if (incomingEdges == null || incomingEdges.size() == 0) { - logger.debug("found non-root source"); - return; + return -1; } - for (DefaultEdge incomingEdge : incomingEdges) { - int sourceNode = graph.getEdgeSource(incomingEdge); - - if (sourceNode == currentNode) { - logger.warn("Source node equals current node."); - System.exit(1); - } - List savedPath = new LinkedList<>(currentPath); - expandPath(root, sourceNode, currentPath, shortestPath); - currentPath.clear(); - currentPath.addAll(savedPath); + /** + * Gets the lowest common subsumer (LCS) of two nodes. The LCS of two nodes is first node on the + * path to the root, that has both nodes as sons. Nodes that are not in the same connected + * component as the root node are defined to have no LCS. + * + * @param categoryPageId1 + * The pageid of the first category node. + * @param categoryPageId2 + * The pageid of the second category node. + * @return The lowest common subsumer of the two nodes, or null if there is no LCS. + */ + public Category getLCS(int categoryPageId1, int categoryPageId2) throws WikiApiException + { + int lcsid = getLCSId(categoryPageId1, categoryPageId2); + return lcsid > -1 ? wiki.getCategory(getLCSId(categoryPageId1, categoryPageId2)) : null; } - return; - } - - - /** - * Gets the path length between two category nodes - measured in "edges". - * - * @param node1 The first category node. - * @param node2 The second category node. - * @return The number of edges of the path between node1 and node2. 0, if the nodes are identical. -1, if no path exists. 
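// Illustrative, self-contained sketch of the root-path bookkeeping used by getLCSId() above and
// by getTaxonomicallyBoundPathLengthInEdges() further below: each list starts with the node
// itself and ends with the root, the LCS is the first node shared by both lists, and the
// taxonomically bound distance is the sum of the offsets at which that shared node occurs.
// Class name, method names and ids are ours, not from JWPL.
import java.util.Arrays;
import java.util.List;

class RootPathExample
{
    /** @return the page id of the first shared node, or -1 if the paths never meet. */
    static int lowestCommonSubsumer(List<Integer> rootPath1, List<Integer> rootPath2)
    {
        for (int candidate : rootPath1) {
            if (rootPath2.contains(candidate)) {
                return candidate;
            }
        }
        return -1;
    }

    /** @return the number of edges between the two nodes via their LCS, or -1 if there is none. */
    static int pathLengthInEdges(List<Integer> rootPath1, List<Integer> rootPath2)
    {
        for (int i = 0; i < rootPath1.size(); i++) {
            int j = rootPath2.indexOf(rootPath1.get(i));
            if (j >= 0) {
                return i + j;
            }
        }
        return -1;
    }

    public static void main(String[] args)
    {
        // paths to the root (id 1) for the nodes 5 and 6: 5 -> 3 -> 1 and 6 -> 3 -> 1
        List<Integer> path1 = Arrays.asList(5, 3, 1);
        List<Integer> path2 = Arrays.asList(6, 3, 1);

        System.out.println(lowestCommonSubsumer(path1, path2)); // 3
        System.out.println(pathLengthInEdges(path1, path2));    // 2 (5 - 3 - 6)
    }
}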
- */ - public int getPathLengthInEdges(Category node1, Category node2) { - if (this.graph.containsVertex(node1.getPageId()) && this.graph.containsVertex(node2.getPageId())) { - if (node1.getPageId() == node2.getPageId()) { - return 0; - } - - // get the path from root node to node 1 - GraphPath edgeList = DijkstraShortestPath.findPathBetween(undirectedGraph, node1.getPageId(), node2.getPageId()); - if (edgeList == null) { - return -1; - } else { - return edgeList.getLength(); - } - } - // if the given nodes are not in the category graph, return -1 - else { - return -1; + // /** + // * Gets the lowest common subsumer (LCS) of two nodes. + // * The LCS of two nodes is first node on their paths to the root that is shared between the + // nodes. + // * Nodes that are not in the same connected component as the root node are defined to have no + // LCS. + // * @param rootCategory The root node of the category hierarchy. + // * @param category1 The first category node. + // * @param category2 The second category node. + // * @return The lowest common subsumer of the two nodes, or null if there is no LCS. + // */ + // public Category getLCS(Category rootCategory, Category category1, Category category2) throws + // WikiApiException { + // + // int root = rootCategory.getPageId(); + // int node1 = category1.getPageId(); + // int node2 = category2.getPageId(); + // + //// TODO here might be a problem concerning multiple inheritence in the category graph, if + // there is more than one path of equal length to the root, the method will only find one, but + // may be the other (not found) LCS has a higher information content + // + // logger.debug("root: " + root); + // logger.debug("n1: " + node1); + // logger.debug("n2: " + node2); + // + // // if one of the nodes is not in the same connected component as the root node, we cannot get + // the LCS + // if (!undirectedGraph.containsVertex(node1) || !undirectedGraph.containsVertex(node2)) { + // logger.warn("Cannot get lowest common subsumer because the nodes are not in the same + // connected component."); + // return null; + // } + // + //// TODO due to multiple inheritance there may be a non-shortest path that leads to a lcs below + // the root + //// this should be considered here!! + // // get the path from root node to node 1 + // List edgeList1 = DijkstraShortestPath.findPathBetween(undirectedGraph, node1, + // root); + // + // // get the path from root node to node 2 + // List edgeList2 = DijkstraShortestPath.findPathBetween(undirectedGraph, node2, + // root); + // + // // if one of the nodes is not in the same connected component as the root node, there is no + // path + // // return -1 in this case + // if (edgeList1 == null || edgeList2 == null) { + // return null; + // } + // + // // convert the edge lists to node sets + // List nodeList1 = edgeList2nodeList(edgeList1, root, node1); + // List nodeList2 = edgeList2nodeList(edgeList2, root, node2); + // + // logger.debug(edgeList1); + // logger.debug(edgeList2); + // logger.debug(nodeList1); + // logger.debug(nodeList2); + // + // // node 1 subsumes node 2 ? + // for (int tmpNode2 : nodeList2) { + // if (tmpNode2 == node1) { + // return wiki.__getCategory(node1); + // } + // } + // + // // node 2 subsumes node 1 ? + // for (int tmpNode1 : nodeList1) { + // if (tmpNode1 == node2) { + // return wiki.__getCategory(node2); + // } + // } + // // they have a lcs ? 
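// Illustrative, self-contained sketch of the JGraphT calls used by getPathLengthInEdges():
// DijkstraShortestPath.findPathBetween() on the undirected view returns a GraphPath (null when
// the two vertices are not connected) whose getLength() is the number of edges on the path.
// The tiny example graph and the class name are ours, not JWPL data.
import org.jgrapht.Graph;
import org.jgrapht.GraphPath;
import org.jgrapht.alg.shortestpath.DijkstraShortestPath;
import org.jgrapht.graph.AsUndirectedGraph;
import org.jgrapht.graph.DefaultDirectedGraph;
import org.jgrapht.graph.DefaultEdge;

class UndirectedPathLengthExample
{
    public static void main(String[] args)
    {
        DefaultDirectedGraph<Integer, DefaultEdge> directed =
                new DefaultDirectedGraph<>(DefaultEdge.class);
        for (int v = 1; v <= 4; v++) {
            directed.addVertex(v);
        }
        directed.addEdge(1, 2); // 1 is the parent of 2 and 3, 3 is the parent of 4
        directed.addEdge(1, 3);
        directed.addEdge(3, 4);

        Graph<Integer, DefaultEdge> undirected = new AsUndirectedGraph<>(directed);

        GraphPath<Integer, DefaultEdge> path =
                DijkstraShortestPath.findPathBetween(undirected, 2, 4);
        System.out.println(path == null ? -1 : path.getLength()); // 3 (2 - 1 - 3 - 4)
    }
}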
+ // for (int tmpNode1 : nodeList1) { + // for (int tmpNode2 : nodeList2) { + // if (tmpNode1 == tmpNode2) { + // return wiki.__getCategory(tmpNode1); + // } + // } + // } + // + // return null; + // } + + // /** + // * Converts an edgeList as returned by the Dijkstra-Shortest-Path algorithm into a list of + // nodes on this path. + // * @param edgeList The list of edges of this path running from the searched node to the root + // node. + // * @return The corresponding list of nodes on the path running from the searched node to the + // root node. + // */ + // private List edgeList2nodeList(List edgeList, int root, int node) + // throws WikiApiException { + // Iterator it = edgeList.iterator(); + // + // List nodeList = new ArrayList(); + // // init with start node + // nodeList.add(node); + // int currentNode = node; + // + // while(it.hasNext()) { + // DefaultEdge currentEdge = it.next(); + // if (graph.getEdgeSource(currentEdge) != currentNode) { + // nodeList.add(graph.getEdgeSource(currentEdge)); + // currentNode = graph.getEdgeSource(currentEdge); + // } + // else if (graph.getEdgeTarget(currentEdge) != currentNode) { + // nodeList.add(graph.getEdgeTarget(currentEdge)); + // currentNode = graph.getEdgeTarget(currentEdge); + // } + // else { + // throw new WikiApiException("Path is broken"); + // } + // } + // return nodeList; + // } + + /** + * Returns the shortest path from node to root as a list of pageIds of the nodes on the path. + * Node and root are included in the path node list. + * + * @param root + * The root node of the graph. + * @param node + * A node of the graph. + * @return The shortest path from node to root as a list of pagIs of the nodes on the path; or + * null if no path exists + * @throws WikiApiException + * Thrown if errors occurred. + */ + private List getPathToRoot(int root, int node) throws WikiApiException + { + List pathToRoot = new LinkedList<>(); + List shortestPath = new ArrayList<>(); + + expandPath(root, node, pathToRoot, shortestPath); + + if (shortestPath.size() == 0) { + return null; + } + else { + return shortestPath; + } } - } - - /** - * Computing the path length in very large graphs like the Wikipedia category graph is very time consuming. - * However, we know that the graph is almost a taxonomy (it contains some cycles that can be removed). - * The path from each category to the root is stored in the rootPathMap. - * We can use this information to speed up computation dramatically. - * However, we might miss some shortest path to a node if there are multiple paths to the root. - *
- * It is very similar to finding the LCS. - * If there is no LCS, than there also is no path. - * If one of the nodes is on the path to the root, than we already know the distance. - * Otherwise the distance can be computed as the sum of the distance of node1 to the LCS + the distance of node2 to the LCS. - * - * @param cat1 The first category. - * @param cat2 The second category. - * @return The number of edges of the path between node1 and node2. 0, if the nodes are identical. -1, if no path exists. - * @throws WikiApiException Thrown if errors occurred. - */ - public int getTaxonomicallyBoundPathLengthInEdges(Category cat1, Category cat2) throws WikiApiException { - int node1 = cat1.getPageId(); - int node2 = cat2.getPageId(); - - // if the given nodes are not in the category graph, return -1 - if (!this.graph.containsVertex(node1) || !this.graph.containsVertex(node2)) { - return -1; + + private void expandPath(int root, int currentNode, List currentPath, + List shortestPath) + { + + // add the current node to the path + currentPath.add(currentNode); + + // if root node reached, check whether it is a shortest path + if (currentNode == root) { + logger.debug("found root"); + + if (shortestPath.size() != 0) { + if (currentPath.size() < shortestPath.size()) { + logger.debug("setting new shortest path"); + shortestPath.clear(); + shortestPath.addAll(currentPath); + } + } + else { + logger.debug("initializing shortest path"); + shortestPath.addAll(currentPath); + } + } + + // do not expand paths that are longer or equal than the current shortest path + // this is a runtime efficiency optimization! + if (shortestPath.size() != 0 && currentPath.size() >= shortestPath.size()) { + return; + } + + Set incomingEdges = this.graph.incomingEdgesOf(currentNode); + + // no incoming edges => return path without adding this node + if (incomingEdges == null || incomingEdges.size() == 0) { + logger.debug("found non-root source"); + return; + } + + for (DefaultEdge incomingEdge : incomingEdges) { + int sourceNode = graph.getEdgeSource(incomingEdge); + + if (sourceNode == currentNode) { + logger.warn("Source node equals current node."); + System.exit(1); + } + List savedPath = new LinkedList<>(currentPath); + expandPath(root, sourceNode, currentPath, shortestPath); + currentPath.clear(); + currentPath.addAll(savedPath); + } + + return; } - if (node1 == node2) { - return 0; + /** + * Gets the path length between two category nodes - measured in "edges". + * + * @param node1 + * The first category node. + * @param node2 + * The second category node. + * @return The number of edges of the path between node1 and node2. 0, if the nodes are + * identical. -1, if no path exists. + */ + public int getPathLengthInEdges(Category node1, Category node2) + { + if (this.graph.containsVertex(node1.getPageId()) + && this.graph.containsVertex(node2.getPageId())) { + if (node1.getPageId() == node2.getPageId()) { + return 0; + } + + // get the path from root node to node 1 + GraphPath edgeList = DijkstraShortestPath + .findPathBetween(undirectedGraph, node1.getPageId(), node2.getPageId()); + if (edgeList == null) { + return -1; + } + else { + return edgeList.getLength(); + } + } + // if the given nodes are not in the category graph, return -1 + else { + return -1; + } } + /** + * Computing the path length in very large graphs like the Wikipedia category graph is very time + * consuming. However, we know that the graph is almost a taxonomy (it contains some cycles that + * can be removed). 
The path from each category to the root is stored in the rootPathMap. We can + * use this information to speed up computation dramatically. However, we might miss some + * shortest path to a node if there are multiple paths to the root. + *
+ * It is very similar to finding the LCS. If there is no LCS, than there also is no path. If one + * of the nodes is on the path to the root, than we already know the distance. Otherwise the + * distance can be computed as the sum of the distance of node1 to the LCS + the distance of + * node2 to the LCS. + * + * @param cat1 + * The first category. + * @param cat2 + * The second category. + * @return The number of edges of the path between node1 and node2. 0, if the nodes are + * identical. -1, if no path exists. + * @throws WikiApiException + * Thrown if errors occurred. + */ + public int getTaxonomicallyBoundPathLengthInEdges(Category cat1, Category cat2) + throws WikiApiException + { + int node1 = cat1.getPageId(); + int node2 = cat2.getPageId(); + + // if the given nodes are not in the category graph, return -1 + if (!this.graph.containsVertex(node1) || !this.graph.containsVertex(node2)) { + return -1; + } - List nodeList1 = getRootPathMap().get(node1); - List nodeList2 = getRootPathMap().get(node2); + if (node1 == node2) { + return 0; + } - // if one of the paths is null => return null - if (nodeList1 == null || nodeList2 == null || nodeList1.size() == 0 || nodeList2.size() == 0) { - logger.debug("One of the node lists is null or empty!"); - return -1; - } + List nodeList1 = getRootPathMap().get(node1); + List nodeList2 = getRootPathMap().get(node2); + + // if one of the paths is null => return null + if (nodeList1 == null || nodeList2 == null || nodeList1.size() == 0 + || nodeList2.size() == 0) { + logger.debug("One of the node lists is null or empty!"); + return -1; + } - logger.debug(nodeList1.toString()); - logger.debug(nodeList2.toString()); + logger.debug(nodeList1.toString()); + logger.debug(nodeList2.toString()); - // node1 is on path of node2 to the root - int distance1 = 0; - for (int tmpNode2 : nodeList2) { - if (tmpNode2 == node1) { - return distance1; - } - distance1++; + // node1 is on path of node2 to the root + int distance1 = 0; + for (int tmpNode2 : nodeList2) { + if (tmpNode2 == node1) { + return distance1; + } + distance1++; + } + + // node2 is on path of node1 to the root + int distance2 = 0; + for (int tmpNode1 : nodeList1) { + if (tmpNode1 == node2) { + return distance2; + } + distance2++; + } + + // they have a lcs ? + distance1 = 0; + for (int tmpNode1 : nodeList1) { + distance2 = 0; + for (int tmpNode2 : nodeList2) { + if (tmpNode1 == tmpNode2) { + return distance1 + distance2; + } + distance2++; + } + distance1++; + } + + return -1; } - // node2 is on path of node1 to the root - int distance2 = 0; - for (int tmpNode1 : nodeList1) { - if (tmpNode1 == node2) { - return distance2; - } - distance2++; + public int getTaxonomicallyBoundPathLengthInNodes(Category cat1, Category cat2) + throws WikiApiException + { + int retValue = getTaxonomicallyBoundPathLengthInEdges(cat1, cat2); + + if (retValue == 0) { + return 0; + } + else if (retValue > 0) { + return (--retValue); + } + else if (retValue == -1) { + return -1; + } + else { + throw new WikiApiException("Unknown return value."); + } } - // they have a lcs ? - distance1 = 0; - for (int tmpNode1 : nodeList1) { - distance2 = 0; - for (int tmpNode2 : nodeList2) { - if (tmpNode1 == tmpNode2) { - return distance1 + distance2; - } - distance2++; - } - distance1++; + /** + * Gets the path length between two category nodes - measured in "nodes". + * + * @param node1 + * The first node. + * @param node2 + * The second node. + * @return The number of nodes of the path between node1 and node2. 
0, if the nodes are + * identical or neighbors. -1, if no path exists. + */ + public int getPathLengthInNodes(Category node1, Category node2) throws WikiApiException + { + + int retValue = getPathLengthInEdges(node1, node2); + + if (retValue == 0) { + return 0; + } + else if (retValue > 0) { + return (--retValue); + } + else if (retValue == -1) { + return -1; + } + else { + throw new WikiApiException("Unknown return value."); + } } - return -1; - } + /** + * Creates the hyponym map, that maps from nodes to their (recursive) number of hyponyms for + * each node. "recursive" means that the hyponyms of hyponyms are also taken into account. + * + * @throws WikiApiException + */ + private void createHyponymCountMap() throws WikiApiException + { + // do only create hyponymMap, if it was not already computed + if (hyponymCountMap != null) { + return; + } - public int getTaxonomicallyBoundPathLengthInNodes(Category cat1, Category cat2) throws WikiApiException { - int retValue = getTaxonomicallyBoundPathLengthInEdges(cat1, cat2); + File hyponymCountMapSerializedFile = new File( + wiki.getWikipediaId() + "_" + hyponymCountMapFilename); + hyponymCountMap = new HashMap<>(); - if (retValue == 0) { - return 0; - } else if (retValue > 0) { - return (--retValue); - } else if (retValue == -1) { - return -1; - } else { - throw new WikiApiException("Unknown return value."); + if (hyponymCountMapSerializedFile.exists()) { + logger.info("Loading saved hyponymyCountMap ..."); + hyponymCountMap = (Map) this + .deserializeMap(hyponymCountMapSerializedFile); + logger.info("Done loading saved hyponymyCountMap"); + return; + } + + // a queue holding the nodes to process + + // In the category graph a node may have more than one father. + // Thus, we check whether a node was already visited. + // Then, it is not expanded again. + Set visited = new HashSet<>(); + + // initialize the queue with all leaf nodes + Set leafNodes = this.__getLeafNodes(); + List queue = new ArrayList<>(leafNodes); + + logger.info(leafNodes.size() + " leaf nodes."); + + // while the queue is not empty + while (!queue.isEmpty()) { + // remove first element from queue + int currNode = queue.get(0); + queue.remove(0); + + // logger.info(queue.size()); + + if (visited.contains(currNode)) { + continue; + } + + Set children = __getChildren(currNode); + + int validChildren = 0; + int sumChildHyponyms = 0; + boolean invalid = false; + for (int child : children) { + if (graph.containsVertex(child)) { + if (hyponymCountMap.containsKey(child)) { + sumChildHyponyms += hyponymCountMap.get(child); + validChildren++; + } + else { + invalid = true; + } + } + } + + if (invalid) { + // One of the childs is not in the hyponymCountMap yet + // Re-Enter the node into the queue and continue with next node + queue.add(currNode); + continue; + } + + // mark as visited + visited.add(currNode); + + // number of hyponomys of current node is the number of its own hyponomies and the sum + // of the hyponomies of its children. 
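// Illustrative, self-contained sketch of the bottom-up counting performed by
// createHyponymCountMap(): start from the leaves, put a node back into the queue while one of
// its children has no count yet, and otherwise store |children| + sum of the children's counts.
// The toy taxonomy (1 -> {2, 3}, 3 -> {4}), the class name and the variable names are ours.
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

class HyponymCountExample
{
    public static void main(String[] args)
    {
        Map<Integer, List<Integer>> children = Map.of(
                1, List.of(2, 3),
                2, List.of(),
                3, List.of(4),
                4, List.of());
        Map<Integer, Integer> parentOf = Map.of(2, 1, 3, 1, 4, 3);

        Map<Integer, Integer> hyponymCount = new HashMap<>();
        Deque<Integer> queue = new ArrayDeque<>(List.of(2, 4)); // the leaf nodes

        while (!queue.isEmpty()) {
            int node = queue.removeFirst();
            if (hyponymCount.containsKey(node)) {
                continue; // already counted via another parent
            }

            int sum = 0;
            boolean allChildrenCounted = true;
            for (int child : children.get(node)) {
                Integer childCount = hyponymCount.get(child);
                if (childCount == null) {
                    allChildrenCounted = false;
                    break;
                }
                sum += 1 + childCount; // the child itself plus its own hyponyms
            }

            if (!allChildrenCounted) {
                queue.addLast(node); // retry once the missing child has been counted
                continue;
            }

            hyponymCount.put(node, sum);
            Integer parent = parentOf.get(node);
            if (parent != null) {
                queue.addLast(parent);
            }
        }

        System.out.println(hyponymCount.get(1)); // 3 (the nodes 2, 3 and 4)
        System.out.println(hyponymCount.get(3)); // 1 (the node 4)
    }
}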
+ int currNodeHyponomyCount = validChildren + sumChildHyponyms; + hyponymCountMap.put(currNode, currNodeHyponomyCount); + + // add parents of current node to queue + for (int parent : __getParents(currNode)) { + if (graph.containsVertex(parent)) { + queue.add(parent); + } + } + + } // while queue not empty + + logger.info(visited.size() + " nodes visited"); + if (visited.size() != graph.vertexSet().size()) { + throw new WikiApiException("Visited only " + visited.size() + " out of " + + graph.vertexSet().size() + " nodes."); + } + if (hyponymCountMap.size() != graph.vertexSet().size()) { + throw new WikiApiException( + "HyponymCountMap does not contain an entry for each node in the graph." + + hyponymCountMap.size() + "/" + graph.vertexSet().size()); + } + + scaleHyponymCountMap(); + logger.info("Computed hyponymCountMap"); + serializeMap(hyponymCountMap, hyponymCountMapSerializedFile); + logger.info("Serialized hyponymCountMap"); } - } - - - /** - * Gets the path length between two category nodes - measured in "nodes". - * - * @param node1 The first node. - * @param node2 The second node. - * @return The number of nodes of the path between node1 and node2. 0, if the nodes are identical or neighbors. -1, if no path exists. - */ - public int getPathLengthInNodes(Category node1, Category node2) throws WikiApiException { - - int retValue = getPathLengthInEdges(node1, node2); - - if (retValue == 0) { - return 0; - } else if (retValue > 0) { - return (--retValue); - } else if (retValue == -1) { - return -1; - } else { - throw new WikiApiException("Unknown return value."); + + /** + * As the categoryGraph is a graph rather than a tree, the hyponymCount for top nodes can be + * greater than the number of nodes in the graph. This is due to the multiple counting of nodes + * having more than one parent. Thus, we have to scale hyponym counts to fall in + * [0,NumberOfNodes]. + * + * @throws WikiApiException + * Thrown if errors occurred. + */ + private void scaleHyponymCountMap() throws WikiApiException + { + for (int key : getHyponymCountMap().keySet()) { + if (getHyponymCountMap().get(key) > graph.vertexSet().size()) { + // TODO scaling function is not optimal (to say the least :) + getHyponymCountMap().put(key, (graph.vertexSet().size() - 1)); + } + } } - } - - /** - * Creates the hyponym map, that maps from nodes to their (recursive) number of hyponyms for each node. - * "recursive" means that the hyponyms of hyponyms are also taken into account. - * - * @throws WikiApiException - */ - private void createHyponymCountMap() throws WikiApiException { - // do only create hyponymMap, if it was not already computed - if (hyponymCountMap != null) { - return; + + /** + * @return The leaf nodes of the graph, i.e. nodes with outdegree = 0. + * @throws WikiApiException + */ + protected Set __getLeafNodes() throws WikiApiException + { + Set leafNodes = new HashSet<>(); + for (int node : graph.vertexSet()) { + if (getOutDegree(node) == 0) { + leafNodes.add(node); + } + } + return leafNodes; } - File hyponymCountMapSerializedFile = new File(wiki.getWikipediaId() + "_" + hyponymCountMapFilename); - hyponymCountMap = new HashMap<>(); + //// The method did not consider that IC has to monotonically decrease from leaves to root node + // /** + // * Intrinsic information content (Seco Etal. 2004) allows to compute information content from + //// the structure of the taxonomy (no corpus needed). 
+ // * IC(n) = 1 - log( hypo(n) + 1) / log(#cat) + // * hypo(n) is the number of hyponyms of a node n + // * #cat is the number of categories in the graph + // * @param numberOfHyponyms + // * @param numberOfCategories + // * @return The intrinsic information content. + // */ + // private double computeIntrinsicInformationContent(int numberOfHyponyms, int + //// numberOfCategories) { + // return (1 - (Math.log(numberOfHyponyms + 1) / Math.log(numberOfCategories)) ); + // } + + /** + * Intrinsic information content (Seco Etal. 2004) allows to compute information content from + * the structure of the taxonomy (no corpus needed). IC(n) = 1 - log( hypo(n) + 1) / log(#cat) + * hypo(n) is the (recursive) number of hyponyms of a node n. Recursive means that the hyponyms + * of hyponyms are also taken into account #cat is the number of categories in the graph + * + * @param category + * The category node for which the intrinsic information content should be returned. + * @return The intrinsic information content for this category node. + * @throws WikiApiException + * Thrown if errors occurred. + */ + public double getIntrinsicInformationContent(Category category) throws WikiApiException + { + int node = category.getPageId(); + + int hyponymCount = getHyponymCountMap().get(node); + int numberOfNodes = this.getNumberOfNodes(); + + if (hyponymCount > numberOfNodes) { + throw new WikiApiException("Something is wrong with the hyponymCountMap. " + + hyponymCount + " hyponyms, but only " + numberOfNodes + " nodes."); + } - if (hyponymCountMapSerializedFile.exists()) { - logger.info("Loading saved hyponymyCountMap ..."); - hyponymCountMap = (Map) this.deserializeMap(hyponymCountMapSerializedFile); - logger.info("Done loading saved hyponymyCountMap"); - return; + logger.debug(category.getTitle().getPlainTitle() + " has # hyponyms: " + hyponymCount); + + double intrinsicIC = -1; + if (hyponymCount >= 0) { + intrinsicIC = (1 - (Math.log(hyponymCount + 1) / Math.log(numberOfNodes))); + } + return intrinsicIC; } - // a queue holding the nodes to process + /** + * Computes the paths from each category node to the root. Computing n paths will take some + * time. Thus, efficient computing is based on the assumption that all subpaths in the shortest + * path to the root, are also shortest paths for the corresponding nodes. Starting with the leaf + * nodes gives the longest initial paths with most subpaths. + * + * @throws WikiApiException + * Thrown if errors occurred. + */ + public void createRootPathMap() throws WikiApiException + { + + // do only create rootPathMap, if it was not already computed + if (rootPathMap != null) { + return; + } - // In the category graph a node may have more than one father. - // Thus, we check whether a node was already visited. - // Then, it is not expanded again. 
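// Illustrative, self-contained sketch of the formula used by getIntrinsicInformationContent()
// above: IC(n) = 1 - log(hypo(n) + 1) / log(#cat). A leaf (hypo(n) = 0) gets IC = 1, while a
// node subsuming all other nodes gets IC = 0. The numbers below are made up for the example.
class IntrinsicInformationContentExample
{
    static double intrinsicIC(int hyponymCount, int numberOfNodes)
    {
        return 1 - (Math.log(hyponymCount + 1) / Math.log(numberOfNodes));
    }

    public static void main(String[] args)
    {
        int numberOfNodes = 1000;
        System.out.println(intrinsicIC(0, numberOfNodes));   // 1.0  (leaf node)
        System.out.println(intrinsicIC(9, numberOfNodes));   // ~0.67
        System.out.println(intrinsicIC(999, numberOfNodes)); // 0.0  (subsumes all other nodes)
    }
}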
- Set visited = new HashSet<>(); + File rootPathFile = new File(wiki.getWikipediaId() + "_" + this.rootPathMapFilename); - // initialize the queue with all leaf nodes - Set leafNodes = this.__getLeafNodes(); - List queue = new ArrayList<>(leafNodes); + // try to load rootPathMap from precomputed file + if (rootPathFile.exists()) { + logger.info("Loading saved rootPathMap ..."); + rootPathMap = (Map>) deserializeMap(rootPathFile); + logger.info("Done loading saved rootPathMap"); + return; + } - logger.info(leafNodes.size() + " leaf nodes."); + logger.info("Computing rootPathMap"); + rootPathMap = new HashMap<>(); - // while the queue is not empty - while (!queue.isEmpty()) { - // remove first element from queue - int currNode = queue.get(0); - queue.remove(0); + // a queue holding the nodes to process + List queue = new ArrayList<>(); - // logger.info(queue.size()); + // initialize the queue with all leaf nodes + Set leafNodes = this.__getLeafNodes(); + queue.addAll(leafNodes); - if (visited.contains(currNode)) { - continue; - } + logger.info(queue.size() + " leaf nodes."); + fillRootPathMap(queue); - Set children = __getChildren(currNode); + queue.clear(); // queue should be empty now, but clear anyway - int validChildren = 0; - int sumChildHyponyms = 0; - boolean invalid = false; - for (int child : children) { - if (graph.containsVertex(child)) { - if (hyponymCountMap.containsKey(child)) { - sumChildHyponyms += hyponymCountMap.get(child); - validChildren++; - } else { - invalid = true; - } + // add non-leaf nodes that have not been on a shortest, yet + for (Category cat : wiki.getCategories()) { + if (!rootPathMap.containsKey(cat.getPageId())) { + queue.add(cat.getPageId()); + } } - } - - if (invalid) { - // One of the childs is not in the hyponymCountMap yet - // Re-Enter the node into the queue and continue with next node - queue.add(currNode); - continue; - } - // mark as visited - visited.add(currNode); + logger.info(queue.size() + " non leaf nodes not on a shortest leaf-node to root path."); + fillRootPathMap(queue); - // number of hyponomys of current node is the number of its own hyponomies and the sum of the hyponomies of its children. - int currNodeHyponomyCount = validChildren + sumChildHyponyms; - hyponymCountMap.put(currNode, currNodeHyponomyCount); - - // add parents of current node to queue - for (int parent : __getParents(currNode)) { - if (graph.containsVertex(parent)) { - queue.add(parent); + for (Category cat : wiki.getCategories()) { + if (!rootPathMap.containsKey(cat.getPageId())) { + logger.info("no path for " + cat.getPageId()); + } } - } - } // while queue not empty + // from the root path map, we can very easily get the depth + this.depth = getDepthFromRootPathMap(); - logger.info(visited.size() + " nodes visited"); - if (visited.size() != graph.vertexSet().size()) { - throw new WikiApiException("Visited only " + visited.size() + " out of " + graph.vertexSet().size() + " nodes."); - } - if (hyponymCountMap.size() != graph.vertexSet().size()) { - throw new WikiApiException("HyponymCountMap does not contain an entry for each node in the graph." 
+ hyponymCountMap.size() + "/" + graph.vertexSet().size()); - } + logger.info("Setting depth of category graph: " + this.depth); - scaleHyponymCountMap(); - logger.info("Computed hyponymCountMap"); - serializeMap(hyponymCountMap, hyponymCountMapSerializedFile); - logger.info("Serialized hyponymCountMap"); - } - - - /** - * As the categoryGraph is a graph rather than a tree, the hyponymCount for top nodes can be greater than the number of nodes in the graph. - * This is due to the multiple counting of nodes having more than one parent. - * Thus, we have to scale hyponym counts to fall in [0,NumberOfNodes]. - * - * @throws WikiApiException Thrown if errors occurred. - */ - private void scaleHyponymCountMap() throws WikiApiException { - for (int key : getHyponymCountMap().keySet()) { - if (getHyponymCountMap().get(key) > graph.vertexSet().size()) { -// TODO scaling function is not optimal (to say the least :) - getHyponymCountMap().put(key, (graph.vertexSet().size() - 1)); - } - } - } - - /** - * @return The leaf nodes of the graph, i.e. nodes with outdegree = 0. - * @throws WikiApiException - */ - protected Set __getLeafNodes() throws WikiApiException { - Set leafNodes = new HashSet<>(); - for (int node : graph.vertexSet()) { - if (getOutDegree(node) == 0) { - leafNodes.add(node); - } + logger.info("Serializing rootPathMap"); + this.serializeMap(rootPathMap, rootPathFile); } - return leafNodes; - } - -//// The method did not consider that IC has to monotonically decrease from leaves to root node -// /** -// * Intrinsic information content (Seco Etal. 2004) allows to compute information content from the structure of the taxonomy (no corpus needed). -// * IC(n) = 1 - log( hypo(n) + 1) / log(#cat) -// * hypo(n) is the number of hyponyms of a node n -// * #cat is the number of categories in the graph -// * @param numberOfHyponyms -// * @param numberOfCategories -// * @return The intrinsic information content. -// */ -// private double computeIntrinsicInformationContent(int numberOfHyponyms, int numberOfCategories) { -// return (1 - (Math.log(numberOfHyponyms + 1) / Math.log(numberOfCategories)) ); -// } - - /** - * Intrinsic information content (Seco Etal. 2004) allows to compute information content from the structure of the taxonomy (no corpus needed). - * IC(n) = 1 - log( hypo(n) + 1) / log(#cat) - * hypo(n) is the (recursive) number of hyponyms of a node n. Recursive means that the hyponyms of hyponyms are also taken into account - * #cat is the number of categories in the graph - * - * @param category The category node for which the intrinsic information content should be returned. - * @return The intrinsic information content for this category node. - * @throws WikiApiException Thrown if errors occurred. - */ - public double getIntrinsicInformationContent(Category category) throws WikiApiException { - int node = category.getPageId(); - - int hyponymCount = getHyponymCountMap().get(node); - int numberOfNodes = this.getNumberOfNodes(); - - if (hyponymCount > numberOfNodes) { - throw new WikiApiException("Something is wrong with the hyponymCountMap. " + hyponymCount + " hyponyms, but only " + numberOfNodes + " nodes."); + + // TODO the method is only public, because the test deletes the file after creating it - I have + // no idea at the moment how to do it + + /** + * Deleted the root path map file. + * + * @throws WikiApiException + * Thrown if errors occurred. 
+ */ + public void deleteRootPathMap() throws WikiApiException + { + File rootPathFile = new File(this.rootPathMapFilename + "_" + wiki.getLanguage() + "_" + + wiki.getMetaData().getVersion()); + rootPathFile.delete(); } - logger.debug(category.getTitle().getPlainTitle() + " has # hyponyms: " + hyponymCount); + private void fillRootPathMap(List queue) throws WikiApiException + { + int root = wiki.getMetaData().getMainCategory().getPageId(); + + // while the queue is not empty + while (!queue.isEmpty()) { + // remove first element from queue + int currentNode = queue.get(0); + queue.remove(0); + + logger.debug("Queue size: " + queue.size()); + + // if we have already insert a path for this node => continue with the next + if (getRootPathMap().containsKey(currentNode)) { + continue; + } + + // compute path from current node to root + List nodesOnPath = getPathToRoot(root, currentNode); + + // if there is no path => skip + if (nodesOnPath == null) { + getRootPathMap().put(currentNode, new ArrayList<>()); + continue; + } + + // the first entry should be the current Node, the last entry should be the root + // check whether this assumption is valid + if (nodesOnPath.get(0) != currentNode || // the first node of the list should always be + // the current node + nodesOnPath.get(nodesOnPath.size() - 1) != root) { // the last node of the list + // should always be the root + // node + logger.error("Something is wrong with the path to the root"); + logger.error(nodesOnPath.get(0) + " -- " + currentNode); + logger.error(nodesOnPath.get(nodesOnPath.size() - 1) + " -- " + root); + logger.error("size = {}", nodesOnPath.size()); + System.exit(1); + } + + int i = 0; + for (int nodeOnPath : nodesOnPath) { + // if we have already insert a path for this node => continue with the next + if (getRootPathMap().containsKey(nodeOnPath)) { + continue; + } + // insert path + else { + getRootPathMap().put(nodeOnPath, + new ArrayList<>(nodesOnPath.subList(i, nodesOnPath.size()))); + } + i++; + } + } // while queue not empty + } - double intrinsicIC = -1; - if (hyponymCount >= 0) { - intrinsicIC = (1 - (Math.log(hyponymCount + 1) / Math.log(numberOfNodes))); + /** + * @param pageID + * The pageID of the category. + * @return The indegree of the given category. + */ + protected int getInDegree(int pageID) + { + return graph.inDegreeOf(pageID); } - return intrinsicIC; - } - - /** - * Computes the paths from each category node to the root. - * Computing n paths will take some time. - * Thus, efficient computing is based on the assumption that all subpaths in the shortest path to the root, are also shortest paths for the corresponding nodes. - * Starting with the leaf nodes gives the longest initial paths with most subpaths. - * - * @throws WikiApiException Thrown if errors occurred. - */ - public void createRootPathMap() throws WikiApiException { - - // do only create rootPathMap, if it was not already computed - if (rootPathMap != null) { - return; + + /** + * @param pageID + * The pageID of the category. + * @return The outdegree of the given category. + */ + protected int getOutDegree(int pageID) + { + return graph.outDegreeOf(pageID); } - File rootPathFile = new File(wiki.getWikipediaId() + "_" + this.rootPathMapFilename); + /** + * @param pageID + * The pageID of the category. + * @return A set of child nodes of the given category. 
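// Illustrative, self-contained sketch of the sub-path reuse in fillRootPathMap() above: once a
// shortest path node -> ... -> root has been computed, every suffix of it is itself a path to
// the root, so it can be stored for the corresponding intermediate node and that node never has
// to be expanded again. The class name and the example ids are ours.
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

class RootPathReuseExample
{
    public static void main(String[] args)
    {
        Map<Integer, List<Integer>> rootPathMap = new HashMap<>();

        // shortest path from node 7 to the root 1, computed once: 7 -> 4 -> 2 -> 1
        List<Integer> nodesOnPath = List.of(7, 4, 2, 1);

        for (int i = 0; i < nodesOnPath.size(); i++) {
            // store the suffix starting at this node, unless a path is already known for it
            rootPathMap.putIfAbsent(nodesOnPath.get(i),
                    new ArrayList<>(nodesOnPath.subList(i, nodesOnPath.size())));
        }

        System.out.println(rootPathMap.get(4)); // [4, 2, 1]
        System.out.println(rootPathMap.get(2)); // [2, 1]
    }
}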
+ */ + protected Set __getChildren(int pageID) + { + Set outgoingEdges = graph.outgoingEdgesOf(pageID); + Set outLinks = new HashSet<>(); + for (DefaultEdge edge : outgoingEdges) { + outLinks.add(graph.getEdgeTarget(edge)); + } + return outLinks; + } - // try to load rootPathMap from precomputed file - if (rootPathFile.exists()) { - logger.info("Loading saved rootPathMap ..."); - rootPathMap = (Map>) deserializeMap(rootPathFile); - logger.info("Done loading saved rootPathMap"); - return; + /** + * @param pageID + * The pageID of the category. + * @return A set of parent nodes of the given category. + */ + protected Set __getParents(int pageID) + { + Set incomingEdges = graph.incomingEdgesOf(pageID); + Set inLinks = new HashSet<>(); + for (DefaultEdge edge : incomingEdges) { + inLinks.add(graph.getEdgeSource(edge)); + } + return inLinks; } - logger.info("Computing rootPathMap"); - rootPathMap = new HashMap<>(); + /** + * @return Returns the largest connected component as a new graph. If the base graph already is + * connected, it simply returns the whole graph. + */ + public CategoryGraph getLargestConnectedComponent() throws WikiApiException + { + ConnectivityInspector connectInspect = new ConnectivityInspector<>( + graph); + + // if the graph is connected, simply return the whole graph + if (connectInspect.isConnected()) { + return this; + } - // a queue holding the nodes to process - List queue = new ArrayList<>(); + // else, get the largest connected component + List> connectedComponentList = connectInspect.connectedSets(); - // initialize the queue with all leaf nodes - Set leafNodes = this.__getLeafNodes(); - queue.addAll(leafNodes); + logger.info(connectedComponentList.size() + " connected components."); - logger.info(queue.size() + " leaf nodes."); - fillRootPathMap(queue); + int i = 0; + int maxSize = 0; + Set largestComponent = new HashSet<>(); + for (Set connectedComponent : connectedComponentList) { + i++; + if (connectedComponent.size() > maxSize) { + maxSize = connectedComponent.size(); + largestComponent = connectedComponent; + } + } - queue.clear(); // queue should be empty now, but clear anyway + double largestComponentRatio = largestComponent.size() * 100 / this.getNumberOfNodes(); + logger.info("Largest component contains " + largestComponentRatio + "% (" + + largestComponent.size() + "/" + this.getNumberOfNodes() + + ") of the nodes in the graph."); - // add non-leaf nodes that have not been on a shortest, yet - for (Category cat : wiki.getCategories()) { - if (!rootPathMap.containsKey(cat.getPageId())) { - queue.add(cat.getPageId()); - } + return CategoryGraphManager.getCategoryGraph(wiki, largestComponent); } - logger.info(queue.size() + " non leaf nodes not on a shortest leaf-node to root path."); - fillRootPathMap(queue); - - for (Category cat : wiki.getCategories()) { - if (!rootPathMap.containsKey(cat.getPageId())) { - logger.info("no path for " + cat.getPageId()); - } + /** + * Get the number of nodes in the graph. + * + * @return The number of nodes in the graph. 
+ */ + public int getNumberOfNodes() + { + return numberOfNodes; } - // from the root path map, we can very easily get the depth - this.depth = getDepthFromRootPathMap(); - - logger.info("Setting depth of category graph: " + this.depth); - - logger.info("Serializing rootPathMap"); - this.serializeMap(rootPathMap, rootPathFile); - } - - // TODO the method is only public, because the test deletes the file after creating it - I have no idea at the moment how to do it - - /** - * Deleted the root path map file. - * - * @throws WikiApiException Thrown if errors occurred. - */ - public void deleteRootPathMap() throws WikiApiException { - File rootPathFile = new File(this.rootPathMapFilename + "_" + wiki.getLanguage() + "_" + wiki.getMetaData().getVersion()); - rootPathFile.delete(); - } - - private void fillRootPathMap(List queue) throws WikiApiException { - int root = wiki.getMetaData().getMainCategory().getPageId(); - - // while the queue is not empty - while (!queue.isEmpty()) { - // remove first element from queue - int currentNode = queue.get(0); - queue.remove(0); - - logger.debug("Queue size: " + queue.size()); - - // if we have already insert a path for this node => continue with the next - if (getRootPathMap().containsKey(currentNode)) { - continue; - } - - // compute path from current node to root - List nodesOnPath = getPathToRoot(root, currentNode); - - // if there is no path => skip - if (nodesOnPath == null) { - getRootPathMap().put(currentNode, new ArrayList<>()); - continue; - } - - // the first entry should be the current Node, the last entry should be the root - // check whether this assumption is valid - if (nodesOnPath.get(0) != currentNode || // the first node of the list should always be the current node - nodesOnPath.get(nodesOnPath.size() - 1) != root) { // the last node of the list should always be the root node - logger.error("Something is wrong with the path to the root"); - logger.error(nodesOnPath.get(0) + " -- " + currentNode); - logger.error(nodesOnPath.get(nodesOnPath.size() - 1) + " -- " + root); - logger.error("size = {}", nodesOnPath.size()); - System.exit(1); - } - - int i = 0; - for (int nodeOnPath : nodesOnPath) { - // if we have already insert a path for this node => continue with the next - if (getRootPathMap().containsKey(nodeOnPath)) { - continue; - } - // insert path - else { - getRootPathMap().put(nodeOnPath, new ArrayList<>(nodesOnPath.subList(i, nodesOnPath.size()))); - } - i++; - } - } // while queue not empty - } - - /** - * @param pageID The pageID of the category. - * @return The indegree of the given category. - */ - protected int getInDegree(int pageID) { - return graph.inDegreeOf(pageID); - } - - /** - * @param pageID The pageID of the category. - * @return The outdegree of the given category. - */ - protected int getOutDegree(int pageID) { - return graph.outDegreeOf(pageID); - } - - /** - * @param pageID The pageID of the category. - * @return A set of child nodes of the given category. - */ - protected Set __getChildren(int pageID) { - Set outgoingEdges = graph.outgoingEdgesOf(pageID); - Set outLinks = new HashSet<>(); - for (DefaultEdge edge : outgoingEdges) { - outLinks.add(graph.getEdgeTarget(edge)); + /** + * Get the number of edges in the graph. + * + * @return The number of edges in the graph. + */ + public int getNumberOfEdges() + { + return numberOfEdges; } - return outLinks; - } - - /** - * @param pageID The pageID of the category. - * @return A set of parent nodes of the given category. 
- */ - protected Set __getParents(int pageID) { - Set incomingEdges = graph.incomingEdgesOf(pageID); - Set inLinks = new HashSet<>(); - for (DefaultEdge edge : incomingEdges) { - inLinks.add(graph.getEdgeSource(edge)); - } - return inLinks; - } - - /** - * @return Returns the largest connected component as a new graph. If the base graph already is connected, it simply returns the whole graph. - */ - public CategoryGraph getLargestConnectedComponent() throws WikiApiException { - ConnectivityInspector connectInspect = new ConnectivityInspector<>(graph); - - // if the graph is connected, simply return the whole graph - if (connectInspect.isConnected()) { - return this; - } - - // else, get the largest connected component - List> connectedComponentList = connectInspect.connectedSets(); - logger.info(connectedComponentList.size() + " connected components."); - - int i = 0; - int maxSize = 0; - Set largestComponent = new HashSet<>(); - for (Set connectedComponent : connectedComponentList) { - i++; - if (connectedComponent.size() > maxSize) { - maxSize = connectedComponent.size(); - largestComponent = connectedComponent; - } + /** + * Computes the average of the path length between all pairs of nodes. The graph is treated as + * an undirected graph. Computing graph parameters requires touching all node pairs. Therefore, + * if one is called the others are computed as well and stored for later retrieval. + * + * @return The average of the shortest path lengths between all pairs of nodes. + */ + public double getAverageShortestPathLength() + { + if (averageShortestPathLength < 0) { // has not been initialized + logger.debug("Calling setGraphParameters"); + setGraphParameters(); + } + return averageShortestPathLength; } - double largestComponentRatio = largestComponent.size() * 100 / this.getNumberOfNodes(); - logger.info("Largest component contains " + largestComponentRatio + "% (" + largestComponent.size() + "/" + this.getNumberOfNodes() + ") of the nodes in the graph."); - - return CategoryGraphManager.getCategoryGraph(wiki, largestComponent); - } - - /** - * Get the number of nodes in the graph. - * - * @return The number of nodes in the graph. - */ - public int getNumberOfNodes() { - return numberOfNodes; - } - - /** - * Get the number of edges in the graph. - * - * @return The number of edges in the graph. - */ - public int getNumberOfEdges() { - return numberOfEdges; - } - - /** - * Computes the average of the path length between all pairs of nodes. - * The graph is treated as an undirected graph. - * Computing graph parameters requires touching all node pairs. - * Therefore, if one is called the others are computed as well and stored for later retrieval. - * - * @return The average of the shortest path lengths between all pairs of nodes. - */ - public double getAverageShortestPathLength() { - if (averageShortestPathLength < 0) { // has not been initialized - logger.debug("Calling setGraphParameters"); - setGraphParameters(); - } - return averageShortestPathLength; - } - - /** - * Computes the diameter of the graph (the maximum of the shortest path length between all pairs of nodes) - * The graph is treated as a undirected graph. - * Computing graph parameters requires touching all node pairs. - * Therefore, if one is called the others are computed as well and stored for later retrieval. - * - * @return The diameter of the graph. 
- */ - public double getDiameter() { - if (diameter < 0) { // has not been initialized - logger.debug("Calling setGraphParameters"); - setGraphParameters(); - } - return diameter; - } - - /** - * Computes the average degree. The degree of a node is the number of edges edges that it is connected with. - * The graph is treated as an undirected graph. - * Computing graph parameters requires touching all node pairs. - * Therefore, if one is called the others are computed as well and stored for later retrieval. - * - * @return The average degree of the graph. - */ - public double getAverageDegree() { - if (averageDegree < 0) { // has not been initialized - logger.debug("Calling setGraphParameters"); - setGraphParameters(); - } - return averageDegree; - } - - /** - * Compute the cluster coefficient of the graph (after Watts and Strogatz 1998) - * Cluster coefficient C is defined as the average of C_v over all edges. - * C_v is the fraction of the connections that exist between the neighbor nodes (k_v) of a vertex v and all allowable connections between the neighbors (k_v(k_v -1)/2). - * C_v = 2 * number of connections between / k_v*(k_v -1) - * - * @return The cluster coefficient. - */ - public double getClusterCoefficient() { - if (clusterCoefficient < 0) { // has not been initialized - logger.debug("Calling setGraphParameters"); - setGraphParameters(); - } - return clusterCoefficient; - } - - /** - * Computes the degree distribution. The degree of a node is the number of edges that it is connected with. - * The graph is treated as an undirected graph. - * Computing graph parameters requires touching all node pairs. - * Therefore, if one is called the others are computed as well and stored for later retrieval. - * - * @return A map with the degree distribution of the graph. - */ - public Map getDegreeDistribution() { - if (degreeDistribution == null) { // has not been initialized - logger.debug("Calling setGraphParameters"); - setGraphParameters(); - } - return degreeDistribution; - } - - - /** - * Get the number of connections that exist between the neighbors of a node. - * - * @param node The node under consideration. - * @return The number of connections that exist between the neighbors of node. - */ - private int getNumberOfNeighborConnections(int node) { - int numberOfConnections = 0; - - // get the set of neighbors - Set neighbors = getNeighbors(node); - - if (neighbors.size() > 0) { - // for each pair of neighbors, test if there is a connection - Object[] nodeArray = neighbors.toArray(); - // sort the Array so we can use a simple iteration with two for loops to access all pairs - Arrays.sort(nodeArray); - - for (int i = 0; i < neighbors.size(); i++) { - int outerNode = (Integer) nodeArray[i]; - for (int j = i + 1; j < neighbors.size(); j++) { - int innerNode = (Integer) nodeArray[j]; - // in case of a connection - increade connection counter - // order of the nodes doesn't matter for undirected graphs - if (undirectedGraph.containsEdge(innerNode, outerNode)) { - numberOfConnections++; - } - } - } + /** + * Computes the diameter of the graph (the maximum of the shortest path length between all pairs + * of nodes) The graph is treated as a undirected graph. Computing graph parameters requires + * touching all node pairs. Therefore, if one is called the others are computed as well and + * stored for later retrieval. + * + * @return The diameter of the graph. 
+ */ + public double getDiameter() + { + if (diameter < 0) { // has not been initialized + logger.debug("Calling setGraphParameters"); + setGraphParameters(); + } + return diameter; } -// logger.info(neighbors.size() + " - " + numberOfConnections); - - return numberOfConnections; - } - - /** - * Get the neighbors of a given node. - * The category graph is treated as an undirected graph. - * - * @param node the reference node. - * @return The set of category nodes that are neighbors of this category. - */ - protected Set getNeighbors(int node) { - - Set neighbors = new HashSet<>(); - Set edges = undirectedGraph.edgesOf(node); - for (DefaultEdge edge : edges) { - if (undirectedGraph.getEdgeSource(edge) != node) { - neighbors.add(undirectedGraph.getEdgeSource(edge)); - } - if (undirectedGraph.getEdgeTarget(edge) != node) { - neighbors.add(undirectedGraph.getEdgeTarget(edge)); - } + /** + * Computes the average degree. The degree of a node is the number of edges edges that it is + * connected with. The graph is treated as an undirected graph. Computing graph parameters + * requires touching all node pairs. Therefore, if one is called the others are computed as well + * and stored for later retrieval. + * + * @return The average degree of the graph. + */ + public double getAverageDegree() + { + if (averageDegree < 0) { // has not been initialized + logger.debug("Calling setGraphParameters"); + setGraphParameters(); + } + return averageDegree; } - return neighbors; - } + /** + * Compute the cluster coefficient of the graph (after Watts and Strogatz 1998) Cluster + * coefficient C is defined as the average of C_v over all edges. C_v is the fraction of the + * connections that exist between the neighbor nodes (k_v) of a vertex v and all allowable + * connections between the neighbors (k_v(k_v -1)/2). C_v = 2 * number of connections between / + * k_v*(k_v -1) + * + * @return The cluster coefficient. + */ + public double getClusterCoefficient() + { + if (clusterCoefficient < 0) { // has not been initialized + logger.debug("Calling setGraphParameters"); + setGraphParameters(); + } + return clusterCoefficient; + } - private void updateDegreeDistribution(int nodeDegree) { - if (degreeDistribution.containsKey(nodeDegree)) { - degreeDistribution.put(nodeDegree, (degreeDistribution.get(nodeDegree) + 1)); - } else { - degreeDistribution.put(nodeDegree, 1); + /** + * Computes the degree distribution. The degree of a node is the number of edges that it is + * connected with. The graph is treated as an undirected graph. Computing graph parameters + * requires touching all node pairs. Therefore, if one is called the others are computed as well + * and stored for later retrieval. + * + * @return A map with the degree distribution of the graph. + */ + public Map getDegreeDistribution() + { + if (degreeDistribution == null) { // has not been initialized + logger.debug("Calling setGraphParameters"); + setGraphParameters(); + } + return degreeDistribution; } - } - /** - * Computes and sets the diameter, the average degree and the average shortest path length of the graph. - * Do not call this in the constructor. May run a while. - * It is called in the getters, if parameters are not yet initialized when retrieved. - */ - private void setGraphParameters() { + /** + * Get the number of connections that exist between the neighbors of a node. + * + * @param node + * The node under consideration. + * @return The number of connections that exist between the neighbors of node. 
+ */ + private int getNumberOfNeighborConnections(int node) + { + int numberOfConnections = 0; + + // get the set of neighbors + Set neighbors = getNeighbors(node); + + if (neighbors.size() > 0) { + // for each pair of neighbors, test if there is a connection + Object[] nodeArray = neighbors.toArray(); + // sort the Array so we can use a simple iteration with two for loops to access all + // pairs + Arrays.sort(nodeArray); + + for (int i = 0; i < neighbors.size(); i++) { + int outerNode = (Integer) nodeArray[i]; + for (int j = i + 1; j < neighbors.size(); j++) { + int innerNode = (Integer) nodeArray[j]; + // in case of a connection - increade connection counter + // order of the nodes doesn't matter for undirected graphs + if (undirectedGraph.containsEdge(innerNode, outerNode)) { + numberOfConnections++; + } + } + } + } - // Diameter is the maximum of all shortest path lengths - // Average shortest path length is (as the name says) the average of the shortest path length between all node pairs + // logger.info(neighbors.size() + " - " + numberOfConnections); - double maxPathLength = 0.0; - double shortestPathLengthSum = 0.0; - double degreeSum = 0.0; - double clusterCoefficientSum = 0.0; + return numberOfConnections; + } - // iterate over all node pairs - Set nodes = undirectedGraph.vertexSet(); + /** + * Get the neighbors of a given node. The category graph is treated as an undirected graph. + * + * @param node + * the reference node. + * @return The set of category nodes that are neighbors of this category. + */ + protected Set getNeighbors(int node) + { + + Set neighbors = new HashSet<>(); + Set edges = undirectedGraph.edgesOf(node); + for (DefaultEdge edge : edges) { + if (undirectedGraph.getEdgeSource(edge) != node) { + neighbors.add(undirectedGraph.getEdgeSource(edge)); + } + if (undirectedGraph.getEdgeTarget(edge) != node) { + neighbors.add(undirectedGraph.getEdgeTarget(edge)); + } + } + return neighbors; + } - // a hashset of the nodes which have been the start node of the computation process - // for such nodes all path lengths have beeen already computed - Set wasSource = new HashSet<>(); + private void updateDegreeDistribution(int nodeDegree) + { + if (degreeDistribution.containsKey(nodeDegree)) { + degreeDistribution.put(nodeDegree, (degreeDistribution.get(nodeDegree) + 1)); + } + else { + degreeDistribution.put(nodeDegree, 1); + } + } - int progress = 0; - for (int node : nodes) { + /** + * Computes and sets the diameter, the average degree and the average shortest path length of + * the graph. Do not call this in the constructor. May run a while. It is called in the getters, + * if parameters are not yet initialized when retrieved. 
+ */ + private void setGraphParameters() + { + + // Diameter is the maximum of all shortest path lengths + // Average shortest path length is (as the name says) the average of the shortest path + // length between all node pairs + + double maxPathLength = 0.0; + double shortestPathLengthSum = 0.0; + double degreeSum = 0.0; + double clusterCoefficientSum = 0.0; + + // iterate over all node pairs + Set nodes = undirectedGraph.vertexSet(); + + // a hashset of the nodes which have been the start node of the computation process + // for such nodes all path lengths have beeen already computed + Set wasSource = new HashSet<>(); + + int progress = 0; + for (int node : nodes) { + + progress++; + ApiUtilities.printProgressInfo(progress, nodes.size(), 100, + ApiUtilities.ProgressInfoMode.TEXT, "Getting graph parameters"); + + int nodeDegree = undirectedGraph.degreeOf(node); + degreeSum += nodeDegree; + updateDegreeDistribution(nodeDegree); + + // cluster coefficient of a node is C_v is the fraction of the connections that exist + // between the neighbor nodes (k_v) of a this node and all allowable connections between + // the neighbors (k_v(k_v -1)/2) + // for degrees 0 or 1 there is no cluster coefficient, as there can be no connections + // between neighbors + if (undirectedGraph.degreeOf(node) > 1) { + double numberOfNeighborConnections = getNumberOfNeighborConnections(node); + clusterCoefficientSum += (numberOfNeighborConnections + / (nodeDegree * (nodeDegree - 1))); + } + + // Returns the new shortestPathLengthSum and the new maxPathLength. + // They are returned as an double array for performance reasons. + // I do not want to create an object, as this function is called *very* often + double[] returnValues = computeShortestPathLenghts(node, shortestPathLengthSum, + maxPathLength, wasSource); + shortestPathLengthSum = returnValues[0]; + maxPathLength = returnValues[1]; + + // save the info that the node was already used as the source of path computation + wasSource.add(node); + } - progress++; - ApiUtilities.printProgressInfo(progress, nodes.size(), 100, ApiUtilities.ProgressInfoMode.TEXT, "Getting graph parameters"); + if (nodes.size() > 1) { + this.averageShortestPathLength = shortestPathLengthSum + / (nodes.size() * (nodes.size() - 1) / 2); // sum of path lengths / (number of + // node pairs) + } + else { + this.averageShortestPathLength = 0; // there is only one node + } + this.diameter = maxPathLength; + this.averageDegree = degreeSum / nodes.size(); + this.clusterCoefficient = clusterCoefficientSum / nodes.size(); + } - int nodeDegree = undirectedGraph.degreeOf(node); - degreeSum += nodeDegree; - updateDegreeDistribution(nodeDegree); + // /** + // * Computes and sets the diameter, the average degree and the average shortest path length of + // the graph. + // * Do not call this in the constructor. May run a while. + // * It is called in the getters, if parameters are not yet initialized when retrieved. 
+ // */ + // public void setGraphParameters_slow() { + // + // // Diameter is the maximum of all shortest path lengths + // // Average shortest path length is (as the name says) the average of the shortest path length + // between all node pairs + // + // double maxDiameter = 0.0; + // double shortestPathLengthSum = 0.0; + // double degreeSum = 0.0; + // double clusterCoefficientSum = 0.0; + // + // // iterate over all node pairs + // Set nodes = undirectedGraph.vertexSet(); + // Object[] nodeArray = nodes.toArray(); + // // sort the Array so we can use a simple iteration with two for loops to access all pairs + // Arrays.sort(nodeArray); + // + // int progress = 0; + // for (int i=0; i 1) { + // clusterCoefficientSum += getNumberOfNeighborConnections(outerNode, undirectedGraph) / + // (undirectedGraph.degreeOf(outerNode) * (undirectedGraph.degreeOf(outerNode)-1)); + // } + // + // for (int j=i+1; j maxDiameter) { + // maxDiameter = pathLength; + // } + // } + // } + // + // this.averageShortestPathLength = shortestPathLengthSum / ( nodes.size() * (nodes.size()-1) / + // 2 ); // sum of path lengths / (number of node pairs) + // this.diameter = maxDiameter; + // this.averageDegree = degreeSum / nodes.size(); + // this.clusterCoefficient = clusterCoefficientSum / nodes.size(); + // } + + /** + * Computes the shortest path from node to all other nodes. Paths to nodes that have already + * been the source of the shortest path computation are omitted (the path was already added to + * the path sum). Updates the sum of shortest path lengths and the diameter of the graph. As the + * JGraphT BreadthFirstIterator does not provide information about the distance to the start + * node in each step, we will use our own BFS implementation. + * + * @param pStartNode + * The start node of the search. + * @param pShortestPathLengthSum + * The sum of the shortes path lengths. + * @param pMaxPathLength + * The maximum path length found so far. + * @param pWasSource + * A set of nodes which have been the start node of the computation process. For such + * nodes all path lengths have beeen already computed. + * @return An array of double values. The first value is the shortestPathLengthSum and the + * second value is the maxPathLength. They are returned as an double array for + * performance reasons. I do not want to create an object, as this function is called + * *very* often. 
+ */ + private double[] computeShortestPathLenghts(int pStartNode, double pShortestPathLengthSum, + double pMaxPathLength, Set pWasSource) + { + + // a set of nodes that have already been expanded -> algorithm should expand nodes + // monotonically and not go back + Set alreadyExpanded = new HashSet<>(); + + // a queue holding the newly discovered nodes with their distance to the start node + List queue = new ArrayList<>(); + + // initialize queue with start node + int[] innerList = new int[2]; + innerList[0] = pStartNode; // the node + innerList[1] = 0; // the distance to the start node + queue.add(innerList); + + // while the queue is not empty + while (!queue.isEmpty()) { + // remove first element from queue + int[] queueElement = queue.get(0); + int currentNode = queueElement[0]; + int distance = queueElement[1]; + queue.remove(0); + + // if the node was not already expanded + if (!alreadyExpanded.contains(currentNode)) { + // the node gets expanded now + alreadyExpanded.add(currentNode); + + // if the node was a source node in a previous run, we already have added this path + if (!pWasSource.contains(currentNode)) { + // add the distance of this node to shortestPathLengthSum + // check if maxPathLength must be updated + pShortestPathLengthSum += distance; + if (distance > pMaxPathLength) { + pMaxPathLength = distance; + } + } + // even if the node was a source node in a previous run there can be a path to other + // nodes over this node, so go on + + // get the neighbors of the queue element + Set neighbors = getNeighbors(currentNode); + + // iterate over all neighbors + for (int neighbor : neighbors) { + // if the node was not already expanded + if (!alreadyExpanded.contains(neighbor)) { + // add the node to the queue, increase node distance by one + int[] tmpList = new int[2]; + tmpList[0] = neighbor; + tmpList[1] = (distance + 1); + queue.add(tmpList); + } + } + } + } + double[] returnArray = { pShortestPathLengthSum, pMaxPathLength }; + return returnArray; + } + /** + * This parameter is already set in the constructor as it is needed for computation of + * relatedness values. Therefore its computation does not trigger setGraphParameters (it is too + * slow), even if the depth is implicitly determined there, too. + * + * @return The depth of the category graph, i.e. the maximum path length starting with the root + * node. + * @throws WikiApiException + * Thrown if errors occurred. + */ + public double getDepth() throws WikiApiException + { + if (depth < 0) { // has not been initialized + if (rootPathMap != null) { + this.depth = getDepthFromRootPathMap(); + logger.info("Getting depth from RootPathMap: " + this.depth); + + } + else { + depth = computeDepth(); + logger.info("Computing depth of the hierarchy: " + this.depth); + } + } + return depth; + } - // cluster coefficient of a node is C_v is the fraction of the connections that exist between the neighbor nodes (k_v) of a this node and all allowable connections between the neighbors (k_v(k_v -1)/2) - // for degrees 0 or 1 there is no cluster coefficient, as there can be no connections between neighbors - if (undirectedGraph.degreeOf(node) > 1) { - double numberOfNeighborConnections = getNumberOfNeighborConnections(node); - clusterCoefficientSum += (numberOfNeighborConnections / (nodeDegree * (nodeDegree - 1))); - } + /** + * This parameter is already set in the constructor as it is needed for computation of + * relatedness values. 
Therefore its computation does not trigger setGraphParameters (it is too + * slow), even if the depth is implicitly determined there, too. + * + * @return The depth of the category graph, i.e. the maximum path length starting with the root + * node. + * @throws WikiApiException + * Thrown if errors occurred. + */ + private double getDepthFromRootPathMap() throws WikiApiException + { + int max = 0; + for (List path : getRootPathMap().values()) { + if (path.size() > max) { + max = path.size(); + } + } - // Returns the new shortestPathLengthSum and the new maxPathLength. - // They are returned as an double array for performance reasons. - // I do not want to create an object, as this function is called *very* often - double[] returnValues = computeShortestPathLenghts(node, shortestPathLengthSum, maxPathLength, wasSource); - shortestPathLengthSum = returnValues[0]; - maxPathLength = returnValues[1]; + max = max - 1; // depth is measured in nodes, not edges - // save the info that the node was already used as the source of path computation - wasSource.add(node); + if (max < 0) { + return 0; + } + else { + return max; + } } - if (nodes.size() > 1) { - this.averageShortestPathLength = shortestPathLengthSum / (nodes.size() * (nodes.size() - 1) / 2); // sum of path lengths / (number of node pairs) - } else { - this.averageShortestPathLength = 0; // there is only one node - } - this.diameter = maxPathLength; - this.averageDegree = degreeSum / nodes.size(); - this.clusterCoefficient = clusterCoefficientSum / nodes.size(); - } - -// /** -// * Computes and sets the diameter, the average degree and the average shortest path length of the graph. -// * Do not call this in the constructor. May run a while. -// * It is called in the getters, if parameters are not yet initialized when retrieved. -// */ -// public void setGraphParameters_slow() { -// -// // Diameter is the maximum of all shortest path lengths -// // Average shortest path length is (as the name says) the average of the shortest path length between all node pairs -// -// double maxDiameter = 0.0; -// double shortestPathLengthSum = 0.0; -// double degreeSum = 0.0; -// double clusterCoefficientSum = 0.0; -// -// // iterate over all node pairs -// Set nodes = undirectedGraph.vertexSet(); -// Object[] nodeArray = nodes.toArray(); -// // sort the Array so we can use a simple iteration with two for loops to access all pairs -// Arrays.sort(nodeArray); -// -// int progress = 0; -// for (int i=0; i 1) { -// clusterCoefficientSum += getNumberOfNeighborConnections(outerNode, undirectedGraph) / (undirectedGraph.degreeOf(outerNode) * (undirectedGraph.degreeOf(outerNode)-1)); -// } -// -// for (int j=i+1; j maxDiameter) { -// maxDiameter = pathLength; -// } -// } -// } -// -// this.averageShortestPathLength = shortestPathLengthSum / ( nodes.size() * (nodes.size()-1) / 2 ); // sum of path lengths / (number of node pairs) -// this.diameter = maxDiameter; -// this.averageDegree = degreeSum / nodes.size(); -// this.clusterCoefficient = clusterCoefficientSum / nodes.size(); -// } - - /** - * Computes the shortest path from node to all other nodes. - * Paths to nodes that have already been the source of the shortest path computation - * are omitted (the path was already added to the path sum). - * Updates the sum of shortest path lengths and the diameter of the graph. - * As the JGraphT BreadthFirstIterator does not provide information about - * the distance to the start node in each step, we will use our own BFS implementation. 
- * - * @param pStartNode The start node of the search. - * @param pShortestPathLengthSum The sum of the shortes path lengths. - * @param pMaxPathLength The maximum path length found so far. - * @param pWasSource A set of nodes which have been the start node of the computation process. For such nodes all path lengths have beeen already computed. - * @return An array of double values. - * The first value is the shortestPathLengthSum and the second value is the maxPathLength. - * They are returned as an double array for performance reasons. - * I do not want to create an object, as this function is called *very* often. - */ - private double[] computeShortestPathLenghts(int pStartNode, double pShortestPathLengthSum, double pMaxPathLength, Set pWasSource) { - - // a set of nodes that have already been expanded -> algorithm should expand nodes monotonically and not go back - Set alreadyExpanded = new HashSet<>(); - - // a queue holding the newly discovered nodes with their distance to the start node - List queue = new ArrayList<>(); - - // initialize queue with start node - int[] innerList = new int[2]; - innerList[0] = pStartNode; // the node - innerList[1] = 0; // the distance to the start node - queue.add(innerList); - - // while the queue is not empty - while (!queue.isEmpty()) { - // remove first element from queue - int[] queueElement = queue.get(0); - int currentNode = queueElement[0]; - int distance = queueElement[1]; - queue.remove(0); - - // if the node was not already expanded - if (!alreadyExpanded.contains(currentNode)) { - // the node gets expanded now - alreadyExpanded.add(currentNode); - - // if the node was a source node in a previous run, we already have added this path - if (!pWasSource.contains(currentNode)) { - // add the distance of this node to shortestPathLengthSum - // check if maxPathLength must be updated - pShortestPathLengthSum += distance; - if (distance > pMaxPathLength) { - pMaxPathLength = distance; - } - } - // even if the node was a source node in a previous run there can be a path to other nodes over this node, so go on - - // get the neighbors of the queue element - Set neighbors = getNeighbors(currentNode); - - // iterate over all neighbors - for (int neighbor : neighbors) { - // if the node was not already expanded - if (!alreadyExpanded.contains(neighbor)) { - // add the node to the queue, increase node distance by one - int[] tmpList = new int[2]; - tmpList[0] = neighbor; - tmpList[1] = (distance + 1); - queue.add(tmpList); - } - } - } + /** + * Computes the depth of the category graph, i.e. the maximum path length starting with the root + * node. + * + * @return The depth of the hierarchy. + * @throws WikiApiException + * Thrown if errors occurred. + */ + private double computeDepth() throws WikiApiException + { + Category root = wiki.getMetaData().getMainCategory(); + if (root == null) { + logger.error( + "There is no root node for this wiki. Check the parameter that provides the name of the root node."); + return 0.0; + } + // test whether the root category is in this graph + if (!graph.containsVertex(root.getPageId())) { + logger.error( + "The root node is not part of this graph. Cannot compute depth of this graph. 
Setting depth to 0.0"); + return 0.0; + } + double maxPathLength = 0.0; + double[] returnValues = computeShortestPathLenghts(root.getPageId(), 0.0, maxPathLength, + new HashSet<>()); + maxPathLength = returnValues[1]; + return maxPathLength; } - double[] returnArray = {pShortestPathLengthSum, pMaxPathLength}; - return returnArray; - } - - /** - * This parameter is already set in the constructor as it is needed for computation of relatedness values. - * Therefore its computation does not trigger setGraphParameters (it is too slow), even if the depth is implicitly determined there, too. - * - * @return The depth of the category graph, i.e. the maximum path length starting with the root node. - * @throws WikiApiException Thrown if errors occurred. - */ - public double getDepth() throws WikiApiException { - if (depth < 0) { // has not been initialized - if (rootPathMap != null) { - this.depth = getDepthFromRootPathMap(); - logger.info("Getting depth from RootPathMap: " + this.depth); - } else { - depth = computeDepth(); - logger.info("Computing depth of the hierarchy: " + this.depth); - } - } - return depth; - } - - /** - * This parameter is already set in the constructor as it is needed for computation of relatedness values. - * Therefore its computation does not trigger setGraphParameters (it is too slow), even if the depth is implicitly determined there, too. - * - * @return The depth of the category graph, i.e. the maximum path length starting with the root node. - * @throws WikiApiException Thrown if errors occurred. - */ - private double getDepthFromRootPathMap() throws WikiApiException { - int max = 0; - for (List path : getRootPathMap().values()) { - if (path.size() > max) { - max = path.size(); - } + public String getGraphInfo() + { + StringBuffer sb = new StringBuffer(1000); + Map degreeDistribution = getDegreeDistribution(); + + sb.append("Number of Nodes: " + getNumberOfNodes() + LF); + sb.append("Number of Edges: " + getNumberOfEdges() + LF); + sb.append("Avg. path length: " + getAverageShortestPathLength() + LF); + sb.append("Diameter: " + getDiameter() + LF); + sb.append("Average degree: " + getAverageDegree() + LF); + sb.append("Cluster coefficient: " + getClusterCoefficient() + LF); + sb.append( + "Degree distribution: " + CommonUtilities.getMapContents(degreeDistribution) + LF); + + return sb.toString(); } - max = max - 1; // depth is measured in nodes, not edges - - if (max < 0) { - return 0; - } else { - return max; - } - } - - /** - * Computes the depth of the category graph, i.e. the maximum path length starting with the root node. - * - * @return The depth of the hierarchy. - * @throws WikiApiException Thrown if errors occurred. - */ - private double computeDepth() throws WikiApiException { - Category root = wiki.getMetaData().getMainCategory(); - if (root == null) { - logger.error("There is no root node for this wiki. Check the parameter that provides the name of the root node."); - return 0.0; + /** + * @return Returns the graph. + */ + public DefaultDirectedGraph getGraph() + { + return graph; } - // test whether the root category is in this graph - if (!graph.containsVertex(root.getPageId())) { - logger.error("The root node is not part of this graph. Cannot compute depth of this graph. 
Setting depth to 0.0"); - return 0.0; + + public AsUndirectedGraph getUndirectedGraph() + { + return undirectedGraph; } - double maxPathLength = 0.0; - double[] returnValues = computeShortestPathLenghts(root.getPageId(), 0.0, maxPathLength, new HashSet<>()); - maxPathLength = returnValues[1]; - return maxPathLength; - } - - public String getGraphInfo() { - StringBuffer sb = new StringBuffer(1000); - Map degreeDistribution = getDegreeDistribution(); - - sb.append("Number of Nodes: " + getNumberOfNodes() + LF); - sb.append("Number of Edges: " + getNumberOfEdges() + LF); - sb.append("Avg. path length: " + getAverageShortestPathLength() + LF); - sb.append("Diameter: " + getDiameter() + LF); - sb.append("Average degree: " + getAverageDegree() + LF); - sb.append("Cluster coefficient: " + getClusterCoefficient() + LF); - sb.append("Degree distribution: " + CommonUtilities.getMapContents(degreeDistribution) + LF); - - return sb.toString(); - } - - /** - * @return Returns the graph. - */ - public DefaultDirectedGraph getGraph() { - return graph; - } - - public AsUndirectedGraph getUndirectedGraph() { - return undirectedGraph; - } - - public Map getHyponymCountMap() throws WikiApiException { - if (hyponymCountMap == null) { - createHyponymCountMap(); + + public Map getHyponymCountMap() throws WikiApiException + { + if (hyponymCountMap == null) { + createHyponymCountMap(); + } + return this.hyponymCountMap; } - return this.hyponymCountMap; - } - public Map> getRootPathMap() throws WikiApiException { - if (rootPathMap == null) { - createRootPathMap(); + public Map> getRootPathMap() throws WikiApiException + { + if (rootPathMap == null) { + createRootPathMap(); + } + return this.rootPathMap; } - return this.rootPathMap; - } - - /** - * Serialize a Map. - * - * @param map The map to serialize. - * @param file The file for saving the map. - */ - private void serializeMap(Map map, File file) { - try (ObjectOutputStream os = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(file)))) { - os.writeObject(map); - } catch (Exception e) { - logger.error(e.getLocalizedMessage(), e); + + /** + * Serialize a Map. + * + * @param map + * The map to serialize. + * @param file + * The file for saving the map. + */ + private void serializeMap(Map map, File file) + { + try (ObjectOutputStream os = new ObjectOutputStream( + new BufferedOutputStream(new FileOutputStream(file)))) { + os.writeObject(map); + } + catch (Exception e) { + logger.error(e.getLocalizedMessage(), e); + } } - } - - /** - * Deserialize a map - * - * @param file The file with the map. - */ - private Map deserializeMap(File file) { - Map map; - try (ObjectInputStream is = new ObjectInputStream(new BufferedInputStream(new FileInputStream(file)))) { - map = (Map) is.readObject(); - } catch (Exception e) { - logger.error(e.getLocalizedMessage(), e); - return null; + + /** + * Deserialize a map + * + * @param file + * The file with the map. + */ + private Map deserializeMap(File file) + { + Map map; + try (ObjectInputStream is = new ObjectInputStream( + new BufferedInputStream(new FileInputStream(file)))) { + map = (Map) is.readObject(); + } + catch (Exception e) { + logger.error(e.getLocalizedMessage(), e); + return null; + } + return map; } - return map; - } - - /** - * Serializes the graph to the given destination. - * - * @param destination The destination to which should be saved. - * @throws WikiApiException Thrown if errors occurred. - */ - // TODO should be refactored a bit. 
- public void saveGraph(String destination) throws WikiApiException { - try { - GraphSerialization.saveGraph(graph, destination); - } catch (IOException e) { - throw new WikiApiException(e); + + /** + * Serializes the graph to the given destination. + * + * @param destination + * The destination to which should be saved. + * @throws WikiApiException + * Thrown if errors occurred. + */ + // TODO should be refactored a bit. + public void saveGraph(String destination) throws WikiApiException + { + try { + GraphSerialization.saveGraph(graph, destination); + } + catch (IOException e) { + throw new WikiApiException(e); + } } - } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraphManager.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraphManager.java index 81e04c34..4d4970e9 100755 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraphManager.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryGraphManager.java @@ -32,94 +32,113 @@ // TODO category graph manager implements real singletons for category graphs // up to now, it is only used in LSR // There should be no way to construct a category graph that circumvents the manager. -public class CategoryGraphManager { +public class CategoryGraphManager +{ - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); - private static Map catGraphMap; + private static Map catGraphMap; - private final static String catGraphSerializationFilename = "catGraphSer"; + private final static String catGraphSerializationFilename = "catGraphSer"; - public static CategoryGraph getCategoryGraph(Wikipedia wiki) throws WikiApiException { - return getCategoryGraph(wiki, null, true); - } - - public static CategoryGraph getCategoryGraph(Wikipedia wiki, boolean serialize) throws WikiApiException { - return getCategoryGraph(wiki, null, serialize); - } - - public static CategoryGraph getCategoryGraph(Wikipedia wiki, Set pageIds) throws WikiApiException { - return getCategoryGraph(wiki, pageIds, true); - } - - public static CategoryGraph getCategoryGraph(Wikipedia wiki, Set pageIds, boolean serialize) throws WikiApiException { - if (catGraphMap == null) { - catGraphMap = new HashMap<>(); + public static CategoryGraph getCategoryGraph(Wikipedia wiki) throws WikiApiException + { + return getCategoryGraph(wiki, null, true); } - String wikiID = wiki.getWikipediaId(); - if (catGraphMap.containsKey(wikiID)) { - return catGraphMap.get(wikiID); + public static CategoryGraph getCategoryGraph(Wikipedia wiki, boolean serialize) + throws WikiApiException + { + return getCategoryGraph(wiki, null, serialize); } - String size = ""; - if (pageIds != null) { - size = Integer.valueOf(pageIds.size()).toString(); + public static CategoryGraph getCategoryGraph(Wikipedia wiki, Set pageIds) + throws WikiApiException + { + return getCategoryGraph(wiki, pageIds, true); } - CategoryGraph catGraph; - if (serialize) { - catGraph = tryToLoadCategoryGraph(wiki, wikiID, size); - if (catGraph != null) { + public static CategoryGraph getCategoryGraph(Wikipedia wiki, Set pageIds, + boolean serialize) + throws WikiApiException + { + if (catGraphMap == null) { + catGraphMap = new HashMap<>(); + } + + String wikiID = wiki.getWikipediaId(); + if (catGraphMap.containsKey(wikiID)) { + return catGraphMap.get(wikiID); + } + + String size = ""; + if (pageIds != null) { + size = 
Integer.valueOf(pageIds.size()).toString(); + } + + CategoryGraph catGraph; + if (serialize) { + catGraph = tryToLoadCategoryGraph(wiki, wikiID, size); + if (catGraph != null) { + catGraphMap.put(wikiID, catGraph); + return catGraph; + } + } + + // could not be loaded (= no serialized category graph was written so far) => create it + if (pageIds != null) { + catGraph = new CategoryGraph(wiki, pageIds); + } + else { + catGraph = new CategoryGraph(wiki); + } + catGraphMap.put(wikiID, catGraph); - return catGraph; - } - } + if (serialize) { + saveCategoryGraph(catGraph, wikiID, size); + } - // could not be loaded (= no serialized category graph was written so far) => create it - if (pageIds != null) { - catGraph = new CategoryGraph(wiki, pageIds); - } else { - catGraph = new CategoryGraph(wiki); + return catGraph; } - catGraphMap.put(wikiID, catGraph); - - if (serialize) { - saveCategoryGraph(catGraph, wikiID, size); + private static CategoryGraph tryToLoadCategoryGraph(Wikipedia wiki, String wikiId, String size) + throws WikiApiException + { + + String defaultSerializedGraphLocation = getCategoryGraphSerializationFileName(wikiId, size); + File defaulSerializedGraphFile = new File(defaultSerializedGraphLocation); + if (defaulSerializedGraphFile.exists()) { + try { + logger.info("Loading category graph from " + defaultSerializedGraphLocation); + return new CategoryGraph(wiki, + GraphSerialization.loadGraph(defaultSerializedGraphLocation)); + } + catch (IOException | ClassNotFoundException e) { + throw new WikiApiException(e); + } + } + else { + return null; + } } - return catGraph; - } - - private static CategoryGraph tryToLoadCategoryGraph(Wikipedia wiki, String wikiId, String size) throws WikiApiException { - - String defaultSerializedGraphLocation = getCategoryGraphSerializationFileName(wikiId, size); - File defaulSerializedGraphFile = new File(defaultSerializedGraphLocation); - if (defaulSerializedGraphFile.exists()) { - try { - logger.info("Loading category graph from " + defaultSerializedGraphLocation); - return new CategoryGraph(wiki, GraphSerialization.loadGraph(defaultSerializedGraphLocation)); - } catch (IOException | ClassNotFoundException e) { - throw new WikiApiException(e); - } - } else { - return null; + private static void saveCategoryGraph(CategoryGraph catGraph, String wikiId, String size) + throws WikiApiException + { + String defaultSerializedGraphLocation = getCategoryGraphSerializationFileName(wikiId, size); + try { + logger.info("Saving category graph to " + defaultSerializedGraphLocation); + GraphSerialization.saveGraph(catGraph.getGraph(), defaultSerializedGraphLocation); + } + catch (IOException e) { + throw new WikiApiException(e); + } } - } - - private static void saveCategoryGraph(CategoryGraph catGraph, String wikiId, String size) throws WikiApiException { - String defaultSerializedGraphLocation = getCategoryGraphSerializationFileName(wikiId, size); - try { - logger.info("Saving category graph to " + defaultSerializedGraphLocation); - GraphSerialization.saveGraph(catGraph.getGraph(), defaultSerializedGraphLocation); - } catch (IOException e) { - throw new WikiApiException(e); - } - } - private static String getCategoryGraphSerializationFileName(String wikiId, String size) { - return catGraphSerializationFilename + "_" + wikiId + size; - } + private static String getCategoryGraphSerializationFileName(String wikiId, String size) + { + return catGraphSerializationFilename + "_" + wikiId + size; + } } diff --git 
a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterable.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterable.java index 220d618c..aec0c532 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterable.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterable.java @@ -22,32 +22,33 @@ /** * An {@link Iterable} over {@link Category} objects. */ -public class CategoryIterable implements Iterable<Category> { - - private final Wikipedia wiki; - - /* - * The size of the page buffer. - * With bufferSize = 1, a database connection is needed for retrieving a single article. - * Higher bufferSize gives better performance, but needs memory. - * Initialize it with 500. - */ - private int bufferSize = 500; - - public CategoryIterable(Wikipedia wiki) { - this.wiki = wiki; - } - - public CategoryIterable(Wikipedia wiki, int bufferSize) { - this.wiki = wiki; - this.bufferSize = bufferSize; - } - - @Override - public Iterator<Category> iterator() { - return new CategoryIterator(wiki, bufferSize); - } +public class CategoryIterable + implements Iterable<Category> +{ + + private final Wikipedia wiki; + + /* + * The size of the page buffer. With bufferSize = 1, a database connection is needed for + * retrieving a single article. Higher bufferSize gives better performance, but needs memory. + * Initialize it with 500. + */ + private int bufferSize = 500; + + public CategoryIterable(Wikipedia wiki) + { + this.wiki = wiki; + } + + public CategoryIterable(Wikipedia wiki, int bufferSize) + { + this.wiki = wiki; + this.bufferSize = bufferSize; + } + + @Override + public Iterator<Category> iterator() + { + return new CategoryIterator(wiki, bufferSize); + } } - - - diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterator.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterator.java index c4c92fb6..bede8d44 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterator.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryIterator.java @@ -30,131 +30,148 @@ /** * An {@link Iterator} over {@link Category} objects. */ -public class CategoryIterator implements Iterator { - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private final CategoryBuffer buffer; - - public CategoryIterator(Wikipedia wiki, int bufferSize) { - buffer = new CategoryBuffer(bufferSize, wiki); - } - - @Override - public boolean hasNext() { - return buffer.hasNext(); - } - - @Override - public Category next() { - return buffer.next(); - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - - /** - * Buffers categories in a list. - */ - static class CategoryBuffer { - - private final Wikipedia wiki; - - private final List buffer; - private final int maxBufferSize; // the number of pages to be buffered after a query to the database.
- private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements - private int bufferOffset; // the offset in the buffer - private int dataOffset; // the overall offset in the data - - public CategoryBuffer(int bufferSize, Wikipedia wiki) { - this.maxBufferSize = bufferSize; - this.wiki = wiki; - this.buffer = new ArrayList<>(); - this.bufferFillSize = 0; - this.bufferOffset = 0; - this.dataOffset = 0; - //TODO test whether this works when zero pages are retrieved +public class CategoryIterator + implements Iterator +{ + + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + private final CategoryBuffer buffer; + + public CategoryIterator(Wikipedia wiki, int bufferSize) + { + buffer = new CategoryBuffer(bufferSize, wiki); } - /** - * If there are elements in the buffer left, then return true. - * If the end of the filled buffer is reached, then try to load new buffer. - * - * @return True, if there are pages left. False otherwise. - */ - public boolean hasNext() { - if (bufferOffset < bufferFillSize) { - return true; - } else { - return this.fillBuffer(); - } + @Override + public boolean hasNext() + { + return buffer.hasNext(); } - /** - * @return The next Category or null if no more categories are available. - */ - public Category next() { - // if there are still elements in the buffer, just retrieve the next one - if (bufferOffset < bufferFillSize) { - return this.getBufferElement(); - } - // if there are no more elements => try to fill a new buffer - else if (this.fillBuffer()) { - return this.getBufferElement(); - } else { - // if it cannot be filled => return null - return null; - } + @Override + public Category next() + { + return buffer.next(); } - private Category getBufferElement() { - Category cat = buffer.get(bufferOffset); - bufferOffset++; - dataOffset++; - return cat; + @Override + public void remove() + { + throw new UnsupportedOperationException(); } - private boolean fillBuffer() { - - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - final String sql = "SELECT c FROM Category c"; - List returnValues = - session.createQuery(sql, org.dkpro.jwpl.api.hibernate.Category.class) - .setFirstResult(dataOffset) - .setMaxResults(maxBufferSize) - .setFetchSize(maxBufferSize) - .list(); - session.getTransaction().commit(); - - // clear the old buffer and all variables regarding the state of the buffer - buffer.clear(); - bufferOffset = 0; - bufferFillSize = 0; - - Category apiCategory; - for (org.dkpro.jwpl.api.hibernate.Category o : returnValues) { - if (o == null) { - return false; - } else { - long id = o.getId(); - try { - apiCategory = new Category(this.wiki, id); - buffer.add(apiCategory); - } catch (WikiApiException e) { - logger.error("Page with hibernateID {} not found.", id, e); - } + /** + * Buffers categories in a list. + */ + static class CategoryBuffer + { + + private final Wikipedia wiki; + + private final List buffer; + private final int maxBufferSize; // the number of pages to be buffered after a query to the + // database. 
+ private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements + private int bufferOffset; // the offset in the buffer + private int dataOffset; // the overall offset in the data + + public CategoryBuffer(int bufferSize, Wikipedia wiki) + { + this.maxBufferSize = bufferSize; + this.wiki = wiki; + this.buffer = new ArrayList<>(); + this.bufferFillSize = 0; + this.bufferOffset = 0; + this.dataOffset = 0; + // TODO test whether this works when zero pages are retrieved } - } - if (buffer.size() > 0) { - bufferFillSize = buffer.size(); - return true; - } else { - return false; - } - } - } + /** + * If there are elements in the buffer left, then return true. If the end of the filled + * buffer is reached, then try to load new buffer. + * + * @return True, if there are pages left. False otherwise. + */ + public boolean hasNext() + { + if (bufferOffset < bufferFillSize) { + return true; + } + else { + return this.fillBuffer(); + } + } + + /** + * @return The next Category or null if no more categories are available. + */ + public Category next() + { + // if there are still elements in the buffer, just retrieve the next one + if (bufferOffset < bufferFillSize) { + return this.getBufferElement(); + } + // if there are no more elements => try to fill a new buffer + else if (this.fillBuffer()) { + return this.getBufferElement(); + } + else { + // if it cannot be filled => return null + return null; + } + } + + private Category getBufferElement() + { + Category cat = buffer.get(bufferOffset); + bufferOffset++; + dataOffset++; + return cat; + } + + private boolean fillBuffer() + { + + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + final String sql = "SELECT c FROM Category c"; + List returnValues = session + .createQuery(sql, org.dkpro.jwpl.api.hibernate.Category.class) + .setFirstResult(dataOffset).setMaxResults(maxBufferSize) + .setFetchSize(maxBufferSize).list(); + session.getTransaction().commit(); + + // clear the old buffer and all variables regarding the state of the buffer + buffer.clear(); + bufferOffset = 0; + bufferFillSize = 0; + + Category apiCategory; + for (org.dkpro.jwpl.api.hibernate.Category o : returnValues) { + if (o == null) { + return false; + } + else { + long id = o.getId(); + try { + apiCategory = new Category(this.wiki, id); + buffer.add(apiCategory); + } + catch (WikiApiException e) { + logger.error("Page with hibernateID {} not found.", id, e); + } + } + } + if (buffer.size() > 0) { + bufferFillSize = buffer.size(); + return true; + } + else { + return false; + } + } + + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryTitleComparator.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryTitleComparator.java index 6f2adf37..fd8bdef0 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryTitleComparator.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CategoryTitleComparator.java @@ -24,17 +24,21 @@ /** * Compares two {@link Category categories} based on the lexicographic ordering of their titles. 
*/ -public class CategoryTitleComparator implements Comparator { +public class CategoryTitleComparator + implements Comparator +{ - @Override - public int compare(Category o1, Category o2) { + @Override + public int compare(Category o1, Category o2) + { - int retVal = 0; - try { - retVal = o1.getTitle().getPlainTitle().compareTo(o2.getTitle().getPlainTitle()); - } catch (WikiTitleParsingException e) { - e.printStackTrace(); + int retVal = 0; + try { + retVal = o1.getTitle().getPlainTitle().compareTo(o2.getTitle().getPlainTitle()); + } + catch (WikiTitleParsingException e) { + e.printStackTrace(); + } + return retVal; } - return retVal; - } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CycleHandler.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CycleHandler.java index a60eff34..11e96031 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CycleHandler.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/CycleHandler.java @@ -30,98 +30,114 @@ /** * Methods for handling cycles in the category graph. */ -public class CycleHandler { - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - final Wikipedia wiki; - final CategoryGraph categoryGraph; - - private enum Color {white, grey, black} - - private Map colorMap; - - /** - * Creates a cycle handler object. - * - * @param wiki The {@link Wikipedia} object to use. - * @param categoryGraph The category graph in which cycles should be handled. - */ - public CycleHandler(Wikipedia wiki, CategoryGraph categoryGraph) { - this.wiki = wiki; - this.categoryGraph = categoryGraph; - } - - /** - * The JGraphT cycle detection seems not to find all cycles. Thus, I wrote my own cycle detection. - * It is a colored DFS and should find all (viscious :) cycles. - * - * @return True, if the graph contains a cycle. - * @throws WikiApiException Thrown if errors occurred. - */ - public boolean containsCycle() throws WikiApiException { - DefaultEdge edge = findCycle(); - if (edge != null) { - Category sourceCat = wiki.getCategory(categoryGraph.getGraph().getEdgeSource(edge)); - Category targetCat = wiki.getCategory(categoryGraph.getGraph().getEdgeTarget(edge)); - - logger.info("Cycle: " + sourceCat.getTitle() + " - " + targetCat.getTitle()); - return true; - } else { - return false; +public class CycleHandler +{ + + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + final Wikipedia wiki; + final CategoryGraph categoryGraph; + + private enum Color + { + white, grey, black } - } - - /** - * Removes cycles from the graph that was used to construct the cycle handler. - * - * @throws WikiApiException Thrown if errors occurred. - */ - public void removeCycles() throws WikiApiException { - DefaultEdge edge; - while ((edge = findCycle()) != null) { - Category sourceCat = wiki.getCategory(categoryGraph.getGraph().getEdgeSource(edge)); - Category targetCat = wiki.getCategory(categoryGraph.getGraph().getEdgeTarget(edge)); - - logger.info("Removing cycle: " + sourceCat.getTitle() + " - " + targetCat.getTitle()); - - categoryGraph.getGraph().removeEdge(edge); + + private Map colorMap; + + /** + * Creates a cycle handler object. + * + * @param wiki + * The {@link Wikipedia} object to use. + * @param categoryGraph + * The category graph in which cycles should be handled. 
+ */ + public CycleHandler(Wikipedia wiki, CategoryGraph categoryGraph) + { + this.wiki = wiki; + this.categoryGraph = categoryGraph; + } + + /** + * The JGraphT cycle detection seems not to find all cycles. Thus, I wrote my own cycle + * detection. It is a colored DFS and should find all (viscious :) cycles. + * + * @return True, if the graph contains a cycle. + * @throws WikiApiException + * Thrown if errors occurred. + */ + public boolean containsCycle() throws WikiApiException + { + DefaultEdge edge = findCycle(); + if (edge != null) { + Category sourceCat = wiki.getCategory(categoryGraph.getGraph().getEdgeSource(edge)); + Category targetCat = wiki.getCategory(categoryGraph.getGraph().getEdgeTarget(edge)); + + logger.info("Cycle: " + sourceCat.getTitle() + " - " + targetCat.getTitle()); + return true; + } + else { + return false; + } } - } - private DefaultEdge findCycle() { - colorMap = new HashMap<>(); - // initialize all nodes with white - for (int node : categoryGraph.getGraph().vertexSet()) { - colorMap.put(node, Color.white); + /** + * Removes cycles from the graph that was used to construct the cycle handler. + * + * @throws WikiApiException + * Thrown if errors occurred. + */ + public void removeCycles() throws WikiApiException + { + DefaultEdge edge; + while ((edge = findCycle()) != null) { + Category sourceCat = wiki.getCategory(categoryGraph.getGraph().getEdgeSource(edge)); + Category targetCat = wiki.getCategory(categoryGraph.getGraph().getEdgeTarget(edge)); + + logger.info("Removing cycle: " + sourceCat.getTitle() + " - " + targetCat.getTitle()); + + categoryGraph.getGraph().removeEdge(edge); + } } - for (int node : categoryGraph.getGraph().vertexSet()) { - if (colorMap.get(node).equals(Color.white)) { - DefaultEdge e = visit(node); - if (e != null) { - return e; + private DefaultEdge findCycle() + { + colorMap = new HashMap<>(); + // initialize all nodes with white + for (int node : categoryGraph.getGraph().vertexSet()) { + colorMap.put(node, Color.white); } - } + + for (int node : categoryGraph.getGraph().vertexSet()) { + if (colorMap.get(node).equals(Color.white)) { + DefaultEdge e = visit(node); + if (e != null) { + return e; + } + } + } + return null; } - return null; - } - - private DefaultEdge visit(int node) { - colorMap.put(node, Color.grey); - Set outgoingEdges = categoryGraph.getGraph().outgoingEdgesOf(node); - for (DefaultEdge edge : outgoingEdges) { - int outNode = categoryGraph.getGraph().getEdgeTarget(edge); - if (colorMap.get(outNode).equals(Color.grey)) { - return edge; - } else if (colorMap.get(outNode).equals(Color.white)) { - DefaultEdge e = visit(outNode); - if (e != null) { - return e; + + private DefaultEdge visit(int node) + { + colorMap.put(node, Color.grey); + Set outgoingEdges = categoryGraph.getGraph().outgoingEdgesOf(node); + for (DefaultEdge edge : outgoingEdges) { + int outNode = categoryGraph.getGraph().getEdgeTarget(edge); + if (colorMap.get(outNode).equals(Color.grey)) { + return edge; + } + else if (colorMap.get(outNode).equals(Color.white)) { + DefaultEdge e = visit(outNode); + if (e != null) { + return e; + } + } } - } + colorMap.put(node, Color.black); + return null; } - colorMap.put(node, Color.black); - return null; - } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/DatabaseConfiguration.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/DatabaseConfiguration.java index 2e7d01f8..345bb29c 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/DatabaseConfiguration.java +++ 
b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/DatabaseConfiguration.java @@ -18,166 +18,208 @@ package org.dkpro.jwpl.api; /** - * A {@link DatabaseConfiguration} is used to establish a database connection and set various parameters. + * A {@link DatabaseConfiguration} is used to establish a database connection and set various + * parameters. */ -public class DatabaseConfiguration { - - private String host; - private String database; - private String user; - private String password; - private WikiConstants.Language language; - private String jdbcURL; - private String databaseDriver; - - public DatabaseConfiguration() { - } - - /** - * A constructor for MySQL backends, i.e. the default production setting. - * - * @param host The hostname the machine the database is hosted on. - * @param database The name of the database to connect to. - * @param user The username as part of the credentials used for authentication. - * @param password The password as part of the credentials used for authentication. - * @param language The {@link WikiConstants.Language} used for the underlying connection. - */ - public DatabaseConfiguration(String host, String database, String user, String password, WikiConstants.Language language) { - - this("com.mysql.jdbc.Driver", "jdbc:mysql://" + host + "/" + database, - host, database, user, password, language); - } - - /** - * A constructor for an explicit DBMS specific configuration. - * - * @param databaseDriver The fully qualified name of the JDBC driver. - * @param jdbcURL A valid JDBC url used to open connections. - * @param host The hostname the machine the database is hosted on. - * @param database The name of the database to connect to. - * @param user The username as part of the credentials used for authentication. - * @param password The password as part of the credentials used for authentication. - * @param language The {@link WikiConstants.Language} used for the underlying connection. - */ - public DatabaseConfiguration(String databaseDriver, String jdbcURL, String host, String database, String user, - String password, WikiConstants.Language language) { - this.host = host; - this.database = database; - this.user = user; - this.password = password; - this.language = language; - - this.setDatabaseDriver(databaseDriver); - this.setJdbcURL(jdbcURL); - } - - /** - * @return {@code True} if collation is supported by the database backend, else {@code false}. - */ - boolean supportsCollation() { - if (databaseDriver != null) { - return databaseDriver.contains("mysql") || databaseDriver.contains("mariadb"); - } else { - return false; - } - } - - /** - * @param database The name of the database. - */ - public void setDatabase(String database) { - this.database = database; - } - - /** - * @param host The host where the database is running. Set to "localhost", if the database is running locally. - */ - public void setHost(String host) { - this.host = host; - } - - /** - * @param password The password to access the database. - */ - public void setPassword(String password) { - this.password = password; - } - - /** - * @param user The database user. - */ - public void setUser(String user) { - this.user = user; - } - - /** - * @param language The language of the Wikipedia data. - */ - public void setLanguage(WikiConstants.Language language) { - this.language = language; - } - - /** - * @return The name of the database. - */ - public String getDatabase() { - return database; - } - - /** - * @return The host where the database is running. 
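// A brief usage sketch for the CycleHandler shown earlier: its colored
// depth-first search marks nodes white (unvisited), grey (on the current DFS
// path) and black (finished); following an edge into a grey node closes a cycle.
// The sketch assumes an existing Wikipedia instance `wiki` and an already built
// CategoryGraph `categoryGraph` (neither is constructed in this excerpt); both
// calls may throw WikiApiException.
CycleHandler cycleHandler = new CycleHandler(wiki, categoryGraph);
if (cycleHandler.containsCycle()) {
    // removes one edge per detected cycle until no cycle remains
    cycleHandler.removeCycles();
}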
- */ - public String getHost() { - return host; - } - - /** - * @return The password to access the database. - */ - public String getPassword() { - return password; - } - - /** - * @return The database user. - */ - public String getUser() { - return user; - } - - /** - * @return The language of the Wikipedia data. - */ - public WikiConstants.Language getLanguage() { - return language; - } - - /** - * @param databaseDriver the databaseDriver to set - */ - public void setDatabaseDriver(String databaseDriver) { - this.databaseDriver = databaseDriver; - } - - /** - * @return the databaseDriver - */ - public String getDatabaseDriver() { - return databaseDriver; - } - - /** - * @param jdbcURL the jdbcURL to set - */ - public void setJdbcURL(String jdbcURL) { - this.jdbcURL = jdbcURL; - } - - /** - * @return the jdbcURL - */ - public String getJdbcURL() { - return jdbcURL; - } +public class DatabaseConfiguration +{ + + private String host; + private String database; + private String user; + private String password; + private WikiConstants.Language language; + private String jdbcURL; + private String databaseDriver; + + public DatabaseConfiguration() + { + } + + /** + * A constructor for MySQL backends, i.e. the default production setting. + * + * @param host + * The hostname the machine the database is hosted on. + * @param database + * The name of the database to connect to. + * @param user + * The username as part of the credentials used for authentication. + * @param password + * The password as part of the credentials used for authentication. + * @param language + * The {@link WikiConstants.Language} used for the underlying connection. + */ + public DatabaseConfiguration(String host, String database, String user, String password, + WikiConstants.Language language) + { + + this("com.mysql.jdbc.Driver", "jdbc:mysql://" + host + "/" + database, host, database, user, + password, language); + } + + /** + * A constructor for an explicit DBMS specific configuration. + * + * @param databaseDriver + * The fully qualified name of the JDBC driver. + * @param jdbcURL + * A valid JDBC url used to open connections. + * @param host + * The hostname the machine the database is hosted on. + * @param database + * The name of the database to connect to. + * @param user + * The username as part of the credentials used for authentication. + * @param password + * The password as part of the credentials used for authentication. + * @param language + * The {@link WikiConstants.Language} used for the underlying connection. + */ + public DatabaseConfiguration(String databaseDriver, String jdbcURL, String host, + String database, String user, String password, WikiConstants.Language language) + { + this.host = host; + this.database = database; + this.user = user; + this.password = password; + this.language = language; + + this.setDatabaseDriver(databaseDriver); + this.setJdbcURL(jdbcURL); + } + + /** + * @return {@code True} if collation is supported by the database backend, else {@code false}. + */ + boolean supportsCollation() + { + if (databaseDriver != null) { + return databaseDriver.contains("mysql") || databaseDriver.contains("mariadb"); + } + else { + return false; + } + } + + /** + * @param database + * The name of the database. + */ + public void setDatabase(String database) + { + this.database = database; + } + + /** + * @param host + * The host where the database is running. Set to "localhost", if the database is + * running locally. 
+ */ + public void setHost(String host) + { + this.host = host; + } + + /** + * @param password + * The password to access the database. + */ + public void setPassword(String password) + { + this.password = password; + } + + /** + * @param user + * The database user. + */ + public void setUser(String user) + { + this.user = user; + } + + /** + * @param language + * The language of the Wikipedia data. + */ + public void setLanguage(WikiConstants.Language language) + { + this.language = language; + } + + /** + * @return The name of the database. + */ + public String getDatabase() + { + return database; + } + + /** + * @return The host where the database is running. + */ + public String getHost() + { + return host; + } + + /** + * @return The password to access the database. + */ + public String getPassword() + { + return password; + } + + /** + * @return The database user. + */ + public String getUser() + { + return user; + } + + /** + * @return The language of the Wikipedia data. + */ + public WikiConstants.Language getLanguage() + { + return language; + } + + /** + * @param databaseDriver + * the databaseDriver to set + */ + public void setDatabaseDriver(String databaseDriver) + { + this.databaseDriver = databaseDriver; + } + + /** + * @return the databaseDriver + */ + public String getDatabaseDriver() + { + return databaseDriver; + } + + /** + * @param jdbcURL + * the jdbcURL to set + */ + public void setJdbcURL(String jdbcURL) + { + this.jdbcURL = jdbcURL; + } + + /** + * @return the jdbcURL + */ + public String getJdbcURL() + { + return jdbcURL; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/MetaData.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/MetaData.java index 0d4429a5..551156a2 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/MetaData.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/MetaData.java @@ -24,130 +24,146 @@ /** * Provides access to meta-data about a certain {@link Wikipedia} instance. */ -public class MetaData implements WikiConstants { +public class MetaData + implements WikiConstants +{ - private final Wikipedia wiki; - private final org.dkpro.jwpl.api.hibernate.MetaData hibernateMetaData; + private final Wikipedia wiki; + private final org.dkpro.jwpl.api.hibernate.MetaData hibernateMetaData; - /** - * Creates a meta data object. - */ - protected MetaData(Wikipedia wiki) { - this.wiki = wiki; - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - hibernateMetaData = session.createQuery("from MetaData", - org.dkpro.jwpl.api.hibernate.MetaData.class).uniqueResult(); - session.getTransaction().commit(); - } + /** + * Creates a meta data object. + */ + protected MetaData(Wikipedia wiki) + { + this.wiki = wiki; + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + hibernateMetaData = session + .createQuery("from MetaData", org.dkpro.jwpl.api.hibernate.MetaData.class) + .uniqueResult(); + session.getTransaction().commit(); + } - /** - * @return The id of the {@link MetaData} object. - */ - /* - * Note well: - * Access is limited to package-private here intentionally, as the database ID is considered framework-internal use. - */ - long getId() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - long id = hibernateMetaData.getId(); - session.getTransaction().commit(); - return id; - } + /** + * @return The id of the {@link MetaData} object. 
+ */ + /* + * Note well: Access is limited to package-private here intentionally, as the database ID is + * considered framework-internal use. + */ + long getId() + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + long id = hibernateMetaData.getId(); + session.getTransaction().commit(); + return id; + } - /** - * @return The number of categories in the current Wikipedia. - */ - public long getNumberOfCategories() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - long nrofCategories = hibernateMetaData.getNrofCategories(); - session.getTransaction().commit(); - return nrofCategories; - } + /** + * @return The number of categories in the current Wikipedia. + */ + public long getNumberOfCategories() + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + long nrofCategories = hibernateMetaData.getNrofCategories(); + session.getTransaction().commit(); + return nrofCategories; + } - /** - * @return The number of pages in the current Wikipedia. - */ - public long getNumberOfPages() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - long nrofPages = hibernateMetaData.getNrofPages(); - session.getTransaction().commit(); - return nrofPages; - } + /** + * @return The number of pages in the current Wikipedia. + */ + public long getNumberOfPages() + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + long nrofPages = hibernateMetaData.getNrofPages(); + session.getTransaction().commit(); + return nrofPages; + } - /** - * @return The number of disambiguation pages in the current Wikipedia. - */ - public long getNumberOfDisambiguationPages() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - long nrofDisambPages = hibernateMetaData.getNrofDisambiguationPages(); - session.getTransaction().commit(); - return nrofDisambPages; - } + /** + * @return The number of disambiguation pages in the current Wikipedia. + */ + public long getNumberOfDisambiguationPages() + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + long nrofDisambPages = hibernateMetaData.getNrofDisambiguationPages(); + session.getTransaction().commit(); + return nrofDisambPages; + } - /** - * @return The number of redirects in the current Wikipedia. - */ - public long getNumberOfRedirectPages() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - long nrofRedirects = hibernateMetaData.getNrofRedirects(); - session.getTransaction().commit(); - return nrofRedirects; - } + /** + * @return The number of redirects in the current Wikipedia. + */ + public long getNumberOfRedirectPages() + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + long nrofRedirects = hibernateMetaData.getNrofRedirects(); + session.getTransaction().commit(); + return nrofRedirects; + } - /** - * @return The disambiguation {@link Category}. 
- * @throws WikiApiException Thrown if errors occurred fetching the information. - */ - public Category getDisambiguationCategory() throws WikiApiException { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - String disambCategoryTitle = hibernateMetaData.getDisambiguationCategory(); - session.getTransaction().commit(); - return wiki.getCategory(disambCategoryTitle); - } + /** + * @return The disambiguation {@link Category}. + * @throws WikiApiException + * Thrown if errors occurred fetching the information. + */ + public Category getDisambiguationCategory() throws WikiApiException + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + String disambCategoryTitle = hibernateMetaData.getDisambiguationCategory(); + session.getTransaction().commit(); + return wiki.getCategory(disambCategoryTitle); + } - /** - * @return The name of the main/root {@link Category}. - * @throws WikiApiException Thrown if errors occurred fetching the information. - */ - public Category getMainCategory() throws WikiApiException { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - String mainCategoryTitle = hibernateMetaData.getMainCategory(); - session.getTransaction().commit(); - return wiki.getCategory(mainCategoryTitle); - } + /** + * @return The name of the main/root {@link Category}. + * @throws WikiApiException + * Thrown if errors occurred fetching the information. + */ + public Category getMainCategory() throws WikiApiException + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + String mainCategoryTitle = hibernateMetaData.getMainCategory(); + session.getTransaction().commit(); + return wiki.getCategory(mainCategoryTitle); + } - /** - * @return The version of the wikipedia data. - * @throws WikiApiException Thrown if errors occurred fetching the information. - */ - public String getVersion() throws WikiApiException { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.lock(hibernateMetaData, LockMode.NONE); - String version = hibernateMetaData.getVersion(); - session.getTransaction().commit(); - return version; - } + /** + * @return The version of the wikipedia data. + * @throws WikiApiException + * Thrown if errors occurred fetching the information. + */ + public String getVersion() throws WikiApiException + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.lock(hibernateMetaData, LockMode.NONE); + String version = hibernateMetaData.getVersion(); + session.getTransaction().commit(); + return version; + } - /** - * @return The language of this wikipedia. - */ - public Language getLanguage() { - return wiki.getLanguage(); - } + /** + * @return The language of this wikipedia. + */ + public Language getLanguage() + { + return wiki.getLanguage(); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Page.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Page.java index e56c56f7..dada990d 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Page.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Page.java @@ -40,578 +40,639 @@ * Represents a Wikipedia article page. */ // Adapter class for hiding hibernate session management from the user. 
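// A configuration sketch tying the preceding classes together: a
// DatabaseConfiguration describes the connection, and the MetaData accessors
// report corpus-level statistics. Host, database name, credentials and the
// Language constant are placeholders; obtaining the Wikipedia and MetaData
// objects (new Wikipedia(dbConfig), wiki.getMetaData()) follows the usual JWPL
// pattern and is not shown in this excerpt.
DatabaseConfiguration dbConfig = new DatabaseConfiguration(
        "localhost", "wikidb", "dbuser", "secret", WikiConstants.Language.english);
Wikipedia wiki = new Wikipedia(dbConfig);   // assumed JWPL constructor
MetaData metaData = wiki.getMetaData();     // assumed JWPL accessor
System.out.println(metaData.getNumberOfPages() + " pages, "
        + metaData.getNumberOfCategories() + " categories ("
        + metaData.getLanguage() + ")");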
-public class Page implements WikiConstants { - - private final Wikipedia wiki; - - private final PageDAO pageDAO; - - // The hibernatePage that is represented by this WikiAPI page. - // The indirection is necessary to shield the user from Hibernate sessions. - private org.dkpro.jwpl.api.hibernate.Page hibernatePage; - - // If we search for a redirect, the corresponding page is delivered transparently. - // In that case, isRedirect is set to true, to indicate that. - // Note: The page itself is _not_ a redirect, it is just a page. - private boolean isRedirect = false; - - /** - * Creates a page object. - * - * @param wiki - * The wikipedia object. - * @param id - * The hibernate id of the page. - * @throws WikiApiException Thrown if errors occurred. - */ - protected Page(Wikipedia wiki, long id) throws WikiApiException { - this.wiki = wiki; - this.pageDAO = new PageDAO(wiki); - fetchByHibernateId(id); - } - - /** - * Creates a page object. - * - * @param wiki - * The wikipedia object. - * @param pageID - * The pageID of the page. - * @throws WikiApiException Thrown if errors occurred. - */ - protected Page(Wikipedia wiki, int pageID) throws WikiApiException { - this.wiki = wiki; - this.pageDAO = new PageDAO(wiki); - fetchByPageId(pageID); - } - - /** - * Creates a page object. - * - * @param wiki - * The wikipedia object. - * @param pName - * The name of the page. - * @throws WikiApiException Thrown if errors occurred. - */ - public Page(Wikipedia wiki, String pName) throws WikiApiException { - this(wiki, pName, false); - } - - /** - * Creates a page object. - * - * @param wiki - * The wikipedia object. - * @param pName - * The name of the page. - * @param useExactTitle - * Whether to use the exact title or try to guess the correct wiki-style title. - * @throws WikiApiException Thrown if errors occurred. - */ - public Page(Wikipedia wiki, String pName, boolean useExactTitle) throws WikiApiException { - if (pName == null || pName.length() == 0) { - throw new WikiPageNotFoundException(); - } - this.wiki = wiki; - this.pageDAO = new PageDAO(wiki); - Title pageTitle = new Title(pName); - fetchByTitle(pageTitle, useExactTitle); - } - - /** - * Creates a Page object from an already retrieved hibernate Page - * - * @param wiki - * The wikipedia object. - * @param id - * The hibernate id of the page. - * @param hibernatePage - * The {@code api.hibernate.Page} that has already been retrieved - * @throws WikiApiException Thrown if errors occurred. - */ - protected Page(Wikipedia wiki, long id, org.dkpro.jwpl.api.hibernate.Page hibernatePage) throws WikiApiException { - this.wiki = wiki; - this.pageDAO = new PageDAO(wiki); - this.hibernatePage = hibernatePage; - } - - /** - * @throws WikiApiException Thrown if errors occurred. 
- * @see Page - */ - private void fetchByHibernateId(long id) throws WikiApiException { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - hibernatePage = pageDAO.findById(id); - session.getTransaction().commit(); - - if (hibernatePage == null) { - throw new WikiPageNotFoundException("No page with id " + id + " was found."); - } - } - - private void fetchByPageId(int pageID) throws WikiApiException { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - hibernatePage = session.createQuery("from Page where pageId = :id", org.dkpro.jwpl.api.hibernate.Page.class) - .setParameter("id", pageID, StandardBasicTypes.INTEGER).uniqueResult(); - session.getTransaction().commit(); - - if (hibernatePage == null) { - throw new WikiPageNotFoundException("No page with page id " + pageID + " was found."); - } - } - - /** - * CAUTION: Only returns 1 result, even if several results are possible. - * - * @param pTitle - * @throws WikiApiException Thrown if errors occurred. - */ - private void fetchByTitle(Title pTitle, boolean useExactTitle) throws WikiApiException { - String searchString = pTitle.getPlainTitle(); - if (!useExactTitle) { - searchString = pTitle.getWikiStyleTitle(); - } - - Session session; - session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - String sql = "select pml.pageID from PageMapLine as pml where pml.name = :pagetitle LIMIT 1"; - Integer pageId = session.createNativeQuery(sql, Integer.class) - .setParameter("pagetitle", searchString, StandardBasicTypes.STRING).uniqueResult(); - session.getTransaction().commit(); - - if (pageId == null) { - throw new WikiPageNotFoundException("No page with name " + searchString + " was found."); - } - fetchByPageId(pageId); - if (!this.isRedirect&&searchString != null&&!searchString.equals(getTitle().getRawTitleText())) { - if(this.isRedirect) { - //in case we already tried to re-retrieve the discussion page unsuccessfully, - //we have to give up here or we end up in an infinite loop. - - //reasons for this happening might be several entries in PageMapLine with the same name but different upper/lower case variants - //if the database does not allow case sensitive queries, then the API will always retrieve only the first result and if this is a redirect to a different writing variant, we are stuck in a loop. - //To fix this, either a case sensitive collation should be used or the API should be able to deal with set valued results and pick the correct one from the set. - //For now, we gracefully return without retrieving the Talk page for this article and throw an appropriate excption. - throw new WikiPageNotFoundException("No discussion page with name " + searchString + " could be retrieved. This is most likely due to multiple writing variants of the same page in the database"); - } else { - this.isRedirect = true; - /* - * WORKAROUND - * in our page is a redirect to a discussion page, we might not retrieve the target discussion page as expected but rather the article associated with the target discussion page - * we check this here and re-retrieve the correct page. 
- * this error should be avoided by keeping the namespace information in the database - * This fix has been provided by Shiri Dori-Hacohen and is discussed in the Google Group under https://groups.google.com/forum/#!topic/jwpl/2nlr55yp87I/discussion - */ - if (searchString.startsWith(DISCUSSION_PREFIX) && !getTitle().getRawTitleText().startsWith(DISCUSSION_PREFIX)) { - try { - fetchByTitle(new Title(DISCUSSION_PREFIX + getTitle().getRawTitleText()), useExactTitle); - } catch (WikiPageNotFoundException e) { - throw new WikiPageNotFoundException("No page with name " + DISCUSSION_PREFIX + getTitle().getRawTitleText() + " was found."); - } - } - } - } - } - - /** - * @return Returns the id. - */ - /* - * Note well: - * Access is limited to package-private here intentionally, as the database ID is considered framework-internal use. - */ - long __getId() { - return hibernatePage.getId(); - } - - /** - * @return Returns a unique page id. - */ - public int getPageId() { - return hibernatePage.getPageId(); - } - - /** - * @return A set of categories that this page belongs to. - */ - public Set getCategories() { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); - Set tmp = new UnmodifiableArraySet<>(hibernatePage.getCategories()); - session.getTransaction().commit(); - - Set categories = new HashSet<>(); - for (int pageID : tmp) { - categories.add(wiki.getCategory(pageID)); - } - - return categories; - } - - /** - * This is a more efficient shortcut for writing {@link Page#getCategories()}.size, as that would require - * to load all the categories first. - * - * @return The number of categories. - */ - public int getNumberOfCategories() { - int nrOfCategories = 0; - - long id = __getId(); - Session session = wiki.__getHibernateSession(); - session.beginTransaction(); - String sql = "select count(pages) from page_categories where id = :pageid"; - Long returnValue = session.createNativeQuery(sql, Long.class) - .setParameter("pageid", id, StandardBasicTypes.LONG).uniqueResult(); - session.getTransaction().commit(); - - if (returnValue != null) { - nrOfCategories = returnValue.intValue(); - } - return nrOfCategories; - } - - /** - * Returns the set of pages that have a link pointing to this page. - *

- * Warning: Do not use - * this for getting the number of inlinks with {@link Page#getInlinks()}.size(). This is too slow. Use - * {@link Page#getNumberOfInlinks()} instead. - * - * @return The set of pages that have a link pointing to this page. - */ - public Set getInlinks() { - Session session = wiki.__getHibernateSession(); - session.beginTransaction(); - session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); - // Have to copy links here since getPage later will close the session. - Set pageIDs = new UnmodifiableArraySet<>(hibernatePage.getInLinks()); - session.getTransaction().commit(); - - Set pages = new HashSet<>(); - for (int pageID : pageIDs) { - try { - pages.add(wiki.getPage(pageID)); - } - catch (WikiApiException e) { - // Silently ignore if a page could not be found - // There may be inlinks that do not come from an existing page. - } - } - - return pages; - } - - /** - * This is a more efficient shortcut for writing {@link Page#getInlinks()}.size(), as that would require to - * load all the inlinks first. - * - * @return The number of inlinks. - */ - public int getNumberOfInlinks() { - int nrOfInlinks = 0; - - long id = __getId(); - Session session = wiki.__getHibernateSession(); - session.beginTransaction(); - String sql = "select count(pi.inLinks) from page_inlinks as pi where pi.id = :piid"; - Long returnValue = session.createNativeQuery(sql, Long.class) - .setParameter("piid", id, StandardBasicTypes.LONG).uniqueResult(); - session.getTransaction().commit(); - - if (returnValue != null) { - nrOfInlinks = returnValue.intValue(); - } - return nrOfInlinks; - } - - /** - * The result set may also contain links from non-existing pages. It is in the responsibility of - * the user to check whether the page exists. - * - * @return Returns the IDs of the inLinks of this page. - */ - public Set getInlinkIDs() { - Session session = wiki.__getHibernateSession(); - session.beginTransaction(); - session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); - - Set tmpSet = new HashSet<>(hibernatePage.getInLinks()); - - session.getTransaction().commit(); - return tmpSet; - } - - /** - * Returns the set of pages that are linked from this page. Outlinks in a page might also point - * to non-existing pages. They are not included in the result set. Warning: Do not use - * this for getting the number of outlinks with {@link Page#getOutlinks()}.size(). This is too slow. Use - * {@link Page#getNumberOfOutlinks()} instead. - * - * @return The set of pages that are linked from this page. - */ - public Set getOutlinks() { - Session session = wiki.__getHibernateSession(); - session.beginTransaction(); - // session.lock(hibernatePage, LockMode.NONE); - session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); - // Have to copy links here since getPage later will close the session. - Set tmpSet = new UnmodifiableArraySet<>(hibernatePage.getOutLinks()); - session.getTransaction().commit(); - - Set pages = new HashSet<>(); - for (int pageID : tmpSet) { - try { - pages.add(wiki.getPage(pageID)); - } - catch (WikiApiException e) { - // Silently ignore if a page could not be found. - // There may be outlinks pointing to non-existing pages. - } - } - return pages; - } - - /** - * This is a more efficient shortcut for writing {@link Page#getOutlinks()}.size(), as that would require - * to load all the outlinks first. - * - * @return The number of outlinks. 
- */ - public int getNumberOfOutlinks() { - int nrOfOutlinks = 0; - - long id = __getId(); - Session session = wiki.__getHibernateSession(); - session.beginTransaction(); - String sql = "select count(outLinks) from page_outlinks where id = :id"; - Long returnValue = session.createNativeQuery(sql, Long.class) - .setParameter("id", id, StandardBasicTypes.LONG).uniqueResult(); - session.getTransaction().commit(); - - if (returnValue != null) { - nrOfOutlinks = returnValue.intValue(); - } - return nrOfOutlinks; - } - - /** - * The result set may also contain links from non-existing pages. It is in the responsibility of - * the user to check whether the page exists. - * - * @return Returns the IDs of the outLinks of this page. - */ - public Set getOutlinkIDs() { - - Session session = wiki.__getHibernateSession(); - session.beginTransaction(); - session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); - - Set tmpSet = new HashSet<>(hibernatePage.getOutLinks()); - - session.getTransaction().commit(); - return tmpSet; - } - - /** - * @return The title of the page. - * @throws WikiTitleParsingException Thrown if errors occurred while parsing. - */ - public Title getTitle() throws WikiTitleParsingException { - Session session = wiki.__getHibernateSession(); - session.beginTransaction(); - String name = hibernatePage.getName(); - session.getTransaction().commit(); - return new Title(name); - } - - /** - * @return The set of strings that are redirects to this page. - */ - public Set getRedirects() { - Session session = wiki.__getHibernateSession(); - session.beginTransaction(); - session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); - Set tmpSet = new HashSet<>(hibernatePage.getRedirects()); - session.getTransaction().commit(); - return tmpSet; - } - - /** - * @return The text of the page with media wiki markup. - */ - public String getText() { - Session session = wiki.__getHibernateSession(); - session.beginTransaction(); - String text = hibernatePage.getText(); - session.getTransaction().commit(); - - // Normalize strings read from the DB to use "\n" for all line breaks. - StringBuilder sb = new StringBuilder(text); - int t = 0; - boolean seenLineBreak = false; - char breakQue = ' '; - for (int s = 0; s < sb.length(); s++) { - char c = sb.charAt(s); - boolean isLineBreak = c == '\n' || c == '\r'; - if (isLineBreak) { - if (seenLineBreak && !(c == breakQue)) { - // This is a Windows or Mac line ending. Ignoring the second char - seenLineBreak = false; - continue; - } - else { - // Linebreak character that we cannot ignore - seenLineBreak = true; - breakQue = c; - } - } - else { - // Reset linebreak state - seenLineBreak = false; - } - - // Character needs to be copied - sb.setCharAt(t, isLineBreak ? '\n' : c); - t++; - } - sb.setLength(t); - - return sb.toString(); - } - - /** - * @return {@code True}, if the page is a disambiguation page, {@code false} otherwise. - */ - public boolean isDisambiguation() { - Session session = wiki.__getHibernateSession(); - session.beginTransaction(); - boolean isDisambiguation = hibernatePage.getIsDisambiguation(); - session.getTransaction().commit(); - return isDisambiguation; - } - - /** - * @return {@code True}, if the page was returned by querying a redirect string, {@code false} otherwise. - */ - public boolean isRedirect() { - return isRedirect; - } - - /** - * @return {@code True}, if the page is a discussion page, {@code false} otherwise. - * @throws WikiTitleParsingException Thrown if errors occurred. 
- */ - public boolean isDiscussion() throws WikiTitleParsingException { - return getTitle().getRawTitleText().startsWith(DISCUSSION_PREFIX); - } - - /** - *

Returns the Wikipedia article as plain text using the SwebleParser with - * a SimpleWikiConfiguration and the PlainTextConverter.
- * If you have different needs regarding the plain text, you can use - * getParsedPage(Visitor v) and provide your own Sweble-Visitor. Examples - * are in the {@code org.dkpro.jwpl.api.sweble} package - * or on http://www.sweble.org - *

- * - *

Alternatively, use {@link Page#getText()} to return the Wikipedia article - * with all Wiki markup. You can then use the old JWPL MediaWiki parser for - * creating a plain text version. The JWPL parser is now located in a - * separate project {@code org.dkpro.jwpl.api.parser}. - * Please refer to the JWPL Google Code project page for further reference.

- * - * @return The plain text of a Wikipedia article - * @throws WikiApiException Thrown if errors occurred. - */ - public String getPlainText() throws WikiApiException { - //Configure the PlainTextConverter for plain text parsing - return (String) parsePage(new PlainTextConverter(this.wiki.getWikConfig(), false, Integer.MAX_VALUE)); - } - - /** - * Parses the page with the Sweble parser using a SimpleWikiConfiguration - * and the provided visitor. For further information about the visitor - * concept, look at the examples in the - * {@code org.dkpro.jwpl.api.sweble} package, or on - * https://www.sweble.org or on the JWPL Google Code project - * page. - * - * @return the parsed page. The actual return type depends on the provided - * visitor. You have to cast the return type according to the return - * type of the go() method of your visitor. - * @throws WikiApiException Thrown if errors occurred. - */ - private Object parsePage(AstVisitor v) throws WikiApiException { - // Use the provided visitor to parse the page - return v.go(getCompiledPage().getPage()); - } - - /** - * Returns CompiledPage produced by the SWEBLE parser using the SimpleWikiConfiguration. - * - * @return the parsed page - * @throws WikiApiException Thrown if errors occurred. - */ - private EngProcessedPage getCompiledPage() throws WikiApiException { - EngProcessedPage cp; - try{ - WtEngineImpl engine = new WtEngineImpl(this.wiki.getWikConfig()); - - PageTitle pageTitle = PageTitle.make(this.wiki.getWikConfig(), this.getTitle().toString()); - PageId pageId = new PageId(pageTitle, -1); - - // Compile the retrieved page - cp = engine.postprocess(pageId, this.getText(), null); - } catch(Exception e){ - throw new WikiApiException(e); - } - return cp; - } - - - /////////////////////////////////////////////////////////////////// - /* - * The methods getInlinkAnchors() and getOutLinkAnchors() have not yet been - * migrated to the SWEBLE parser. The original versions based on the - * JWPL MediaWikiParser can be found in - * org.dkpro.jwpl.parser.LinkAnchorExtractor - */ - /////////////////////////////////////////////////////////////////// - - /** - * @return A string with infos about this page object. - * @throws WikiApiException Thrown if errors occurred. - */ - protected String getPageInfo() throws WikiApiException { - StringBuilder sb = new StringBuilder(1000); - - sb.append("ID : ").append(__getId()).append(LF); - sb.append("PageID : ").append(getPageId()).append(LF); - sb.append("Name : ").append(getTitle()).append(LF); - sb.append("Disambiguation : ").append(isDisambiguation()).append(LF); - sb.append("Redirects").append(LF); - for (String redirect : getRedirects()) { - sb.append(" ").append(redirect).append(LF); - } - sb.append("Categories").append(LF); - for (Category category : getCategories()) { - sb.append(" ").append(category.getTitle()).append(LF); - } - sb.append("In-Links").append(LF); - for (Page inLink : getInlinks()) { - sb.append(" ").append(inLink.getTitle()).append(LF); - } - sb.append("Out-Links").append(LF); - for (Page outLink : getOutlinks()) { - sb.append(" ").append(outLink.getTitle()).append(LF); - } - - return sb.toString(); - } +public class Page + implements WikiConstants +{ + + private final Wikipedia wiki; + + private final PageDAO pageDAO; + + // The hibernatePage that is represented by this WikiAPI page. + // The indirection is necessary to shield the user from Hibernate sessions. 
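// A minimal usage sketch for the Page API shown above, assuming an existing
// Wikipedia instance `wiki`; the article title is a placeholder. getText()
// returns the raw wiki markup, while getPlainText() runs the Sweble parser and
// returns plain text.
try {
    Page page = new Page(wiki, "Hibernate (framework)");
    System.out.println(page.getTitle().getPlainTitle());
    System.out.println("disambiguation: " + page.isDisambiguation());
    System.out.println("redirect query: " + page.isRedirect());
    System.out.println("inlinks/outlinks: " + page.getNumberOfInlinks() + " / "
            + page.getNumberOfOutlinks());
    String plainText = page.getPlainText();
    System.out.println(plainText.length() + " characters of plain text");
}
catch (WikiApiException e) {
    // e.g. WikiPageNotFoundException if no page with that name exists
}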
+ private org.dkpro.jwpl.api.hibernate.Page hibernatePage; + + // If we search for a redirect, the corresponding page is delivered transparently. + // In that case, isRedirect is set to true, to indicate that. + // Note: The page itself is _not_ a redirect, it is just a page. + private boolean isRedirect = false; + + /** + * Creates a page object. + * + * @param wiki + * The wikipedia object. + * @param id + * The hibernate id of the page. + * @throws WikiApiException + * Thrown if errors occurred. + */ + protected Page(Wikipedia wiki, long id) throws WikiApiException + { + this.wiki = wiki; + this.pageDAO = new PageDAO(wiki); + fetchByHibernateId(id); + } + + /** + * Creates a page object. + * + * @param wiki + * The wikipedia object. + * @param pageID + * The pageID of the page. + * @throws WikiApiException + * Thrown if errors occurred. + */ + protected Page(Wikipedia wiki, int pageID) throws WikiApiException + { + this.wiki = wiki; + this.pageDAO = new PageDAO(wiki); + fetchByPageId(pageID); + } + + /** + * Creates a page object. + * + * @param wiki + * The wikipedia object. + * @param pName + * The name of the page. + * @throws WikiApiException + * Thrown if errors occurred. + */ + public Page(Wikipedia wiki, String pName) throws WikiApiException + { + this(wiki, pName, false); + } + + /** + * Creates a page object. + * + * @param wiki + * The wikipedia object. + * @param pName + * The name of the page. + * @param useExactTitle + * Whether to use the exact title or try to guess the correct wiki-style title. + * @throws WikiApiException + * Thrown if errors occurred. + */ + public Page(Wikipedia wiki, String pName, boolean useExactTitle) throws WikiApiException + { + if (pName == null || pName.length() == 0) { + throw new WikiPageNotFoundException(); + } + this.wiki = wiki; + this.pageDAO = new PageDAO(wiki); + Title pageTitle = new Title(pName); + fetchByTitle(pageTitle, useExactTitle); + } + + /** + * Creates a Page object from an already retrieved hibernate Page + * + * @param wiki + * The wikipedia object. + * @param id + * The hibernate id of the page. + * @param hibernatePage + * The {@code api.hibernate.Page} that has already been retrieved + * @throws WikiApiException + * Thrown if errors occurred. + */ + protected Page(Wikipedia wiki, long id, org.dkpro.jwpl.api.hibernate.Page hibernatePage) + throws WikiApiException + { + this.wiki = wiki; + this.pageDAO = new PageDAO(wiki); + this.hibernatePage = hibernatePage; + } + + /** + * @throws WikiApiException + * Thrown if errors occurred. + * @see Page + */ + private void fetchByHibernateId(long id) throws WikiApiException + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + hibernatePage = pageDAO.findById(id); + session.getTransaction().commit(); + + if (hibernatePage == null) { + throw new WikiPageNotFoundException("No page with id " + id + " was found."); + } + } + + private void fetchByPageId(int pageID) throws WikiApiException + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + hibernatePage = session + .createQuery("from Page where pageId = :id", + org.dkpro.jwpl.api.hibernate.Page.class) + .setParameter("id", pageID, StandardBasicTypes.INTEGER).uniqueResult(); + session.getTransaction().commit(); + + if (hibernatePage == null) { + throw new WikiPageNotFoundException("No page with page id " + pageID + " was found."); + } + } + + /** + * CAUTION: Only returns 1 result, even if several results are possible. 
+ * + * @param pTitle + * @throws WikiApiException + * Thrown if errors occurred. + */ + private void fetchByTitle(Title pTitle, boolean useExactTitle) throws WikiApiException + { + String searchString = pTitle.getPlainTitle(); + if (!useExactTitle) { + searchString = pTitle.getWikiStyleTitle(); + } + + Session session; + session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + String sql = "select pml.pageID from PageMapLine as pml where pml.name = :pagetitle LIMIT 1"; + Integer pageId = session.createNativeQuery(sql, Integer.class) + .setParameter("pagetitle", searchString, StandardBasicTypes.STRING).uniqueResult(); + session.getTransaction().commit(); + + if (pageId == null) { + throw new WikiPageNotFoundException( + "No page with name " + searchString + " was found."); + } + fetchByPageId(pageId); + if (!this.isRedirect && searchString != null + && !searchString.equals(getTitle().getRawTitleText())) { + if (this.isRedirect) { + // in case we already tried to re-retrieve the discussion page unsuccessfully, + // we have to give up here or we end up in an infinite loop. + + // reasons for this happening might be several entries in PageMapLine with the same + // name but different upper/lower case variants + // if the database does not allow case sensitive queries, then the API will always + // retrieve only the first result and if this is a redirect to a different writing + // variant, we are stuck in a loop. + // To fix this, either a case sensitive collation should be used or the API should + // be able to deal with set valued results and pick the correct one from the set. + // For now, we gracefully return without retrieving the Talk page for this article + // and throw an appropriate excption. + throw new WikiPageNotFoundException("No discussion page with name " + searchString + + " could be retrieved. This is most likely due to multiple writing variants of the same page in the database"); + } + else { + this.isRedirect = true; + /* + * WORKAROUND in our page is a redirect to a discussion page, we might not retrieve + * the target discussion page as expected but rather the article associated with the + * target discussion page we check this here and re-retrieve the correct page. this + * error should be avoided by keeping the namespace information in the database This + * fix has been provided by Shiri Dori-Hacohen and is discussed in the Google Group + * under https://groups.google.com/forum/#!topic/jwpl/2nlr55yp87I/discussion + */ + if (searchString.startsWith(DISCUSSION_PREFIX) + && !getTitle().getRawTitleText().startsWith(DISCUSSION_PREFIX)) { + try { + fetchByTitle(new Title(DISCUSSION_PREFIX + getTitle().getRawTitleText()), + useExactTitle); + } + catch (WikiPageNotFoundException e) { + throw new WikiPageNotFoundException("No page with name " + DISCUSSION_PREFIX + + getTitle().getRawTitleText() + " was found."); + } + } + } + } + } + + /** + * @return Returns the id. + */ + /* + * Note well: Access is limited to package-private here intentionally, as the database ID is + * considered framework-internal use. + */ + long __getId() + { + return hibernatePage.getId(); + } + + /** + * @return Returns a unique page id. + */ + public int getPageId() + { + return hibernatePage.getPageId(); + } + + /** + * @return A set of categories that this page belongs to. 
+ */ + public Set getCategories() + { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); + Set tmp = new UnmodifiableArraySet<>(hibernatePage.getCategories()); + session.getTransaction().commit(); + + Set categories = new HashSet<>(); + for (int pageID : tmp) { + categories.add(wiki.getCategory(pageID)); + } + + return categories; + } + + /** + * This is a more efficient shortcut for writing {@link Page#getCategories()}.size, as that + * would require to load all the categories first. + * + * @return The number of categories. + */ + public int getNumberOfCategories() + { + int nrOfCategories = 0; + + long id = __getId(); + Session session = wiki.__getHibernateSession(); + session.beginTransaction(); + String sql = "select count(pages) from page_categories where id = :pageid"; + Long returnValue = session.createNativeQuery(sql, Long.class) + .setParameter("pageid", id, StandardBasicTypes.LONG).uniqueResult(); + session.getTransaction().commit(); + + if (returnValue != null) { + nrOfCategories = returnValue.intValue(); + } + return nrOfCategories; + } + + /** + * Returns the set of pages that have a link pointing to this page. + *

+ * Warning: Do not use this for getting the number of inlinks with + * {@link Page#getInlinks()}.size(). This is too slow. Use {@link Page#getNumberOfInlinks()} + * instead. + * + * @return The set of pages that have a link pointing to this page. + */ + public Set getInlinks() + { + Session session = wiki.__getHibernateSession(); + session.beginTransaction(); + session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); + // Have to copy links here since getPage later will close the session. + Set pageIDs = new UnmodifiableArraySet<>(hibernatePage.getInLinks()); + session.getTransaction().commit(); + + Set pages = new HashSet<>(); + for (int pageID : pageIDs) { + try { + pages.add(wiki.getPage(pageID)); + } + catch (WikiApiException e) { + // Silently ignore if a page could not be found + // There may be inlinks that do not come from an existing page. + } + } + + return pages; + } + + /** + * This is a more efficient shortcut for writing {@link Page#getInlinks()}.size(), as that would + * require to load all the inlinks first. + * + * @return The number of inlinks. + */ + public int getNumberOfInlinks() + { + int nrOfInlinks = 0; + + long id = __getId(); + Session session = wiki.__getHibernateSession(); + session.beginTransaction(); + String sql = "select count(pi.inLinks) from page_inlinks as pi where pi.id = :piid"; + Long returnValue = session.createNativeQuery(sql, Long.class) + .setParameter("piid", id, StandardBasicTypes.LONG).uniqueResult(); + session.getTransaction().commit(); + + if (returnValue != null) { + nrOfInlinks = returnValue.intValue(); + } + return nrOfInlinks; + } + + /** + * The result set may also contain links from non-existing pages. It is in the responsibility of + * the user to check whether the page exists. + * + * @return Returns the IDs of the inLinks of this page. + */ + public Set getInlinkIDs() + { + Session session = wiki.__getHibernateSession(); + session.beginTransaction(); + session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); + + Set tmpSet = new HashSet<>(hibernatePage.getInLinks()); + + session.getTransaction().commit(); + return tmpSet; + } + + /** + * Returns the set of pages that are linked from this page. Outlinks in a page might also point + * to non-existing pages. They are not included in the result set. Warning: Do not use + * this for getting the number of outlinks with {@link Page#getOutlinks()}.size(). This is too + * slow. Use {@link Page#getNumberOfOutlinks()} instead. + * + * @return The set of pages that are linked from this page. + */ + public Set getOutlinks() + { + Session session = wiki.__getHibernateSession(); + session.beginTransaction(); + // session.lock(hibernatePage, LockMode.NONE); + session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); + // Have to copy links here since getPage later will close the session. + Set tmpSet = new UnmodifiableArraySet<>(hibernatePage.getOutLinks()); + session.getTransaction().commit(); + + Set pages = new HashSet<>(); + for (int pageID : tmpSet) { + try { + pages.add(wiki.getPage(pageID)); + } + catch (WikiApiException e) { + // Silently ignore if a page could not be found. + // There may be outlinks pointing to non-existing pages. + } + } + return pages; + } + + /** + * This is a more efficient shortcut for writing {@link Page#getOutlinks()}.size(), as that + * would require to load all the outlinks first. + * + * @return The number of outlinks. 
+ */ + public int getNumberOfOutlinks() + { + int nrOfOutlinks = 0; + + long id = __getId(); + Session session = wiki.__getHibernateSession(); + session.beginTransaction(); + String sql = "select count(outLinks) from page_outlinks where id = :id"; + Long returnValue = session.createNativeQuery(sql, Long.class) + .setParameter("id", id, StandardBasicTypes.LONG).uniqueResult(); + session.getTransaction().commit(); + + if (returnValue != null) { + nrOfOutlinks = returnValue.intValue(); + } + return nrOfOutlinks; + } + + /** + * The result set may also contain links from non-existing pages. It is in the responsibility of + * the user to check whether the page exists. + * + * @return Returns the IDs of the outLinks of this page. + */ + public Set getOutlinkIDs() + { + + Session session = wiki.__getHibernateSession(); + session.beginTransaction(); + session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); + + Set tmpSet = new HashSet<>(hibernatePage.getOutLinks()); + + session.getTransaction().commit(); + return tmpSet; + } + + /** + * @return The title of the page. + * @throws WikiTitleParsingException + * Thrown if errors occurred while parsing. + */ + public Title getTitle() throws WikiTitleParsingException + { + Session session = wiki.__getHibernateSession(); + session.beginTransaction(); + String name = hibernatePage.getName(); + session.getTransaction().commit(); + return new Title(name); + } + + /** + * @return The set of strings that are redirects to this page. + */ + public Set getRedirects() + { + Session session = wiki.__getHibernateSession(); + session.beginTransaction(); + session.buildLockRequest(LockOptions.NONE).lock(hibernatePage); + Set tmpSet = new HashSet<>(hibernatePage.getRedirects()); + session.getTransaction().commit(); + return tmpSet; + } + + /** + * @return The text of the page with media wiki markup. + */ + public String getText() + { + Session session = wiki.__getHibernateSession(); + session.beginTransaction(); + String text = hibernatePage.getText(); + session.getTransaction().commit(); + + // Normalize strings read from the DB to use "\n" for all line breaks. + StringBuilder sb = new StringBuilder(text); + int t = 0; + boolean seenLineBreak = false; + char breakQue = ' '; + for (int s = 0; s < sb.length(); s++) { + char c = sb.charAt(s); + boolean isLineBreak = c == '\n' || c == '\r'; + if (isLineBreak) { + if (seenLineBreak && !(c == breakQue)) { + // This is a Windows or Mac line ending. Ignoring the second char + seenLineBreak = false; + continue; + } + else { + // Linebreak character that we cannot ignore + seenLineBreak = true; + breakQue = c; + } + } + else { + // Reset linebreak state + seenLineBreak = false; + } + + // Character needs to be copied + sb.setCharAt(t, isLineBreak ? '\n' : c); + t++; + } + sb.setLength(t); + + return sb.toString(); + } + + /** + * @return {@code True}, if the page is a disambiguation page, {@code false} otherwise. + */ + public boolean isDisambiguation() + { + Session session = wiki.__getHibernateSession(); + session.beginTransaction(); + boolean isDisambiguation = hibernatePage.getIsDisambiguation(); + session.getTransaction().commit(); + return isDisambiguation; + } + + /** + * @return {@code True}, if the page was returned by querying a redirect string, {@code false} + * otherwise. + */ + public boolean isRedirect() + { + return isRedirect; + } + + /** + * @return {@code True}, if the page is a discussion page, {@code false} otherwise. + * @throws WikiTitleParsingException + * Thrown if errors occurred. 
+ */ + public boolean isDiscussion() throws WikiTitleParsingException + { + return getTitle().getRawTitleText().startsWith(DISCUSSION_PREFIX); + } + + /** + *

+     * Returns the Wikipedia article as plain text using the SwebleParser with a
+     * SimpleWikiConfiguration and the PlainTextConverter.
+     * If you have different needs regarding the plain text, you can use getParsedPage(Visitor v)
+     * and provide your own Sweble-Visitor. Examples are in the {@code org.dkpro.jwpl.api.sweble}
+     * package or on http://www.sweble.org
+     *
+     * Alternatively, use {@link Page#getText()} to return the Wikipedia article with all Wiki
+     * markup. You can then use the old JWPL MediaWiki parser for creating a plain text version.
+     * The JWPL parser is now located in a separate project {@code org.dkpro.jwpl.api.parser}.
+     * Please refer to the JWPL Google Code project page for further reference.
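+     *
+     * Illustrative usage sketch (hypothetical; assumes an initialized {@code Wikipedia}
+     * instance {@code wiki} and an existing article title):
+     * <pre>{@code
+     * Page page = wiki.getPage("Germany");
+     * String plain = page.getPlainText();   // Sweble-based rendering without Wiki markup
+     * String markup = page.getText();       // raw MediaWiki markup, line breaks normalized to \n
+     * }</pre>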

+ * + * @return The plain text of a Wikipedia article + * @throws WikiApiException + * Thrown if errors occurred. + */ + public String getPlainText() throws WikiApiException + { + // Configure the PlainTextConverter for plain text parsing + return (String) parsePage( + new PlainTextConverter(this.wiki.getWikConfig(), false, Integer.MAX_VALUE)); + } + + /** + * Parses the page with the Sweble parser using a SimpleWikiConfiguration and the provided + * visitor. For further information about the visitor concept, look at the examples in the + * {@code org.dkpro.jwpl.api.sweble} package, or on + * https://www.sweble.org or on the JWPL Google Code + * project page. + * + * @return the parsed page. The actual return type depends on the provided visitor. You have to + * cast the return type according to the return type of the go() method of your visitor. + * @throws WikiApiException + * Thrown if errors occurred. + */ + private Object parsePage(AstVisitor v) throws WikiApiException + { + // Use the provided visitor to parse the page + return v.go(getCompiledPage().getPage()); + } + + /** + * Returns CompiledPage produced by the SWEBLE parser using the SimpleWikiConfiguration. + * + * @return the parsed page + * @throws WikiApiException + * Thrown if errors occurred. + */ + private EngProcessedPage getCompiledPage() throws WikiApiException + { + EngProcessedPage cp; + try { + WtEngineImpl engine = new WtEngineImpl(this.wiki.getWikConfig()); + + PageTitle pageTitle = PageTitle.make(this.wiki.getWikConfig(), + this.getTitle().toString()); + PageId pageId = new PageId(pageTitle, -1); + + // Compile the retrieved page + cp = engine.postprocess(pageId, this.getText(), null); + } + catch (Exception e) { + throw new WikiApiException(e); + } + return cp; + } + + /////////////////////////////////////////////////////////////////// + /* + * The methods getInlinkAnchors() and getOutLinkAnchors() have not yet been migrated to the + * SWEBLE parser. The original versions based on the JWPL MediaWikiParser can be found in + * org.dkpro.jwpl.parser.LinkAnchorExtractor + */ + /////////////////////////////////////////////////////////////////// + + /** + * @return A string with infos about this page object. + * @throws WikiApiException + * Thrown if errors occurred. 
+ */ + protected String getPageInfo() throws WikiApiException + { + StringBuilder sb = new StringBuilder(1000); + + sb.append("ID : ").append(__getId()).append(LF); + sb.append("PageID : ").append(getPageId()).append(LF); + sb.append("Name : ").append(getTitle()).append(LF); + sb.append("Disambiguation : ").append(isDisambiguation()).append(LF); + sb.append("Redirects").append(LF); + for (String redirect : getRedirects()) { + sb.append(" ").append(redirect).append(LF); + } + sb.append("Categories").append(LF); + for (Category category : getCategories()) { + sb.append(" ").append(category.getTitle()).append(LF); + } + sb.append("In-Links").append(LF); + for (Page inLink : getInlinks()) { + sb.append(" ").append(inLink.getTitle()).append(LF); + } + sb.append("Out-Links").append(LF); + for (Page outLink : getOutlinks()) { + sb.append(" ").append(outLink.getTitle()).append(LF); + } + + return sb.toString(); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterable.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterable.java index f0da621d..2f5de70c 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterable.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterable.java @@ -22,38 +22,40 @@ /** * An {@link Iterable} of {@link Page} objects. */ -public class PageIterable implements Iterable { - - private final Wikipedia wiki; - - /* - * Whether only articles are retrieved (or also disambiguation pages) - */ - private final boolean onlyArticles; - - /* - * The size of the page buffer. - * With bufferSize = 1, a database connection is needed for retrieving a single article. - * Higher bufferSize gives better performance, but needs memory. - * Initialize it with 500. - */ - private int bufferSize = 500; - - public PageIterable(Wikipedia wiki, boolean onlyArticles) { - this.wiki = wiki; - this.onlyArticles = onlyArticles; - } - - protected PageIterable(Wikipedia wiki, boolean onlyArticles, int bufferSize) { - this.wiki = wiki; - this.onlyArticles = onlyArticles; - this.bufferSize = bufferSize; - } - - @Override - public Iterator iterator() { - return new PageIterator(wiki, onlyArticles, bufferSize); - } +public class PageIterable + implements Iterable +{ + + private final Wikipedia wiki; + + /* + * Whether only articles are retrieved (or also disambiguation pages) + */ + private final boolean onlyArticles; + + /* + * The size of the page buffer. With bufferSize = 1, a database connection is needed for + * retrieving a single article. Higher bufferSize gives better performance, but needs memory. + * Initialize it with 500. + */ + private int bufferSize = 500; + + public PageIterable(Wikipedia wiki, boolean onlyArticles) + { + this.wiki = wiki; + this.onlyArticles = onlyArticles; + } + + protected PageIterable(Wikipedia wiki, boolean onlyArticles, int bufferSize) + { + this.wiki = wiki; + this.onlyArticles = onlyArticles; + this.bufferSize = bufferSize; + } + + @Override + public Iterator iterator() + { + return new PageIterator(wiki, onlyArticles, bufferSize); + } } - - diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterator.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterator.java index 41416219..a4fd0193 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterator.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageIterator.java @@ -34,212 +34,241 @@ /** * An {@link Iterator} over {@link Page} objects. 
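 *
 * Instances are usually obtained via {@link PageIterable} rather than created directly. A
 * minimal sketch, assuming an initialized {@code Wikipedia} instance {@code wiki}:
 * <pre>{@code
 * for (Page page : new PageIterable(wiki, true)) {   // true: only articles
 *     // process page; pages are fetched from the database in buffered batches
 * }
 * }</pre>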
*/ -public class PageIterator implements Iterator { - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private final PageBuffer buffer; - - public PageIterator(Wikipedia wiki, Set ids, Set titles, int bufferSize) { - buffer = new PageBuffer(bufferSize, wiki, ids, titles); - } - - public PageIterator(Wikipedia wiki, boolean onlyArticles, int bufferSize) { - buffer = new PageBuffer(bufferSize, wiki, onlyArticles); - } - - @Override - public boolean hasNext() { - return buffer.hasNext(); - } - - @Override - public Page next() { - return buffer.next(); - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - - /** - * Buffers {@link Page pages} in a list. - */ - static class PageBuffer { - - private final Wikipedia wiki; - private final boolean onlyArticles; - - private final List buffer; - private final int maxBufferSize; // the number of pages to be buffered after a query to the database. - private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements - private int bufferOffset; // the offset in the buffer - private long lastPage;// the overall offset in the data - - private List pageIds = new LinkedList<>(); // a set of ids, if a specific list of articles is supposed to be read - private List pageTitles = new LinkedList<>(); // a set of titles, if a specific list of articles is supposed to be read - final boolean loadFromList; - - public PageBuffer(int bufferSize, Wikipedia wiki, boolean onlyArticles) { - this.maxBufferSize = bufferSize; - this.wiki = wiki; - this.onlyArticles = onlyArticles; - this.buffer = new ArrayList<>(); - this.bufferFillSize = 0; - this.bufferOffset = 0; - this.lastPage = 0; - this.loadFromList = false; - //TODO test whether this works when zero pages are retrieved +public class PageIterator + implements Iterator +{ + + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + private final PageBuffer buffer; + + public PageIterator(Wikipedia wiki, Set ids, Set titles, int bufferSize) + { + buffer = new PageBuffer(bufferSize, wiki, ids, titles); } - public PageBuffer(int bufferSize, Wikipedia wiki, Set ids, Set titles) { - this.maxBufferSize = bufferSize; - this.wiki = wiki; - this.buffer = new ArrayList<>(); - this.onlyArticles = false; - this.bufferFillSize = 0; - this.bufferOffset = 0; - this.lastPage = 0; - this.pageIds = new LinkedList<>(ids); - this.pageTitles = new LinkedList<>(titles); - this.loadFromList = true; + public PageIterator(Wikipedia wiki, boolean onlyArticles, int bufferSize) + { + buffer = new PageBuffer(bufferSize, wiki, onlyArticles); } - /** - * If there are elements in the buffer left, then return true. - * If the end of the filled buffer is reached, then try to load new buffer. - * - * @return True, if there are pages left. False otherwise. - */ - public boolean hasNext() { - if (bufferOffset < bufferFillSize) { - return true; - } else { - return this.fillBuffer(); - } + @Override + public boolean hasNext() + { + return buffer.hasNext(); } - /** - * @return The next {@link Page} or {@code null} if no more pages are available. 
- */ - public Page next() { - // if there are still elements in the buffer, just retrieve the next one - if (bufferOffset < bufferFillSize) { - return this.getBufferElement(); - } - // if there are no more elements => try to fill a new buffer - else if (this.fillBuffer()) { - return this.getBufferElement(); - } else { - // if it cannot be filled => return null - return null; - } + @Override + public Page next() + { + return buffer.next(); } - private Page getBufferElement() { - Page page = buffer.get(bufferOffset); - bufferOffset++; - return page; + @Override + public void remove() + { + throw new UnsupportedOperationException(); } - private boolean fillBuffer() { + /** + * Buffers {@link Page pages} in a list. + */ + static class PageBuffer + { + + private final Wikipedia wiki; + private final boolean onlyArticles; - //decide whether to load from list or retrieve all available articles - if (loadFromList) { - // clear the old buffer and all variables regarding the state of the buffer - buffer.clear(); - bufferOffset = 0; - bufferFillSize = 0; + private final List buffer; + private final int maxBufferSize; // the number of pages to be buffered after a query to the + // database. + private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements + private int bufferOffset; // the offset in the buffer + private long lastPage;// the overall offset in the data - //load pages - if (pageIds.isEmpty() && pageTitles.isEmpty()) { - return false; + private List pageIds = new LinkedList<>(); // a set of ids, if a specific list of + // articles is supposed to be read + private List pageTitles = new LinkedList<>(); // a set of titles, if a specific list + // of articles is supposed to be read + final boolean loadFromList; + + public PageBuffer(int bufferSize, Wikipedia wiki, boolean onlyArticles) + { + this.maxBufferSize = bufferSize; + this.wiki = wiki; + this.onlyArticles = onlyArticles; + this.buffer = new ArrayList<>(); + this.bufferFillSize = 0; + this.bufferOffset = 0; + this.lastPage = 0; + this.loadFromList = false; + // TODO test whether this works when zero pages are retrieved } - while (bufferFillSize <= maxBufferSize && !pageIds.isEmpty()) { - String id = pageIds.remove(0); - if (id != null && !id.isEmpty()) { - try { - buffer.add(wiki.getPage(Integer.parseInt(id))); - bufferFillSize++; - } catch (WikiApiException e) { - logger.warn("Missing article with id " + id); - } - } + public PageBuffer(int bufferSize, Wikipedia wiki, Set ids, Set titles) + { + this.maxBufferSize = bufferSize; + this.wiki = wiki; + this.buffer = new ArrayList<>(); + this.onlyArticles = false; + this.bufferFillSize = 0; + this.bufferOffset = 0; + this.lastPage = 0; + this.pageIds = new LinkedList<>(ids); + this.pageTitles = new LinkedList<>(titles); + this.loadFromList = true; } - while (bufferFillSize <= maxBufferSize && !pageTitles.isEmpty()) { - String title = pageTitles.remove(0); - if (title != null && !title.isEmpty()) { - try { - buffer.add(wiki.getPage(title)); - bufferFillSize++; - } catch (WikiApiException e) { - logger.warn("Missing article with title \"" + title + "\""); + + /** + * If there are elements in the buffer left, then return true. If the end of the filled + * buffer is reached, then try to load new buffer. + * + * @return True, if there are pages left. False otherwise. 
+ */ + public boolean hasNext() + { + if (bufferOffset < bufferFillSize) { + return true; + } + else { + return this.fillBuffer(); } - } } - if (buffer.size() > 0) { - bufferFillSize = buffer.size(); - return true; - } else { - return false; + /** + * @return The next {@link Page} or {@code null} if no more pages are available. + */ + public Page next() + { + // if there are still elements in the buffer, just retrieve the next one + if (bufferOffset < bufferFillSize) { + return this.getBufferElement(); + } + // if there are no more elements => try to fill a new buffer + else if (this.fillBuffer()) { + return this.getBufferElement(); + } + else { + // if it cannot be filled => return null + return null; + } } - } else { - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - List returnValues; - TypedQuery query; - String sql; - if (onlyArticles) { - sql = "SELECT p FROM Page p WHERE p.isDisambiguation = :isDisambiguation AND p.id > :pageId"; - query = session.createQuery(sql, org.dkpro.jwpl.api.hibernate.Page.class); - query.setParameter("isDisambiguation", false); - query.setParameter("pageId", lastPage); - } else { - sql = "SELECT p FROM Page p WHERE p.id > :pageId"; - query = session.createQuery(sql, org.dkpro.jwpl.api.hibernate.Page.class); - query.setParameter("pageId", lastPage); + + private Page getBufferElement() + { + Page page = buffer.get(bufferOffset); + bufferOffset++; + return page; } - query.setMaxResults(maxBufferSize); - returnValues = query.getResultList(); - session.getTransaction().commit(); - - // clear the old buffer and all variables regarding the state of the buffer - buffer.clear(); - bufferOffset = 0; - bufferFillSize = 0; - - Page apiPage; - for (org.dkpro.jwpl.api.hibernate.Page o : returnValues) { - if (o == null) { - return false; - } else { - long id = o.getId(); - try { - apiPage = new Page(this.wiki, id, o); - if (this.onlyArticles) { - if (!apiPage.isRedirect()) { - buffer.add(apiPage); + + private boolean fillBuffer() + { + + // decide whether to load from list or retrieve all available articles + if (loadFromList) { + // clear the old buffer and all variables regarding the state of the buffer + buffer.clear(); + bufferOffset = 0; + bufferFillSize = 0; + + // load pages + if (pageIds.isEmpty() && pageTitles.isEmpty()) { + return false; + } + + while (bufferFillSize <= maxBufferSize && !pageIds.isEmpty()) { + String id = pageIds.remove(0); + if (id != null && !id.isEmpty()) { + try { + buffer.add(wiki.getPage(Integer.parseInt(id))); + bufferFillSize++; + } + catch (WikiApiException e) { + logger.warn("Missing article with id " + id); + } + } + } + while (bufferFillSize <= maxBufferSize && !pageTitles.isEmpty()) { + String title = pageTitles.remove(0); + if (title != null && !title.isEmpty()) { + try { + buffer.add(wiki.getPage(title)); + bufferFillSize++; + } + catch (WikiApiException e) { + logger.warn("Missing article with title \"" + title + "\""); + } + } + } + + if (buffer.size() > 0) { + bufferFillSize = buffer.size(); + return true; + } + else { + return false; } - } else { - buffer.add(apiPage); - } - } catch (WikiApiException e) { - logger.error("Page with hibernateID " + id + " not found."); } - lastPage = id; - } - } - if (buffer.size() > 0) { - bufferFillSize = buffer.size(); - return true; - } else { - return false; - } - } - } // fillBuffer + else { + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + List returnValues; + TypedQuery query; + String sql; + if (onlyArticles) { + 
sql = "SELECT p FROM Page p WHERE p.isDisambiguation = :isDisambiguation AND p.id > :pageId"; + query = session.createQuery(sql, org.dkpro.jwpl.api.hibernate.Page.class); + query.setParameter("isDisambiguation", false); + query.setParameter("pageId", lastPage); + } + else { + sql = "SELECT p FROM Page p WHERE p.id > :pageId"; + query = session.createQuery(sql, org.dkpro.jwpl.api.hibernate.Page.class); + query.setParameter("pageId", lastPage); + } + query.setMaxResults(maxBufferSize); + returnValues = query.getResultList(); + session.getTransaction().commit(); + + // clear the old buffer and all variables regarding the state of the buffer + buffer.clear(); + bufferOffset = 0; + bufferFillSize = 0; - } + Page apiPage; + for (org.dkpro.jwpl.api.hibernate.Page o : returnValues) { + if (o == null) { + return false; + } + else { + long id = o.getId(); + try { + apiPage = new Page(this.wiki, id, o); + if (this.onlyArticles) { + if (!apiPage.isRedirect()) { + buffer.add(apiPage); + } + } + else { + buffer.add(apiPage); + } + } + catch (WikiApiException e) { + logger.error("Page with hibernateID " + id + " not found."); + } + lastPage = id; + } + } + if (buffer.size() > 0) { + bufferFillSize = buffer.size(); + return true; + } + else { + return false; + } + } + } // fillBuffer + + } } \ No newline at end of file diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQuery.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQuery.java index 586343fb..343640d9 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQuery.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQuery.java @@ -18,286 +18,329 @@ package org.dkpro.jwpl.api; /** - * Represents a query for retrieving pages that meet the given conditions. - * Conditions are represented by the fields of a query object. + * Represents a query for retrieving pages that meet the given conditions. Conditions are + * represented by the fields of a query object. */ -public class PageQuery implements WikiConstants { - - /** - * Whether only article pages should be retrieved. - */ - private boolean onlyArticlePages; - - /** - * Whether only disambiguation pages should be retrieved. 
- */ - private boolean onlyDisambiguationPages; - - /** - * A regular expression style titlePattern for the page's title - */ - private String titlePattern; - - /** - * The minimum in-degree of the page - */ - private int minIndegree; - /** - * The maximum out-degree of the page - */ - private int maxIndegree; - - /** - * The minimum out-degree of the page - */ - private int minOutdegree; - /** - * The maximum out-degree of the page - */ - private int maxOutdegree; - - /** - * The minimum number of redirects of the page - */ - private int minRedirects; - /** - * The maximum number of redirects of the page - */ - private int maxRedirects; - - /** - * The minimum number of categories of the page - */ - private int minCategories; - /** - * The maximum number of categories of the page - */ - private int maxCategories; - - /** - * The minimum number of tokens in the page - */ - private int minTokens; - /** - * The minimum number of tokens in the page - */ - private int maxTokens; - - public PageQuery() { - onlyDisambiguationPages = false; - - titlePattern = ""; - - minIndegree = 0; - maxIndegree = Integer.MAX_VALUE; - - minOutdegree = 0; - maxOutdegree = Integer.MAX_VALUE; - - minRedirects = 0; - maxRedirects = Integer.MAX_VALUE; - - minCategories = 0; - maxCategories = Integer.MAX_VALUE; - - minTokens = 0; - maxTokens = Integer.MAX_VALUE; - - } - - protected int getMaxCategories() { - return maxCategories; - } - - protected int getMaxIndegree() { - return maxIndegree; - } - - protected int getMaxOutdegree() { - return maxOutdegree; - } - - protected int getMaxRedirects() { - return maxRedirects; - } - - protected int getMinCategories() { - return minCategories; - } - - protected int getMinIndegree() { - return minIndegree; - } - - protected int getMinOutdegree() { - return minOutdegree; - } - - protected int getMinRedirects() { - return minRedirects; - } - - protected boolean onlyArticlePages() { - return onlyArticlePages; - } - - protected boolean onlyDisambiguationPages() { - return onlyDisambiguationPages; - } - - protected int getMinTokens() { - return minTokens; - } - - protected int getMaxTokens() { - return maxTokens; - } - - protected String getTitlePattern() { - return titlePattern; - } - - /** - * Sets the minimum number of categories that queried articles should have. - * - * @param minCategories The minimum number of categories. - */ - public void setMinCategories(int minCategories) { - this.minCategories = minCategories; - } - - /** - * Sets the maximum number of categories that queried articles should have. - * - * @param maxCategories The maximum number of categories. - */ - public void setMaxCategories(int maxCategories) { - this.maxCategories = maxCategories; - } - - /** - * Sets the minimum number of ingoing links that queried articles should have. - * - * @param minIndegree The minimum number of ingoing links. - */ - public void setMinIndegree(int minIndegree) { - this.minIndegree = minIndegree; - } - - /** - * Sets the maximum number of ingoing links that queried articles should have. - * - * @param maxIndegree The maximum number of ingoing links. - */ - public void setMaxIndegree(int maxIndegree) { - this.maxIndegree = maxIndegree; - } - - /** - * Sets the minimum number of outgoing links that queried articles should have. - * - * @param minOutdegree The minimum number of outgoing links. - */ - public void setMinOutdegree(int minOutdegree) { - this.minOutdegree = minOutdegree; - } - - /** - * Sets the maximum number of outgoing links that queried articles should have. 
- * - * @param maxOutdegree The maximum number of outgoing links. - */ - public void setMaxOutdegree(int maxOutdegree) { - this.maxOutdegree = maxOutdegree; - } - - /** - * Sets the minimum number of redirects that queried articles should have. - * - * @param minRedirects The minimum number of redirects. - */ - public void setMinRedirects(int minRedirects) { - this.minRedirects = minRedirects; - } - - /** - * Sets the maximum number of redirects that queried articles should have. - * - * @param maxRedirects The maximum number of redirects. - */ - public void setMaxRedirects(int maxRedirects) { - this.maxRedirects = maxRedirects; - } - - /** - * Sets whether only be articles should be retrieved. - * - * @param onlyArticlePages If set to true, only article pages are returned. - */ - public void setOnlyArticlePages(boolean onlyArticlePages) { - this.onlyArticlePages = onlyArticlePages; - } - - /** - * Sets whether only disambiguation pages should be retrieved. - * - * @param onlyDisambiguationPages If set to true, only disambiguation pages are returned. - */ - public void setOnlyDisambiguationPages(boolean onlyDisambiguationPages) { - this.onlyDisambiguationPages = onlyDisambiguationPages; - } - - /** - * Sets the minimum number of tokens that queried articles should have. - * - * @param minTokens The minimum number of tokens. - */ - public void setMinTokens(int minTokens) { - this.minTokens = minTokens; - } - - /** - * Sets the maximum number of tokens that queried articles should have. - * - * @param maxTokens The maximum number of tokens. - */ - public void setMaxTokens(int maxTokens) { - this.maxTokens = maxTokens; - } - - /** - * Sets a regular expression that pages have to match. - * % for any number of arbitrary characters (can only be used at the end of a string) - * _ for a single arbitrary character (can also be used inside a string) - * - * @param pattern A regular expression pattern. - */ - public void setTitlePattern(String pattern) { - this.titlePattern = pattern; - } - - /** - * @return A string that shows the current values of the query members. - * @deprecated To be removed without replacement. - */ - @Deprecated(since="2.0.0", forRemoval=true) - public String getQueryInfo() { - StringBuilder sb = new StringBuilder(); - - sb.append("MaxCategories: ").append(maxCategories).append(LF); - sb.append("MinCategories: ").append(minCategories).append(LF); - sb.append("MaxIndegree: ").append(maxIndegree).append(LF); - sb.append("MinIndegree: ").append(minIndegree).append(LF); - sb.append("MaxOutdegree: ").append(maxOutdegree).append(LF); - sb.append("MinOutdegree: ").append(minOutdegree).append(LF); - sb.append("MaxRedirects: ").append(maxRedirects).append(LF); - sb.append("MinRedirects: ").append(minRedirects).append(LF); - sb.append("MaxTokens: ").append(maxTokens).append(LF); - sb.append("MinTokens: ").append(minTokens).append(LF); - sb.append("Only article pages: ").append(onlyArticlePages).append(LF); - sb.append("Only disambiguation pages: ").append(onlyDisambiguationPages).append(LF); - sb.append("Title pattern: ").append(titlePattern).append(LF); - - return sb.toString(); - } +public class PageQuery + implements WikiConstants +{ + + /** + * Whether only article pages should be retrieved. + */ + private boolean onlyArticlePages; + + /** + * Whether only disambiguation pages should be retrieved. 
+ */ + private boolean onlyDisambiguationPages; + + /** + * A regular expression style titlePattern for the page's title + */ + private String titlePattern; + + /** + * The minimum in-degree of the page + */ + private int minIndegree; + /** + * The maximum out-degree of the page + */ + private int maxIndegree; + + /** + * The minimum out-degree of the page + */ + private int minOutdegree; + /** + * The maximum out-degree of the page + */ + private int maxOutdegree; + + /** + * The minimum number of redirects of the page + */ + private int minRedirects; + /** + * The maximum number of redirects of the page + */ + private int maxRedirects; + + /** + * The minimum number of categories of the page + */ + private int minCategories; + /** + * The maximum number of categories of the page + */ + private int maxCategories; + + /** + * The minimum number of tokens in the page + */ + private int minTokens; + /** + * The minimum number of tokens in the page + */ + private int maxTokens; + + public PageQuery() + { + onlyDisambiguationPages = false; + + titlePattern = ""; + + minIndegree = 0; + maxIndegree = Integer.MAX_VALUE; + + minOutdegree = 0; + maxOutdegree = Integer.MAX_VALUE; + + minRedirects = 0; + maxRedirects = Integer.MAX_VALUE; + + minCategories = 0; + maxCategories = Integer.MAX_VALUE; + + minTokens = 0; + maxTokens = Integer.MAX_VALUE; + + } + + protected int getMaxCategories() + { + return maxCategories; + } + + protected int getMaxIndegree() + { + return maxIndegree; + } + + protected int getMaxOutdegree() + { + return maxOutdegree; + } + + protected int getMaxRedirects() + { + return maxRedirects; + } + + protected int getMinCategories() + { + return minCategories; + } + + protected int getMinIndegree() + { + return minIndegree; + } + + protected int getMinOutdegree() + { + return minOutdegree; + } + + protected int getMinRedirects() + { + return minRedirects; + } + + protected boolean onlyArticlePages() + { + return onlyArticlePages; + } + + protected boolean onlyDisambiguationPages() + { + return onlyDisambiguationPages; + } + + protected int getMinTokens() + { + return minTokens; + } + + protected int getMaxTokens() + { + return maxTokens; + } + + protected String getTitlePattern() + { + return titlePattern; + } + + /** + * Sets the minimum number of categories that queried articles should have. + * + * @param minCategories + * The minimum number of categories. + */ + public void setMinCategories(int minCategories) + { + this.minCategories = minCategories; + } + + /** + * Sets the maximum number of categories that queried articles should have. + * + * @param maxCategories + * The maximum number of categories. + */ + public void setMaxCategories(int maxCategories) + { + this.maxCategories = maxCategories; + } + + /** + * Sets the minimum number of ingoing links that queried articles should have. + * + * @param minIndegree + * The minimum number of ingoing links. + */ + public void setMinIndegree(int minIndegree) + { + this.minIndegree = minIndegree; + } + + /** + * Sets the maximum number of ingoing links that queried articles should have. + * + * @param maxIndegree + * The maximum number of ingoing links. + */ + public void setMaxIndegree(int maxIndegree) + { + this.maxIndegree = maxIndegree; + } + + /** + * Sets the minimum number of outgoing links that queried articles should have. + * + * @param minOutdegree + * The minimum number of outgoing links. 
+ */ + public void setMinOutdegree(int minOutdegree) + { + this.minOutdegree = minOutdegree; + } + + /** + * Sets the maximum number of outgoing links that queried articles should have. + * + * @param maxOutdegree + * The maximum number of outgoing links. + */ + public void setMaxOutdegree(int maxOutdegree) + { + this.maxOutdegree = maxOutdegree; + } + + /** + * Sets the minimum number of redirects that queried articles should have. + * + * @param minRedirects + * The minimum number of redirects. + */ + public void setMinRedirects(int minRedirects) + { + this.minRedirects = minRedirects; + } + + /** + * Sets the maximum number of redirects that queried articles should have. + * + * @param maxRedirects + * The maximum number of redirects. + */ + public void setMaxRedirects(int maxRedirects) + { + this.maxRedirects = maxRedirects; + } + + /** + * Sets whether only be articles should be retrieved. + * + * @param onlyArticlePages + * If set to true, only article pages are returned. + */ + public void setOnlyArticlePages(boolean onlyArticlePages) + { + this.onlyArticlePages = onlyArticlePages; + } + + /** + * Sets whether only disambiguation pages should be retrieved. + * + * @param onlyDisambiguationPages + * If set to true, only disambiguation pages are returned. + */ + public void setOnlyDisambiguationPages(boolean onlyDisambiguationPages) + { + this.onlyDisambiguationPages = onlyDisambiguationPages; + } + + /** + * Sets the minimum number of tokens that queried articles should have. + * + * @param minTokens + * The minimum number of tokens. + */ + public void setMinTokens(int minTokens) + { + this.minTokens = minTokens; + } + + /** + * Sets the maximum number of tokens that queried articles should have. + * + * @param maxTokens + * The maximum number of tokens. + */ + public void setMaxTokens(int maxTokens) + { + this.maxTokens = maxTokens; + } + + /** + * Sets a regular expression that pages have to match. % for any number of arbitrary characters + * (can only be used at the end of a string) _ for a single arbitrary character (can also be + * used inside a string) + * + * @param pattern + * A regular expression pattern. + */ + public void setTitlePattern(String pattern) + { + this.titlePattern = pattern; + } + + /** + * @return A string that shows the current values of the query members. + * @deprecated To be removed without replacement. 
+ */ + @Deprecated(since = "2.0.0", forRemoval = true) + public String getQueryInfo() + { + StringBuilder sb = new StringBuilder(); + + sb.append("MaxCategories: ").append(maxCategories).append(LF); + sb.append("MinCategories: ").append(minCategories).append(LF); + sb.append("MaxIndegree: ").append(maxIndegree).append(LF); + sb.append("MinIndegree: ").append(minIndegree).append(LF); + sb.append("MaxOutdegree: ").append(maxOutdegree).append(LF); + sb.append("MinOutdegree: ").append(minOutdegree).append(LF); + sb.append("MaxRedirects: ").append(maxRedirects).append(LF); + sb.append("MinRedirects: ").append(minRedirects).append(LF); + sb.append("MaxTokens: ").append(maxTokens).append(LF); + sb.append("MinTokens: ").append(minTokens).append(LF); + sb.append("Only article pages: ").append(onlyArticlePages).append(LF); + sb.append("Only disambiguation pages: ").append(onlyDisambiguationPages).append(LF); + sb.append("Title pattern: ").append(titlePattern).append(LF); + + return sb.toString(); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterable.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterable.java index dcf3f6b3..f33085a5 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterable.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterable.java @@ -33,133 +33,143 @@ /** * An iterable over {@link Page} objects selected by a query. */ -public class PageQueryIterable implements Iterable { - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private final Wikipedia wiki; - private final List pageIdList; - - public PageQueryIterable(Wikipedia wiki, PageQuery q) throws WikiApiException { - - this.wiki = wiki; - this.pageIdList = new ArrayList<>(); - - // get a list with all pageIDs of the pages conforming with the query - String hql = "select p.pageId from Page as p "; - List conditions = new ArrayList<>(); - if (q.onlyDisambiguationPages()) { - conditions.add("p.isDisambiguation = 1"); - } - if (q.onlyArticlePages()) { - conditions.add("p.isDisambiguation = 0"); - } - if (!"".equals(q.getTitlePattern())) { - conditions.add("p.name like '" + q.getTitlePattern() + "'"); +public class PageQueryIterable + implements Iterable +{ + + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + private final Wikipedia wiki; + private final List pageIdList; + + public PageQueryIterable(Wikipedia wiki, PageQuery q) throws WikiApiException + { + + this.wiki = wiki; + this.pageIdList = new ArrayList<>(); + + // get a list with all pageIDs of the pages conforming with the query + String hql = "select p.pageId from Page as p "; + List conditions = new ArrayList<>(); + if (q.onlyDisambiguationPages()) { + conditions.add("p.isDisambiguation = 1"); + } + if (q.onlyArticlePages()) { + conditions.add("p.isDisambiguation = 0"); + } + if (!"".equals(q.getTitlePattern())) { + conditions.add("p.name like '" + q.getTitlePattern() + "'"); + } + + String conditionString = StringUtils.join(conditions, " AND "); + if (conditionString.length() > 0) { + hql += "where " + conditionString; + } + + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + List idList = session.createQuery(hql, Integer.class).list(); + session.getTransaction().commit(); + + int progress = 0; + for (Integer pageID : idList) { + progress++; + ApiUtilities.printProgressInfo(progress, idList.size(), 100, + 
ApiUtilities.ProgressInfoMode.TEXT, + "searching " + idList.size() + " pages ... "); + + // shortcut to fasten queries that do not have such constraints + if (q.getMaxCategories() == Integer.MAX_VALUE && q.getMaxIndegree() == Integer.MAX_VALUE + && q.getMaxOutdegree() == Integer.MAX_VALUE + && q.getMaxRedirects() == Integer.MAX_VALUE + && q.getMaxTokens() == Integer.MAX_VALUE && q.getMinCategories() == 0 + && q.getMinIndegree() == 0 && q.getMinOutdegree() == 0 + && q.getMinRedirects() == 0 && q.getMinTokens() == 0) { + pageIdList.add(pageID); + continue; + } + + Page page = null; + try { + page = wiki.getPage(pageID); + } + catch (WikiPageNotFoundException e) { + logger.error("Page with pageID {} could not be found. Fatal error. Terminating.", + pageID); + e.printStackTrace(); + System.exit(1); + } + + String[] tokens = page.getPlainText().split(" "); + + if (!(q.getMinIndegree() >= 0 && q.getMaxIndegree() >= 0 + && q.getMinIndegree() <= q.getMaxIndegree())) { + q.setMinIndegree(0); + q.setMaxIndegree(Integer.MAX_VALUE); + } + + if (!(q.getMinOutdegree() >= 0 && q.getMaxOutdegree() >= 0 + && q.getMinOutdegree() <= q.getMaxOutdegree())) { + q.setMinOutdegree(0); + q.setMaxOutdegree(Integer.MAX_VALUE); + } + + if (!(q.getMinRedirects() >= 0 && q.getMaxRedirects() >= 0 + && q.getMinRedirects() <= q.getMaxRedirects())) { + q.setMinRedirects(0); + q.setMaxRedirects(Integer.MAX_VALUE); + } + + if (!(q.getMinCategories() >= 0 && q.getMaxCategories() >= 0 + && q.getMinCategories() <= q.getMaxCategories())) { + q.setMinCategories(0); + q.setMaxCategories(Integer.MAX_VALUE); + } + + if (!(q.getMinCategories() >= 0 && q.getMaxCategories() >= 0 + && q.getMinCategories() <= q.getMaxCategories())) { + q.setMinCategories(0); + q.setMaxCategories(Integer.MAX_VALUE); + } + + if (!(q.getMinTokens() >= 0 && q.getMaxTokens() >= 0 + && q.getMinTokens() <= q.getMaxTokens())) { + q.setMinTokens(0); + q.setMaxTokens(Integer.MAX_VALUE); + } + + int inlinkSize = page.getNumberOfInlinks(); + if (inlinkSize < q.getMinIndegree() || inlinkSize > q.getMaxIndegree()) { + continue; + } + + int outlinkSize = page.getNumberOfOutlinks(); + if (outlinkSize < q.getMinOutdegree() || outlinkSize > q.getMaxOutdegree()) { + continue; + } + if (page.getRedirects().size() < q.getMinRedirects() + || page.getRedirects().size() > q.getMaxRedirects()) { + continue; + } + + int categoriesSize = page.getCategories().size(); + if (categoriesSize < q.getMinCategories() || categoriesSize > q.getMaxCategories()) { + continue; + } + if (tokens.length < q.getMinTokens() || tokens.length > q.getMaxTokens()) { + continue; + } + + // if still here, add page + pageIdList.add(pageID); + } // for + logger.info("Query selected {} pages.", pageIdList.size()); } - String conditionString = StringUtils.join(conditions, " AND "); - if (conditionString.length() > 0) { - hql += "where " + conditionString; + @Override + public Iterator iterator() + { + return new PageQueryIterator(wiki, pageIdList); } - - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - List idList = session.createQuery(hql, Integer.class).list(); - session.getTransaction().commit(); - - int progress = 0; - for (Integer pageID : idList) { - progress++; - ApiUtilities.printProgressInfo(progress, idList.size(), 100, ApiUtilities.ProgressInfoMode.TEXT, "searching " + idList.size() + " pages ... 
"); - - // shortcut to fasten queries that do not have such constraints - if (q.getMaxCategories() == Integer.MAX_VALUE && - q.getMaxIndegree() == Integer.MAX_VALUE && - q.getMaxOutdegree() == Integer.MAX_VALUE && - q.getMaxRedirects() == Integer.MAX_VALUE && - q.getMaxTokens() == Integer.MAX_VALUE && - q.getMinCategories() == 0 && - q.getMinIndegree() == 0 && - q.getMinOutdegree() == 0 && - q.getMinRedirects() == 0 && - q.getMinTokens() == 0) { - pageIdList.add(pageID); - continue; - } - - Page page = null; - try { - page = wiki.getPage(pageID); - } catch (WikiPageNotFoundException e) { - logger.error("Page with pageID {} could not be found. Fatal error. Terminating.", pageID); - e.printStackTrace(); - System.exit(1); - } - - String[] tokens = page.getPlainText().split(" "); - - if (!(q.getMinIndegree() >= 0 && q.getMaxIndegree() >= 0 && q.getMinIndegree() <= q.getMaxIndegree())) { - q.setMinIndegree(0); - q.setMaxIndegree(Integer.MAX_VALUE); - } - - if (!(q.getMinOutdegree() >= 0 && q.getMaxOutdegree() >= 0 && q.getMinOutdegree() <= q.getMaxOutdegree())) { - q.setMinOutdegree(0); - q.setMaxOutdegree(Integer.MAX_VALUE); - } - - if (!(q.getMinRedirects() >= 0 && q.getMaxRedirects() >= 0 && q.getMinRedirects() <= q.getMaxRedirects())) { - q.setMinRedirects(0); - q.setMaxRedirects(Integer.MAX_VALUE); - } - - if (!(q.getMinCategories() >= 0 && q.getMaxCategories() >= 0 && q.getMinCategories() <= q.getMaxCategories())) { - q.setMinCategories(0); - q.setMaxCategories(Integer.MAX_VALUE); - } - - if (!(q.getMinCategories() >= 0 && q.getMaxCategories() >= 0 && q.getMinCategories() <= q.getMaxCategories())) { - q.setMinCategories(0); - q.setMaxCategories(Integer.MAX_VALUE); - } - - if (!(q.getMinTokens() >= 0 && q.getMaxTokens() >= 0 && q.getMinTokens() <= q.getMaxTokens())) { - q.setMinTokens(0); - q.setMaxTokens(Integer.MAX_VALUE); - } - - int inlinkSize = page.getNumberOfInlinks(); - if (inlinkSize < q.getMinIndegree() || inlinkSize > q.getMaxIndegree()) { - continue; - } - - int outlinkSize = page.getNumberOfOutlinks(); - if (outlinkSize < q.getMinOutdegree() || outlinkSize > q.getMaxOutdegree()) { - continue; - } - if (page.getRedirects().size() < q.getMinRedirects() || page.getRedirects().size() > q.getMaxRedirects()) { - continue; - } - - int categoriesSize = page.getCategories().size(); - if (categoriesSize < q.getMinCategories() || categoriesSize > q.getMaxCategories()) { - continue; - } - if (tokens.length < q.getMinTokens() || tokens.length > q.getMaxTokens()) { - continue; - } - - // if still here, add page - pageIdList.add(pageID); - } // for - logger.info("Query selected {} pages.", pageIdList.size()); - } - - @Override - public Iterator iterator() { - return new PageQueryIterator(wiki, pageIdList); - } } - - diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterator.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterator.java index 7e177dec..d2143d6c 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterator.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageQueryIterator.java @@ -28,39 +28,47 @@ /** * An iterator over {@link Page} objects selected by a query. 
*/ -public class PageQueryIterator implements Iterator { +public class PageQueryIterator + implements Iterator +{ - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); - private final Wikipedia wiki; - private int iterPosition; - private final List pageIDs; + private final Wikipedia wiki; + private int iterPosition; + private final List pageIDs; - public PageQueryIterator(Wikipedia wiki, List pPageIDs) { - this.wiki = wiki; - this.iterPosition = 0; - this.pageIDs = pPageIDs; - } + public PageQueryIterator(Wikipedia wiki, List pPageIDs) + { + this.wiki = wiki; + this.iterPosition = 0; + this.pageIDs = pPageIDs; + } - @Override - public boolean hasNext() { - return iterPosition < this.pageIDs.size(); - } + @Override + public boolean hasNext() + { + return iterPosition < this.pageIDs.size(); + } - @Override - public Page next() { - Page page = null; - try { - page = this.wiki.getPage(pageIDs.get(iterPosition)); - } catch (WikiApiException e) { - logger.error("Could not load page with id {}", pageIDs.get(iterPosition), e); + @Override + public Page next() + { + Page page = null; + try { + page = this.wiki.getPage(pageIDs.get(iterPosition)); + } + catch (WikiApiException e) { + logger.error("Could not load page with id {}", pageIDs.get(iterPosition), e); + } + iterPosition++; + return page; } - iterPosition++; - return page; - } - @Override - public void remove() { - throw new UnsupportedOperationException(); - } + @Override + public void remove() + { + throw new UnsupportedOperationException(); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageTitleComparator.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageTitleComparator.java index 97048a38..71c0b999 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageTitleComparator.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/PageTitleComparator.java @@ -24,16 +24,20 @@ /** * Compares two pages based on the lexicographic ordering of their titles. */ -public class PageTitleComparator implements Comparator { +public class PageTitleComparator + implements Comparator +{ - public int compare(Page o1, Page o2) { + public int compare(Page o1, Page o2) + { - int retVal = 0; - try { - retVal = o1.getTitle().getPlainTitle().compareTo(o2.getTitle().getPlainTitle()); - } catch (WikiTitleParsingException e) { - e.printStackTrace(); + int retVal = 0; + try { + retVal = o1.getTitle().getPlainTitle().compareTo(o2.getTitle().getPlainTitle()); + } + catch (WikiTitleParsingException e) { + e.printStackTrace(); + } + return retVal; } - return retVal; - } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Title.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Title.java index a47dc1ac..db107daa 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Title.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Title.java @@ -28,147 +28,166 @@ *
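 *
 * A minimal sketch (hypothetical title) of how a title string is decomposed:
 * <pre>{@code
 * Title title = new Title("Car_(automobile)#Introduction");
 * title.getEntity();              // "Car"
 * title.getDisambiguationText();  // "automobile"
 * title.getSectionText();         // "Introduction"
 * title.getPlainTitle();          // "Car (automobile)"
 * title.getWikiStyleTitle();      // "Car_(automobile)"
 * }</pre>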

* Title parsing regexp fixed with the help of many UKP colleagues and Samy Ateia. */ -public class Title { - - private static final Pattern PATTERN_NAMESPACE = Pattern.compile("(.*?)[ _]\\((.+?)\\)$"); - - private final String wikiStyleTitle; - private final String plainTitle; - private final String entity; - private final String disambiguationText; - private final String rawTitleText; - private final String sectionText; - - /** - * Create a {@link Title} object using a title string. - * The string gets parsed into an entity part and a disambiguation part. - * As Wikipedia page names represent spaces as underscores, we create a version with spaces and one without. - * - * @param titleText The title string of the page. - * @throws WikiTitleParsingException Thrown if errors occurred during sanitation of the {@code titleText}. - */ - public Title(String titleText) throws WikiTitleParsingException { - if (titleText.length() == 0) { - throw new WikiTitleParsingException("Title is empty."); +public class Title +{ + + private static final Pattern PATTERN_NAMESPACE = Pattern.compile("(.*?)[ _]\\((.+?)\\)$"); + + private final String wikiStyleTitle; + private final String plainTitle; + private final String entity; + private final String disambiguationText; + private final String rawTitleText; + private final String sectionText; + + /** + * Create a {@link Title} object using a title string. The string gets parsed into an entity + * part and a disambiguation part. As Wikipedia page names represent spaces as underscores, we + * create a version with spaces and one without. + * + * @param titleText + * The title string of the page. + * @throws WikiTitleParsingException + * Thrown if errors occurred during sanitation of the {@code titleText}. + */ + public Title(String titleText) throws WikiTitleParsingException + { + if (titleText.length() == 0) { + throw new WikiTitleParsingException("Title is empty."); + } + + /* + * Do not convert first character to upper case. 
We perform case insensitive querying + */ + if (titleText.substring(0, 1).toLowerCase().equals(titleText.substring(0, 1))) { + this.rawTitleText = titleText.substring(0, 1).toUpperCase() + + titleText.substring(1, titleText.length()); + } + else { + this.rawTitleText = titleText; + } + + // "Car_(automobile)#Introduction" + // should be split into: + // - "Car" + // - "automobile" + // - "Introduction" + + String titlePart; + String sectionPart = null; + if (rawTitleText.contains("#")) { + titlePart = rawTitleText.substring(0, rawTitleText.lastIndexOf("#")); + sectionPart = rawTitleText.substring(rawTitleText.lastIndexOf("#") + 1, + rawTitleText.length()); + } + else { + titlePart = rawTitleText; + } + + this.sectionText = sectionPart; + + Matcher matcherNamespace = PATTERN_NAMESPACE.matcher(this.decodeTitleWikistyle(titlePart)); + + // group 0 is the whole match + if (matcherNamespace.find()) { + this.entity = matcherNamespace.group(1); + this.disambiguationText = matcherNamespace.group(2); + + String relevantTitleParts = this.entity + " (" + this.disambiguationText + ")"; + this.plainTitle = decodeTitleWikistyle(relevantTitleParts); + this.wikiStyleTitle = encodeTitleWikistyle(relevantTitleParts); + } + else { + this.plainTitle = decodeTitleWikistyle(titlePart); + this.wikiStyleTitle = encodeTitleWikistyle(titlePart); + this.entity = this.plainTitle; + this.disambiguationText = null; + } + + if (StringUtils.isEmpty(getEntity())) { + throw new WikiTitleParsingException("Title was not properly initialized."); + } } - /* - * Do not convert first character to upper case. We perform case insensitive querying + /** + * Encodes a plain title string to wiki-style. + *

+ * Page titles in Wikipedia are encoded in a way that URLs containing the title are valid. Title + * strings entered by users normally do not conform to this wiki-style encoding. + * + * @param pTitle + * The string to encode. Must not be {@code null}. + * @return The wiki-style encoded string. */ - if (titleText.substring(0, 1).toLowerCase().equals(titleText.substring(0, 1))) { - this.rawTitleText = titleText.substring(0, 1).toUpperCase() + titleText.substring(1, titleText.length()); - } else { - this.rawTitleText = titleText; + private String encodeTitleWikistyle(String pTitle) + { + return pTitle.replace(' ', '_'); } - // "Car_(automobile)#Introduction" - // should be split into: - // - "Car" - // - "automobile" - // - "Introduction" - - String titlePart; - String sectionPart = null; - if (rawTitleText.contains("#")) { - titlePart = rawTitleText.substring(0, rawTitleText.lastIndexOf("#")); - sectionPart = rawTitleText.substring(rawTitleText.lastIndexOf("#") + 1, rawTitleText.length()); - } else { - titlePart = rawTitleText; + /** + * Decodes a wiki-style title string to plain text. + *

+ * Page titles in Wikipedia are encoded in a way that URLs containing the title are valid. Title + * strings entered by users normally do not conform to this wiki-style encoding. + * + * @param pTitle + * The string to decode. Must not be {@code null}. + * @return The decoded string. + */ + private String decodeTitleWikistyle(String pTitle) + { + return pTitle.replace('_', ' '); + } + + /** + * @return The disambiguation text of a page title (i.e., the part in parentheses following the + * page's name). + */ + public String getDisambiguationText() + { + return disambiguationText; + } + + /** + * @return The name of the entity (i.e. the page's title *without* disambiguation string). + */ + public String getEntity() + { + return entity; + } + + /** + * @return The plain title, without wikistyle underscores replacing spaces. + */ + public String getPlainTitle() + { + return plainTitle; + } + + /** + * @return Returns the section part of a link "Article (Disambiguation)#Section". + */ + public String getSectionText() + { + return sectionText; + } + + /** + * @return The wikistyle title, with spaces replaced by underscores. + */ + public String getWikiStyleTitle() + { + return wikiStyleTitle; } - this.sectionText = sectionPart; - - Matcher matcherNamespace = PATTERN_NAMESPACE.matcher( - this.decodeTitleWikistyle(titlePart) - ); - - // group 0 is the whole match - if (matcherNamespace.find()) { - this.entity = matcherNamespace.group(1); - this.disambiguationText = matcherNamespace.group(2); - - String relevantTitleParts = this.entity + " (" + this.disambiguationText + ")"; - this.plainTitle = decodeTitleWikistyle(relevantTitleParts); - this.wikiStyleTitle = encodeTitleWikistyle(relevantTitleParts); - } else { - this.plainTitle = decodeTitleWikistyle(titlePart); - this.wikiStyleTitle = encodeTitleWikistyle(titlePart); - this.entity = this.plainTitle; - this.disambiguationText = null; + protected String getRawTitleText() + { + return rawTitleText; } - if (StringUtils.isEmpty(getEntity())) { - throw new WikiTitleParsingException("Title was not properly initialized."); + @Override + public String toString() + { + return getPlainTitle(); } - } - - /** - * Encodes a plain title string to wiki-style. - *

- * Page titles in Wikipedia are encoded in a way that URLs containing the title are valid. - * Title strings entered by users normally do not conform to this wiki-style encoding. - * - * @param pTitle The string to encode. Must not be {@code null}. - * @return The wiki-style encoded string. - */ - private String encodeTitleWikistyle(String pTitle) { - return pTitle.replace(' ', '_'); - } - - /** - * Decodes a wiki-style title string to plain text. - *

- * Page titles in Wikipedia are encoded in a way that URLs containing the title are valid. - * Title strings entered by users normally do not conform to this wiki-style encoding. - * - * @param pTitle The string to decode. Must not be {@code null}. - * @return The decoded string. - */ - private String decodeTitleWikistyle(String pTitle) { - return pTitle.replace('_', ' '); - } - - /** - * @return The disambiguation text of a page title (i.e., the part in parentheses following the page's name). - */ - public String getDisambiguationText() { - return disambiguationText; - } - - /** - * @return The name of the entity (i.e. the page's title *without* disambiguation string). - */ - public String getEntity() { - return entity; - } - - /** - * @return The plain title, without wikistyle underscores replacing spaces. - */ - public String getPlainTitle() { - return plainTitle; - } - - /** - * @return Returns the section part of a link "Article (Disambiguation)#Section". - */ - public String getSectionText() { - return sectionText; - } - - /** - * @return The wikistyle title, with spaces replaced by underscores. - */ - public String getWikiStyleTitle() { - return wikiStyleTitle; - } - - protected String getRawTitleText() { - return rawTitleText; - } - - @Override - public String toString() { - return getPlainTitle(); - } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterable.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterable.java index 3a2ecf22..dc50df60 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterable.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterable.java @@ -22,32 +22,33 @@ /** * An {@link Iterable} over {@link Title} objects. */ -public class TitleIterable implements Iterable { - - private final Wikipedia wiki; - - /* - * The size of the title buffer. - * With bufferSize = 1, a database connection is needed for retrieving a single title. - * Higher bufferSize gives better performance, but needs memory. - * Initialize it with 5000. - */ - private int bufferSize = 5000; - - public TitleIterable(Wikipedia wiki) { - this.wiki = wiki; - } - - public TitleIterable(Wikipedia wiki, int bufferSize) { - this.wiki = wiki; - this.bufferSize = bufferSize; - } - - @Override - public Iterator<Title> iterator() { - return new TitleIterator(wiki, bufferSize); - } +public class TitleIterable + implements Iterable<Title> +{ + + private final Wikipedia wiki; + + /* + * The size of the title buffer. With bufferSize = 1, a database connection is needed for + * retrieving a single title. Higher bufferSize gives better performance, but needs memory. + * Initialize it with 5000. + */ + private int bufferSize = 5000; + + public TitleIterable(Wikipedia wiki) + { + this.wiki = wiki; + } + + public TitleIterable(Wikipedia wiki, int bufferSize) + { + this.wiki = wiki; + this.bufferSize = bufferSize; + } + + @Override + public Iterator<Title> iterator() + { + return new TitleIterator(wiki, bufferSize); + } } - - - diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterator.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterator.java index 0a8a96a6..3ed4a87e 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterator.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/TitleIterator.java @@ -27,121 +27,136 @@ /** * An {@link Iterator} over {@link Title} objects. 
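 *
 * Instances are usually obtained via {@link TitleIterable}. A minimal sketch, assuming an
 * initialized {@code Wikipedia} instance {@code wiki}:
 * <pre>{@code
 * for (Title title : new TitleIterable(wiki)) {
 *     System.out.println(title.getPlainTitle());
 * }
 * // new TitleIterable(wiki, 10000) trades memory for fewer database round trips
 * }</pre>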
*/ -public class TitleIterator implements Iterator<Title> { - - private final TitleBuffer buffer; - - public TitleIterator(Wikipedia wiki, int bufferSize) { - buffer = new TitleBuffer(bufferSize, wiki); - } - - @Override - public boolean hasNext() { - return buffer.hasNext(); - } - - @Override - public Title next() { - return buffer.next(); - } - - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - - /** - * Buffers titles in a list. - */ - static class TitleBuffer { - - private final Wikipedia wiki; - - private final List<String> titleStringBuffer; - private final int maxBufferSize; // the number of pages to be buffered after a query to the database. - private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements - private int bufferOffset; // the offset in the buffer - private int dataOffset; // the overall offset in the data - - public TitleBuffer(int bufferSize, Wikipedia wiki) { - this.maxBufferSize = bufferSize; - this.wiki = wiki; - this.titleStringBuffer = new ArrayList<>(); - this.bufferFillSize = 0; - this.bufferOffset = 0; - this.dataOffset = 0; - } +public class TitleIterator + implements Iterator<Title> +{ - /** - * If there are elements in the buffer left, then return true. - * If the end of the filled buffer is reached, then try to load new buffer. - * - * @return True, if there are pages left. False otherwise. - */ - public boolean hasNext() { - if (bufferOffset < bufferFillSize) { - return true; - } else { - return this.fillBuffer(); - } + private final TitleBuffer buffer; + + public TitleIterator(Wikipedia wiki, int bufferSize) + { + buffer = new TitleBuffer(bufferSize, wiki); } - /** - * @return The next Title or null if no more categories are available. - */ - public Title next() { - // if there are still elements in the buffer, just retrieve the next one - if (bufferOffset < bufferFillSize) { - return this.getBufferElement(); - } - // if there are no more elements => try to fill a new buffer - else if (this.fillBuffer()) { - return this.getBufferElement(); - } else { - // if it cannot be filled => return null - return null; - } + @Override + public boolean hasNext() + { + return buffer.hasNext(); } - private Title getBufferElement() { - String titleString = titleStringBuffer.get(bufferOffset); - Title title = null; - try { - title = new Title(titleString); - } catch (WikiTitleParsingException e) { - e.printStackTrace(); - } - bufferOffset++; - dataOffset++; - return title; + @Override + public Title next() + { + return buffer.next(); } - private boolean fillBuffer() { - - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - final String sql = "select p.name from PageMapLine as p"; - List<String> returnList = session.createNativeQuery(sql, String.class) - .setFirstResult(dataOffset) - .setMaxResults(maxBufferSize) - .setFetchSize(maxBufferSize) - .list(); - session.getTransaction().commit(); - - // clear the old buffer and all variables regarding the state of the buffer - titleStringBuffer.clear(); - bufferOffset = 0; - bufferFillSize = 0; - - titleStringBuffer.addAll(returnList); - - if (titleStringBuffer.size() > 0) { - bufferFillSize = titleStringBuffer.size(); - return true; - } else { - return false; - } + @Override + public void remove() + { + throw new UnsupportedOperationException(); } - } + /** + * Buffers titles in a list. 
+ */ + static class TitleBuffer + { + + private final Wikipedia wiki; + + private final List<String> titleStringBuffer; + private final int maxBufferSize; // the number of pages to be buffered after a query to the + // database. + private int bufferFillSize; // even a 500 slot buffer can be filled with only 5 elements + private int bufferOffset; // the offset in the buffer + private int dataOffset; // the overall offset in the data + + public TitleBuffer(int bufferSize, Wikipedia wiki) + { + this.maxBufferSize = bufferSize; + this.wiki = wiki; + this.titleStringBuffer = new ArrayList<>(); + this.bufferFillSize = 0; + this.bufferOffset = 0; + this.dataOffset = 0; + } + + /** + * If there are elements in the buffer left, then return true. If the end of the filled + * buffer is reached, then try to load new buffer. + * + * @return True, if there are pages left. False otherwise. + */ + public boolean hasNext() + { + if (bufferOffset < bufferFillSize) { + return true; + } + else { + return this.fillBuffer(); + } + } + + /** + * @return The next Title or null if no more categories are available. + */ + public Title next() + { + // if there are still elements in the buffer, just retrieve the next one + if (bufferOffset < bufferFillSize) { + return this.getBufferElement(); + } + // if there are no more elements => try to fill a new buffer + else if (this.fillBuffer()) { + return this.getBufferElement(); + } + else { + // if it cannot be filled => return null + return null; + } + } + + private Title getBufferElement() + { + String titleString = titleStringBuffer.get(bufferOffset); + Title title = null; + try { + title = new Title(titleString); + } + catch (WikiTitleParsingException e) { + e.printStackTrace(); + } + bufferOffset++; + dataOffset++; + return title; + } + + private boolean fillBuffer() + { + + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + final String sql = "select p.name from PageMapLine as p"; + List<String> returnList = session.createNativeQuery(sql, String.class) + .setFirstResult(dataOffset).setMaxResults(maxBufferSize) + .setFetchSize(maxBufferSize).list(); + session.getTransaction().commit(); + + // clear the old buffer and all variables regarding the state of the buffer + titleStringBuffer.clear(); + bufferOffset = 0; + bufferFillSize = 0; + + titleStringBuffer.addAll(returnList); + + if (titleStringBuffer.size() > 0) { + bufferFillSize = titleStringBuffer.size(); + return true; + } + else { + return false; + } + } + + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikiConstants.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikiConstants.java index b6ef2f57..b343772f 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikiConstants.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikiConstants.java @@ -29,311 +29,88 @@ import com.neovisionaries.i18n.LanguageCode; -public interface WikiConstants { - /** - * Shortcut for System.getProperty("line.separator"). - */ - String LF = System.getProperty("line.separator"); - - /** - * The prefix that is added to page titles of discussion pages - * Has to be the same as in wikipedia.datamachine:SingleDumpVersionJDKGeneric - */ - String DISCUSSION_PREFIX = "Discussion:"; +public interface WikiConstants +{ + /** + * Shortcut for System.getProperty("line.separator"). 
+ */ + String LF = System.getProperty("line.separator"); - /** - * Configuration file for the Sweble parser - */ - String SWEBLE_CONFIG = "classpath:/org/sweble/wikitext/engine/SimpleWikiConfiguration.xml"; + /** + * The prefix that is added to page titles of discussion pages Has to be the same as in + * wikipedia.datamachine:SingleDumpVersionJDKGeneric + */ + String DISCUSSION_PREFIX = "Discussion:"; - /** - * Enumerates the languages for which Wikipedia APIs are available. - * A Wikipedia object can be created using one of these languages. - */ - // Languages should be lowercase and match the corresponding snowball stemmer names. - enum Language { - abkhazian, - afar, - afrikaans, - akan, - albanian, - alemannic, - amharic, - anglo_saxon, - arabic, - aragonese, - armenian, - aromanian, - assamese, - assyrian_neo_aramaic, - asturian, - avar, - aymara, - azeri, - bambara, - banyumasan, - bashkir, - basque, - bavarian, - belarusian, - belarusian_tarashkevitsa, - bengali, - bihari, - bishnupriya_manipuri, - bislama, - bosnian, - breton, - buginese, - bulgarian, - burmese, - buryat_russia, - cantonese, - catalan, - cebuano, - central_bicolano, - chamorro, - chechen, - cherokee, - cheyenne, - chichewa, - chinese, - choctaw, - chuvash, - classical_chinese, - cornish, - corsican, - cree, - crimean_tatar, - croatian, - czech, - danish, - divehi, - dutch, - dutch_low_saxon, - dzongkha, - emilian_romagnol, - english, - esperanto, - estonian, - ewe, - faroese, - fijian, - finnish, - franco_provencal_arpitan, - french, - friulian, - fula, - galician, - georgian, - german, - gilaki, - gothic, - greek, - greenlandic, - guarani, - gujarati, - haitian, - hakka, - hausa, - hawaiian, - hebrew, - herero, - hindi, - hiri_motu, - hungarian, - icelandic, - ido, - igbo, - ilokano, - indonesian, - interlingua, - interlingue, - inuktitut, - inupiak, - irish, - italian, - japanese, - javanese, - kabyle, - kalmyk, - kannada, - kanuri, - kapampangan, - kashmiri, - kashubian, - kazakh, - khmer, - kikuyu, - kinyarwanda, - kirghiz, - kirundi, - klingon, - komi, - kongo, - korean, - kuanyama, - kurdish, - ladino, - lak, - lao, - latin, - latvian, - ligurian, - limburgian, - lingala, - lithuanian, - lojban, - lombard, - low_saxon, - lower_sorbian, - luganda, - luxembourgish, - macedonian, - malagasy, - malay, - malayalam, - maltese, - manx, - maori, - marathi, - marshallese, - mazandarani, - min_dong, - min_nan, - moldovan, - mongolian, - muscogee, - nahuatl, - nauruan, - navajo, - ndonga, - neapolitan, - nepali, - newar_nepal_bhasa, - norfolk, - norman, - northern_sami, - norwegian_bokmal, - norwegian_nynorsk, - novial, - occitan, - old_church_slavonic, - oriya, - oromo, - ossetian, - pali, - pangasinan, - papiamentu, - pashto, - pennsylvania_german, - persian, - piedmontese, - polish, - portuguese, - punjabi, - quechua, - ripuarian, - romani, - romanian, - romansh, - russian, - samoan, - samogitian, - sango, - sanskrit, - sardinian, - saterland_frisian, - scots, - scottish_gaelic, - serbian, - serbo_croatian, - sesotho, - shona, - sichuan_yi, - sicilian, - simple_english, - sindhi, - sinhalese, - slovak, - slovenian, - somali, - spanish, - sundanese, - swahili, - swati, - swedish, - tagalog, - tahitian, - tajik, - tamil, - tarantino, - tatar, - telugu, - tetum, - thai, - tibetan, - tigrinya, - tok_pisin, - tokipona, - tongan, - tsonga, - tswana, - tumbuka, - turkish, - turkmen, - twi, - udmurt, - ukrainian, - upper_sorbian, - urdu, - uyghur, - uzbek, - venda, - venetian, - vietnamese, - volapuek, - voro, - walloon, - 
waray_waray, - welsh, - west_flemish, - west_frisian, - wolof, - wu, - xhosa, - yiddish, - yoruba, - zamboanga_chavacano, - zazaki, - zealandic, - zhuang, - zulu, - _test; + /** + * Configuration file for the Sweble parser + */ + String SWEBLE_CONFIG = "classpath:/org/sweble/wikitext/engine/SimpleWikiConfiguration.xml"; /** - * Configures a language specific configuration for parsing wikipedia pages. - * - * @return WikiConfig + * Enumerates the languages for which Wikipedia APIs are available. A Wikipedia object can be + * created using one of these languages. */ - public WikiConfig getWikiconfig(Language this) { - WikiConfig config = DefaultConfigEnWp.generate(); - if (this != Language._test) { - // We need to capitalize the language name otherwise the locale lib cannot find it. - String langName = this.name().substring(0, 1).toUpperCase() + this.name().substring(1); - try { - List<LanguageCode> langCodes = LanguageCode.findByName(langName); - if (!langCodes.isEmpty()) { - String langCode = langCodes.get(0).name(); - return LanguageConfigGenerator.generateWikiConfig(langCode); - } - } catch (IOException | ParserConfigurationException | SAXException e) { - System.out.println( - String.format("Failed to create WikiConfig for language for %s, using default instead", - langName) - ); + // Languages should be lowercase and match the corresponding snowball stemmer names. + enum Language + { + abkhazian, afar, afrikaans, akan, albanian, alemannic, amharic, anglo_saxon, arabic, + aragonese, armenian, aromanian, assamese, assyrian_neo_aramaic, asturian, avar, aymara, + azeri, bambara, banyumasan, bashkir, basque, bavarian, belarusian, belarusian_tarashkevitsa, + bengali, bihari, bishnupriya_manipuri, bislama, bosnian, breton, buginese, bulgarian, + burmese, buryat_russia, cantonese, catalan, cebuano, central_bicolano, chamorro, chechen, + cherokee, cheyenne, chichewa, chinese, choctaw, chuvash, classical_chinese, cornish, + corsican, cree, crimean_tatar, croatian, czech, danish, divehi, dutch, dutch_low_saxon, + dzongkha, emilian_romagnol, english, esperanto, estonian, ewe, faroese, fijian, finnish, + franco_provencal_arpitan, french, friulian, fula, galician, georgian, german, gilaki, + gothic, greek, greenlandic, guarani, gujarati, haitian, hakka, hausa, hawaiian, hebrew, + herero, hindi, hiri_motu, hungarian, icelandic, ido, igbo, ilokano, indonesian, interlingua, + interlingue, inuktitut, inupiak, irish, italian, japanese, javanese, kabyle, kalmyk, + kannada, kanuri, kapampangan, kashmiri, kashubian, kazakh, khmer, kikuyu, kinyarwanda, + kirghiz, kirundi, klingon, komi, kongo, korean, kuanyama, kurdish, ladino, lak, lao, latin, + latvian, ligurian, limburgian, lingala, lithuanian, lojban, lombard, low_saxon, + lower_sorbian, luganda, luxembourgish, macedonian, malagasy, malay, malayalam, maltese, + manx, maori, marathi, marshallese, mazandarani, min_dong, min_nan, moldovan, mongolian, + muscogee, nahuatl, nauruan, navajo, ndonga, neapolitan, nepali, newar_nepal_bhasa, norfolk, + norman, northern_sami, norwegian_bokmal, norwegian_nynorsk, novial, occitan, + old_church_slavonic, oriya, oromo, ossetian, pali, pangasinan, papiamentu, pashto, + pennsylvania_german, persian, piedmontese, polish, portuguese, punjabi, quechua, ripuarian, + romani, romanian, romansh, russian, samoan, samogitian, sango, sanskrit, sardinian, + saterland_frisian, scots, scottish_gaelic, serbian, serbo_croatian, sesotho, shona, + sichuan_yi, sicilian, simple_english, sindhi, sinhalese, slovak, slovenian, somali, spanish, 
+ sundanese, swahili, swati, swedish, tagalog, tahitian, tajik, tamil, tarantino, tatar, + telugu, tetum, thai, tibetan, tigrinya, tok_pisin, tokipona, tongan, tsonga, tswana, + tumbuka, turkish, turkmen, twi, udmurt, ukrainian, upper_sorbian, urdu, uyghur, uzbek, + venda, venetian, vietnamese, volapuek, voro, walloon, waray_waray, welsh, west_flemish, + west_frisian, wolof, wu, xhosa, yiddish, yoruba, zamboanga_chavacano, zazaki, zealandic, + zhuang, zulu, _test; + + /** + * Configures a language specific configuration for parsing wikipedia pages. + * + * @return WikiConfig + */ + public WikiConfig getWikiconfig(Language this) + { + WikiConfig config = DefaultConfigEnWp.generate(); + if (this != Language._test) { + // We need to capitalize the language name otherwise the locale lib cannot find it. + String langName = this.name().substring(0, 1).toUpperCase() + + this.name().substring(1); + try { + List<LanguageCode> langCodes = LanguageCode.findByName(langName); + if (!langCodes.isEmpty()) { + String langCode = langCodes.get(0).name(); + return LanguageConfigGenerator.generateWikiConfig(langCode); + } + } + catch (IOException | ParserConfigurationException | SAXException e) { + System.out.println(String.format( + "Failed to create WikiConfig for language for %s, using default instead", + langName)); + } + } + return config; } - } - return config; } - } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java index 7521d50e..60adc2ee 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/Wikipedia.java @@ -45,771 +45,888 @@ * Provides access to Wikipedia articles and categories. */ // TODO better JavaDocs! -public class Wikipedia implements WikiConstants { - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - // Note well: The whitespace at the beginning of this constant is here on purpose. Do NOT remove it! - static final String SQL_COLLATION = " COLLATE utf8mb4_bin"; /*" COLLATE utf8_bin";*/ - - private final Language language; - private final DatabaseConfiguration dbConfig; - - /* - * A mapping from page pageIDs to hibernateIDs. - * It is a kind of cache. It is only filled, if a pageID was previously accessed. - * The wikiapi startup time is way too long otherwise. - */ - private final Map<Integer, Long> idMapPages; - - /* - * A mapping from categories pageIDs to hibernateIDs. - * It is a kind of cache. It is only filled, if a pageID was previously accessed. - * The wikiapi startup time is way too long otherwise. - */ - private final Map<Integer, Long> idMapCategories; - - private final MetaData metaData; - - // Note: This should only be accessed internally. - private final WikiConfig wikiConfig; - - /** - * Creates a new {@link Wikipedia} object accessing the database indicated by the dbConfig parameter. - * - * @param dbConfig A {@link DatabaseConfiguration} object telling the {@link Wikipedia} object - * where the data is stored and how it can be accessed. - * @throws WikiInitializationException Thrown if errors occurred while bootstrapping the {@link Wikipedia} instance. 
- */ - public Wikipedia(DatabaseConfiguration dbConfig) throws WikiInitializationException { - - logger.trace("Creating Wikipedia object."); - - this.language = dbConfig.getLanguage(); - this.dbConfig = dbConfig; - - this.idMapPages = new HashMap<>(); - this.idMapCategories = new HashMap<>(); - - this.metaData = new MetaData(this); - this.wikiConfig = this.language.getWikiconfig(); - - if(dbConfig.supportsCollation()) { - logger.info("Wikipedia database backend supports character collation features."); - } else { - logger.debug("Wikipedia database backend does NOT support character collation features."); - } - } - - WikiConfig getWikConfig() { - return wikiConfig; - } - - /** - * Gets the page with the given title. - * If the title is a redirect, the corresponding page is returned.<br> - * If the title start with a lowercase letter it converts it to an uppercase letter, as each Wikipedia article title starts with an uppercase letter. - * Spaces in the title are converted to underscores, as this is a convention for Wikipedia article titles. - * <p> - * For example, the article "Steam boat" could be queried with - * - "Steam boat" - * - "steam boat" - * - "Steam_boat" - * - "steam_boat" - * and additionally all redirects that might point to that article. - * - * @param title The title of the page. - * @return The page object for a given title. - * @throws WikiApiException If no page or redirect with this title exists or the title could not be properly parsed. - */ - public Page getPage(String title) throws WikiApiException { - return new Page(this, title, false); - } - - /** - * Gets the page with exactly the given title.<br> - * - * Note that when using this method you are responsible for converting a normal search string into the right wiki-style.<br> - * - * If the title is a redirect, the corresponding page is returned.<br> - * - * @param exactTitle The exact title of the page. - * @return The page object for a given title. - * @throws WikiApiException If no page or redirect with this title exists or the title could not be properly parsed. - */ - public Page getPageByExactTitle(String exactTitle) throws WikiApiException { - return new Page(this, exactTitle, true); - } - - /** - * Get all pages which match all lowercase/uppercase version of the given title.<br> - * If the title is a redirect, the corresponding page is returned.<br> - * Spaces in the title are converted to underscores, as this is a convention for Wikipedia article titles. - * - * @param title The title of the page. - * @return A set of page objects matching this title. - * @throws WikiApiException If no page or redirect with this title exists or the title could not be properly parsed. - */ - public Set<Page> getPages(String title) throws WikiApiException { - Set<Integer> ids = new HashSet<>(getPageIdsCaseInsensitive(title)); - - Set<Page> pages = new HashSet<>(); - for (Integer id : ids) { - pages.add(new Page(this, id)); - } - return pages; - } - - /** - * Gets the page for a given pageId. - * - * @param pageId The id of the page. - * @return The page object for a given pageId. - * @throws WikiApiException Thrown if errors occurred. - */ - public Page getPage(int pageId) throws WikiApiException { - return new Page(this, pageId); - } - - /** - * Gets the title for a given pageId. - * - * @param pageId The id of the page. - * @return The title for the given pageId. - * @throws WikiApiException Thrown if errors occurred. 
- */ - public Title getTitle(int pageId) throws WikiApiException { - Session session = this.__getHibernateSession(); - session.beginTransaction(); - String sql = "select p.name from PageMapLine as p where p.pageId= :pId"; - String returnValue = session.createNativeQuery(sql, String.class) - .setParameter("pId", pageId, StandardBasicTypes.INTEGER) - .uniqueResult(); - session.getTransaction().commit(); - - if(returnValue == null){ - throw new WikiPageNotFoundException(); - } - return new Title(returnValue); - } - - /** - * Gets the page ids for a given title. - * - * @param title The title of the page. - * @return The id for the page with the given title. - * @throws WikiApiException Thrown if errors occurred. - */ - public List<Integer> getPageIds(String title) throws WikiApiException { - Session session = this.__getHibernateSession(); - session.beginTransaction(); - String sql = "select p.pageID from PageMapLine as p where p.name = :pName"; - Iterator<Integer> results = session.createQuery(sql, Integer.class) - .setParameter("pName", title, StandardBasicTypes.STRING) - .list().iterator(); - - session.getTransaction().commit(); - - if(!results.hasNext()){ - throw new WikiPageNotFoundException(); - } - List<Integer> resultList = new LinkedList<>(); - while(results.hasNext()){ - resultList.add(results.next()); - } - return resultList; - } - - /** - * Gets the page ids for a given title with case insensitive matching.<br> - * - * @param title The title of the page. - * @return The ids of the pages with the given title. - * @throws WikiApiException Thrown if errors occurred. - */ - public List<Integer> getPageIdsCaseInsensitive(String title) throws WikiApiException { - title = title.toLowerCase(); - title = title.replaceAll(" ", "_"); - - Session session = this.__getHibernateSession(); - session.beginTransaction(); - String sql = "select p.pageID from PageMapLine as p where lower(p.name) = :pName"; - Iterator<Integer> results = session.createQuery(sql, Integer.class) - .setParameter("pName", title, StandardBasicTypes.STRING) - .list().iterator(); - - session.getTransaction().commit(); - - if(!results.hasNext()){ - throw new WikiPageNotFoundException(); - } - List<Integer> resultList = new LinkedList<>(); - while(results.hasNext()){ - resultList.add(results.next()); - } - return resultList; - } - - /** - * Returns the article page for a given discussion page. - * - * @param discussionPage - * the discussion page object - * @return The page object of the article associated with the discussion. If - * the parameter already was an article, it is returned directly. - * @throws WikiApiException Thrown if errors occurred. - */ - public Page getArticleForDiscussionPage(Page discussionPage) throws WikiApiException { - if(discussionPage.isDiscussion()){ - String title = discussionPage.getTitle().getPlainTitle().replaceAll(WikiConstants.DISCUSSION_PREFIX, ""); - - if(title.contains("/")){ - //If we have a discussion archive - //TODO This does not support articles that contain slashes- - //However, the rest of the API cannot cope with that as well, so this should not be any extra trouble - title = title.split("/")[0]; - } - return getPage(title); - }else{ - return discussionPage; - } - - } - - /** - * Gets the discussion page for an article page with the given pageId. - * - * @param articlePageId The id of the page. - * @return The page object for a given pageId. - * @throws WikiApiException Thrown if errors occurred. 
- */ - public Page getDiscussionPage(int articlePageId) throws WikiApiException { - // Retrieve discussion page with article title - //TODO not the prettiest solution, but currently discussions are only marked in the title - return getDiscussionPage(getPage(articlePageId)); - } - - /** - * Gets the discussion page for the page with the given title. - * The page retrieval works as defined in {@link #getPage(String title)} - * - * @param title The title of the page for which the discussions should be retrieved. - * @return The page object for the discussion page. - * @throws WikiApiException If no page or redirect with this title exists or title could not be properly parsed. - */ - public Page getDiscussionPage(String title) throws WikiApiException { - return getDiscussionPage(getPage(title)); - } - - /** - * Gets the discussion page for the given article page - * The provided page must not be a discussion page - * - * @param articlePage the article page for which a discussion page should be retrieved - * @return The discussion page object for the given article page object - * @throws WikiApiException If no page or redirect with this title exists or title could not be properly parsed. - */ - public Page getDiscussionPage(Page articlePage) throws WikiApiException{ - String articleTitle = articlePage.getTitle().toString(); - if(articleTitle.startsWith(WikiConstants.DISCUSSION_PREFIX)){ - return articlePage; - }else{ - return new Page(this, WikiConstants.DISCUSSION_PREFIX+articleTitle); - } - } - - - /** - * Returns an iterable containing all archived discussion pages for - * the page with the given title String. <br> - * The page retrieval works as defined in {@link #getPage(int)}. <br> - * The most recent discussion page is NOT included here! - * It can be obtained with {@link #getDiscussionPage(Page)}. - * - * @param articlePageId The id of the page for which to the the discussion archives - * @return The page object for the discussion page. - * @throws WikiApiException If no page or redirect with this title exists or title could not be properly parsed. - */ - public Iterable<Page> getDiscussionArchives(int articlePageId) throws WikiApiException { - //Retrieve discussion archive pages with page id - return getDiscussionArchives(getPage(articlePageId)); - } - - /** - * Returns an iterable containing all archived discussion pages for - * the page with the given title String. <br> - * The page retrieval works as defined in {@link #getPage(String title)}.<br> - * The most recent discussion page is NOT included here! - * It can be obtained with {@link #getDiscussionPage(Page)}. - * - * @param title The title of the page for which the discussions should be retrieved. - * @return The page object for the discussion page. - * @throws WikiApiException If no page or redirect with this title exists or title could not be properly parsed. - * @deprecated Use {@link #getDiscussionArchives(int)} or {@link #getDiscussionArchives(Page)} instead. - */ - @Deprecated(since="2.0.0", forRemoval=true) - public Iterable<Page> getDiscussionArchives(String title) throws WikiApiException { - //Retrieve discussion archive pages with page title - return getDiscussionArchives(getPage(title)); - } - - /** - * Return an iterable containing all archived discussion pages for - * the given article page. The most recent discussion page is not included. - * The most recent discussion page can be obtained with {@link #getDiscussionPage(Page)}. - * <br> - * The provided page Object must not be a discussion page itself! 
If it is - * a discussion page, is returned unchanged. - * - * @param articlePage the article page for which a discussion archives should be retrieved - * @return An iterable with the discussion archive page objects for the given article page object - * @throws WikiApiException If no page or redirect with this title exists or title could not be properly parsed. - */ - public Iterable<Page> getDiscussionArchives(Page articlePage) throws WikiApiException { - String articleTitle = articlePage.getTitle().getWikiStyleTitle(); - if(!articleTitle.startsWith(WikiConstants.DISCUSSION_PREFIX)){ - articleTitle=WikiConstants.DISCUSSION_PREFIX+articleTitle; - } - - Session session = this.__getHibernateSession(); - session.beginTransaction(); - - List<Page> discussionArchives = new LinkedList<>(); - - String sql = "SELECT pageID FROM PageMapLine where name like :name"; - Iterator<Integer> results = session.createQuery(sql, Integer.class) - .setParameter("name", articleTitle+"/%", StandardBasicTypes.STRING) - .list().iterator(); - - session.getTransaction().commit(); - - while (results.hasNext()) { - int pageID = results.next(); - discussionArchives.add(getPage(pageID)); - } - return discussionArchives; - } - - /** - * Gets the pages or redirects with a name similar to the pattern. - * Calling this method is quite costly, as similarity is computed for all names. - * @param pPattern The pattern. - * @param pSize The maximum size of the result list. Only the most similar results will be included. - * @return A map of pages with names similar to the pattern and their distance values. Smaller distances are more similar. - * @throws WikiApiException Thrown if errors occurred. - */ - //// I do not want to make this public at the moment (TZ, March, 2007) - protected Map<Page, Double> getSimilarPages(String pPattern, int pSize) throws WikiApiException { - Title title = new Title(pPattern); - String pattern = title.getWikiStyleTitle(); - - // a mapping of the most similar pages and their similarity values - // It is returned by this method. 
- Map<Page, Double> pageMap = new HashMap<>(); - - // holds a mapping of the best distance values to page IDs - Map<Integer, Double> distanceMap = new HashMap<>(); - - Session session = this.__getHibernateSession(); - session.beginTransaction(); - for (Object o : session.createQuery("select pml.pageID, pml.name from PageMapLine as pml").list()) { - Object[] row = (Object[]) o; - int pageID = (Integer) row[0]; - String pageName = (String) row[1]; - - // this returns a similarity - if we want to use it, we have to change the semantics the ordering of the results - // double distance = new Levenshtein().getSimilarity(pageName, pPattern); - double distance = new LevenshteinStringDistance().distance(pageName, pattern); - - distanceMap.put(pageID, distance); - - // if there are more than "pSize" entries in the map remove the last one (it has the biggest distance) - if (distanceMap.size() > pSize) { - Set<Entry<Integer, Double>> valueSortedSet = new TreeSet<>(new ValueComparator()); - valueSortedSet.addAll(distanceMap.entrySet()); - Iterator<Entry<Integer, Double>> it = valueSortedSet.iterator(); - // remove the first element - if (it.hasNext()) { - // get the id of this entry and remove it in the distanceMap - distanceMap.remove(it.next().getKey()); +public class Wikipedia + implements WikiConstants +{ + + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + // Note well: The whitespace at the beginning of this constant is here on purpose. Do NOT remove + // it! + static final String SQL_COLLATION = " COLLATE utf8mb4_bin"; /* " COLLATE utf8_bin"; */ + + private final Language language; + private final DatabaseConfiguration dbConfig; + + /* + * A mapping from page pageIDs to hibernateIDs. It is a kind of cache. It is only filled, if a + * pageID was previously accessed. The wikiapi startup time is way too long otherwise. + */ + private final Map<Integer, Long> idMapPages; + + /* + * A mapping from categories pageIDs to hibernateIDs. It is a kind of cache. It is only filled, + * if a pageID was previously accessed. The wikiapi startup time is way too long otherwise. + */ + private final Map<Integer, Long> idMapCategories; + + private final MetaData metaData; + + // Note: This should only be accessed internally. + private final WikiConfig wikiConfig; + + /** + * Creates a new {@link Wikipedia} object accessing the database indicated by the dbConfig + * parameter. + * + * @param dbConfig + * A {@link DatabaseConfiguration} object telling the {@link Wikipedia} object where + * the data is stored and how it can be accessed. + * @throws WikiInitializationException + * Thrown if errors occurred while bootstrapping the {@link Wikipedia} instance. + */ + public Wikipedia(DatabaseConfiguration dbConfig) throws WikiInitializationException + { + + logger.trace("Creating Wikipedia object."); + + this.language = dbConfig.getLanguage(); + this.dbConfig = dbConfig; + + this.idMapPages = new HashMap<>(); + this.idMapCategories = new HashMap<>(); + + this.metaData = new MetaData(this); + this.wikiConfig = this.language.getWikiconfig(); + + if (dbConfig.supportsCollation()) { + logger.info("Wikipedia database backend supports character collation features."); } - } - } - session.getTransaction().commit(); - - for (int pageID : distanceMap.keySet()) { - Page page = null; - try { - page = this.getPage(pageID); - } catch (WikiPageNotFoundException e) { - logger.error("Page with pageID " + pageID + " could not be found. Fatal error. 
Terminating."); - e.printStackTrace(); - System.exit(1); - } - pageMap.put(page, distanceMap.get(pageID)); - } - - return pageMap; - } - - /** - * Gets the category for a given title. - * If the {@link Category} title start with a lowercase letter it converts it to an uppercase letter, - * as each Wikipedia category title starts with an uppercase letter. Spaces in the title are converted to - * underscores, as this is a convention for Wikipedia category titles. - * <p> - * For example, the (possible) category "Famous steamboats" could be queried with - * - "Famous steamboats" - * - "Famous_steamboats" - * - "famous steamboats" - * - "famous_steamboats" - * @param title The title of the category. - * @return The category object with the given title. - * @throws WikiApiException If no category with the given title exists. - */ - public Category getCategory(String title) throws WikiApiException { - return new Category(this, title); - } - - /** - * Gets the category for a given pageId. - * @param pageId The id of the {@link Category}. - * @return The category object or null if no category with this pageId exists. - */ - public Category getCategory(int pageId) { - long hibernateId = __getCategoryHibernateId(pageId); - if (hibernateId == -1) { - return null; - } - - try { - return new Category(this, hibernateId); - } catch (WikiPageNotFoundException e) { - return null; - } - } - - /** - * This returns an iterable over all {@link Category categories}, as returning all category objects would be much too expensive. - * @return An iterable over all categories. - */ - public Iterable<Category> getCategories() { - return new CategoryIterable(this); - } - - /** - * Gets the {@link Category categories} for a given {@link Page} identified by its {@code pageTitle}. - * @param pageTitle The title of a {@link Page}, not a category. - * @return The category objects which are associated with the given {@code pageTitle}. - * @throws WikiPageNotFoundException Thrown if no {@link Page} exists for the given {@code pageTitle}. - */ - public Set<Category> getCategories(String pageTitle) throws WikiPageNotFoundException { - if (pageTitle == null || pageTitle.length() == 0) { - throw new WikiPageNotFoundException(); - } - - Session session = this.__getHibernateSession(); - session.beginTransaction(); - String sql = "select c from Page p left join p.categories c where p.name = :pageTitle"; - List<Integer> categoryHibernateIds = session.createQuery(sql, Integer.class) - .setParameter("pageTitle", pageTitle).list(); - session.getTransaction().commit(); - - Set<Category> categorySet = new HashSet<>(categoryHibernateIds.size()); - for (int hibernateId : categoryHibernateIds) { - try { - categorySet.add(new Category(this, hibernateId)); - } catch (WikiPageNotFoundException e) { - logger.warn("Could not load Category by it's HibernateId = '"+hibernateId+"'"); - } - } - return categorySet; - } - - /** - * Get all wikipedia {@link Category categories}. - * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. - * @param bufferSize The size of the internal page buffer. - * @return An iterable over all categories. - */ - protected Iterable<Category> getCategories(int bufferSize) { - return new CategoryIterable(this, bufferSize); - } - - - /** - * Protected method that is much faster than the public version, but exposes too much implementation details. - * Get a set with all category pageIDs. Returning all category objects is much too expensive. 
- * @return A set with all category pageIDs - */ - // TODO this should be replaced with the buffered category iterator, as it might produce an HeapSpace Overflow, if there are too many categories. - protected Set<Integer> __getCategories() { - Session session = this.__getHibernateSession(); - session.beginTransaction(); - String sql = "select cat.pageId from Category as cat"; - List<Integer> idList = session.createQuery(sql, Integer.class).list(); - session.getTransaction().commit(); - - return new HashSet<>(idList); - } - - /** - * Get all wikipedia pages. - * Does not include redirects, as they are only pointers to real pages. - * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. - * @return An iterable over all pages. - */ - public Iterable<Page> getPages() { - return new PageIterable(this, false); - } - - /** - * Get all wikipedia pages. - * Does not include redirects, as they are only pointers to real pages. - * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. - * @param bufferSize The size of the internal page buffer. - * @return An iterable over all pages. - */ - protected Iterable<Page> getPages(int bufferSize) { - return new PageIterable(this, false, bufferSize); - } - - /** - * Protected method that is much faster than the public version, but exposes too much implementation details. - * Get a set with all {@code pageIDs}. Returning all page objects is much too expensive. - * Does not include redirects, as they are only pointers to real pages. - * <p> - * As ids can be useful for several application (e.g. in combination with - * the RevisionMachine, they have been made publicly available via - * {@link #getPageIds()}. - * - * @return A set with all {@code pageIDs}. Returning all pages is much to expensive. - */ - protected Set<Integer> __getPages() { - Session session = this.__getHibernateSession(); - session.beginTransaction(); - String sql = "select page.pageId from Page as page"; - List<Integer> idList = session.createQuery(sql, Integer.class).list(); - session.getTransaction().commit(); - - return new HashSet<>(idList); - } - - /** - * @return an iterable over all {@code pageIDs} (without redirects) - */ - public Iterable<Integer> getPageIds(){ - return this.__getPages(); - } - - /** - * Get the pages that match the given query. - * Does not include redirects, as they are only pointers to real pages. - * Attention: may be running very slow, depending on the size of the Wikipedia! - * @param query A query object containing the query conditions. - * @return A set of pages that match the given query. - * @throws WikiApiException Thrown if errors occurred. - */ - public Iterable<Page> getPages(PageQuery query) throws WikiApiException { - return new PageQueryIterable(this, query); - } - - - /** - * Get all articles (pages MINUS disambiguationPages MINUS redirects). - * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. - * @return An iterable of all article pages. - */ - public Iterable<Page> getArticles() { - return new PageIterable(this, true); - } - - /** - * Get all titles including disambiguation pages and redirects). - * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. - * @return An iterable of all article pages. - */ - public Iterable<Title> getTitles() { - return new TitleIterable(this); - } - - /** - * @return The {@link Language} of this Wikipedia. 
- */ - public Language getLanguage() { - return this.language; - } - - /** - * Tests, whether a page or redirect with the given title exists. - * Trying to retrieve a page that does not exist in Wikipedia throws an exception. - * You may catch the exception or use this test, depending on your task. - * @param title The title of the page. - * @return {@code True}, if a page or redirect with that title exits, {@code false} otherwise. - */ - public boolean existsPage(String title) { - - if (title == null || title.isEmpty()) { - return false; - } - Title t; - try { - t = new Title(title); - } catch (WikiTitleParsingException e) { - return false; - } - String encodedTitle = t.getWikiStyleTitle(); - - Session session = this.__getHibernateSession(); - session.beginTransaction(); - String query = "select p.id from PageMapLine as p where p.name = :pName"; - if(dbConfig.supportsCollation()) { - query += SQL_COLLATION; - } - Object returnValue = session.createNativeQuery(query) - .setParameter("pName", encodedTitle, StandardBasicTypes.STRING) - .uniqueResult(); - session.getTransaction().commit(); - - return returnValue != null; - } - - /** - * Tests, whether a page with the given pageID exists. - * Trying to retrieve a pageID that does not exist in Wikipedia throws an exception. - * - * @param pageID A pageID. - * @return {@code True}, if a page with that pageID exits, {@code false} otherwise. - */ - public boolean existsPage(int pageID) { - - // This is a hack to provide a much quicker way to test whether a page exists. - // Encoding the title in this way surpasses the normal way of creating a title first. - // Anyway, I do not like this hack :-| - if (pageID < 0) { - return false; - } - - Session session = this.__getHibernateSession(); - session.beginTransaction(); - String sql = "select p.id from PageMapLine as p where p.pageID = :pageId"; - Long returnValue = session.createNativeQuery(sql, Long.class) - .setParameter("pageId", pageID, StandardBasicTypes.INTEGER) - .uniqueResult(); - session.getTransaction().commit(); - - return returnValue != null; - } - - /** - * Get the hibernate ID to a given pageID of a page. - * We need different methods for pages and categories here, as a page and a category can have the same ID. - * - * @param pageID A pageID that should be mapped to the corresponding hibernate ID. - * @return The hibernateID of the page with pageID or -1, if the pageID is not valid - */ - protected long __getPageHibernateId(int pageID) { - long hibernateID = -1; - - // first look in the id mapping cache - if (idMapPages.containsKey(pageID)) { - return idMapPages.get(pageID); - } - - // The id was not found in the id mapping cache. - // It may not be in the cahe or may not exist at all. - Session session = this.__getHibernateSession(); - session.beginTransaction(); - String sql = "select page.id from Page as page where page.pageId = :pageId"; - Long retObjectPage = session.createQuery(sql, Long.class) - .setParameter("pageId", pageID, StandardBasicTypes.INTEGER) - .uniqueResult(); - session.getTransaction().commit(); - if (retObjectPage != null) { - hibernateID = retObjectPage; - // add it to the cache - idMapPages.put(pageID, hibernateID); - return hibernateID; - } - - return hibernateID; - } - - /** - * Get the hibernate ID to a given pageID of a category. - * We need different methods for pages and categories here, as a page and a category can have the same ID. - * - * @param pageID A pageID that should be mapped to the corresponding hibernate ID. 
- * @return The hibernateID of the page with pageID or -1, if the pageID is not valid - */ - protected long __getCategoryHibernateId(int pageID) { - long hibernateID = -1; - - // first look in the id mapping cache - if (idMapCategories.containsKey(pageID)) { - return idMapCategories.get(pageID); - } - - // The id was not found in the id mapping cache. - // It may not be in the cahe or may not exist at all. - Session session = this.__getHibernateSession(); - session.beginTransaction(); - String sql = "select cat.id from Category as cat where cat.pageId = :pageId"; - Long retObjectPage = session.createQuery(sql, Long.class) - .setParameter("pageId", pageID, StandardBasicTypes.INTEGER) - .uniqueResult(); - session.getTransaction().commit(); - if (retObjectPage != null) { - hibernateID = retObjectPage; - // add it to the cache - idMapCategories.put(pageID, hibernateID); - } - - return hibernateID; - } - - /** - * @return A {@link MetaData} object containing all meta data about this instance of Wikipedia. - */ - public MetaData getMetaData() { - return this.metaData; - } - - /** - * @return The {@link DatabaseConfiguration} object that was used to create the Wikipedia object. - */ - public DatabaseConfiguration getDatabaseConfiguration() { - return this.dbConfig; - } - - /** - * @return Shortcut for getting a hibernate session. - */ - protected Session __getHibernateSession() { - return WikiHibernateUtil.getSessionFactory(this.dbConfig).getCurrentSession(); - } - - /** - * The ID consists of the host, the database, and the language. - * This should be unique in most cases. - * @return Returns a unique ID for this Wikipedia object. - */ - public String getWikipediaId() { - StringBuilder sb = new StringBuilder(); - sb.append(this.getDatabaseConfiguration().getHost()); - sb.append("_"); - sb.append(this.getDatabaseConfiguration().getDatabase()); - sb.append("_"); - sb.append(this.getDatabaseConfiguration().getLanguage()); - return sb.toString(); - } + else { + logger.debug( + "Wikipedia database backend does NOT support character collation features."); + } + } + + WikiConfig getWikConfig() + { + return wikiConfig; + } + + /** + * Gets the page with the given title. If the title is a redirect, the corresponding page is + * returned.<br> + * If the title start with a lowercase letter it converts it to an uppercase letter, as each + * Wikipedia article title starts with an uppercase letter. Spaces in the title are converted to + * underscores, as this is a convention for Wikipedia article titles. + * <p> + * For example, the article "Steam boat" could be queried with - "Steam boat" - "steam boat" - + * "Steam_boat" - "steam_boat" and additionally all redirects that might point to that article. + * + * @param title + * The title of the page. + * @return The page object for a given title. + * @throws WikiApiException + * If no page or redirect with this title exists or the title could not be properly + * parsed. + */ + public Page getPage(String title) throws WikiApiException + { + return new Page(this, title, false); + } + + /** + * Gets the page with exactly the given title.<br> + * + * Note that when using this method you are responsible for converting a normal search string + * into the right wiki-style.<br> + * + * If the title is a redirect, the corresponding page is returned.<br> + * + * @param exactTitle + * The exact title of the page. + * @return The page object for a given title. 
+ * @throws WikiApiException + * If no page or redirect with this title exists or the title could not be properly + * parsed. + */ + public Page getPageByExactTitle(String exactTitle) throws WikiApiException + { + return new Page(this, exactTitle, true); + } + + /** + * Get all pages which match all lowercase/uppercase version of the given title.<br> + * If the title is a redirect, the corresponding page is returned.<br> + * Spaces in the title are converted to underscores, as this is a convention for Wikipedia + * article titles. + * + * @param title + * The title of the page. + * @return A set of page objects matching this title. + * @throws WikiApiException + * If no page or redirect with this title exists or the title could not be properly + * parsed. + */ + public Set<Page> getPages(String title) throws WikiApiException + { + Set<Integer> ids = new HashSet<>(getPageIdsCaseInsensitive(title)); + + Set<Page> pages = new HashSet<>(); + for (Integer id : ids) { + pages.add(new Page(this, id)); + } + return pages; + } + + /** + * Gets the page for a given pageId. + * + * @param pageId + * The id of the page. + * @return The page object for a given pageId. + * @throws WikiApiException + * Thrown if errors occurred. + */ + public Page getPage(int pageId) throws WikiApiException + { + return new Page(this, pageId); + } + + /** + * Gets the title for a given pageId. + * + * @param pageId + * The id of the page. + * @return The title for the given pageId. + * @throws WikiApiException + * Thrown if errors occurred. + */ + public Title getTitle(int pageId) throws WikiApiException + { + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select p.name from PageMapLine as p where p.pageId= :pId"; + String returnValue = session.createNativeQuery(sql, String.class) + .setParameter("pId", pageId, StandardBasicTypes.INTEGER).uniqueResult(); + session.getTransaction().commit(); + + if (returnValue == null) { + throw new WikiPageNotFoundException(); + } + return new Title(returnValue); + } + + /** + * Gets the page ids for a given title. + * + * @param title + * The title of the page. + * @return The id for the page with the given title. + * @throws WikiApiException + * Thrown if errors occurred. + */ + public List<Integer> getPageIds(String title) throws WikiApiException + { + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select p.pageID from PageMapLine as p where p.name = :pName"; + Iterator<Integer> results = session.createQuery(sql, Integer.class) + .setParameter("pName", title, StandardBasicTypes.STRING).list().iterator(); + + session.getTransaction().commit(); + + if (!results.hasNext()) { + throw new WikiPageNotFoundException(); + } + List<Integer> resultList = new LinkedList<>(); + while (results.hasNext()) { + resultList.add(results.next()); + } + return resultList; + } + + /** + * Gets the page ids for a given title with case insensitive matching.<br> + * + * @param title + * The title of the page. + * @return The ids of the pages with the given title. + * @throws WikiApiException + * Thrown if errors occurred. 
+ */ + public List<Integer> getPageIdsCaseInsensitive(String title) throws WikiApiException + { + title = title.toLowerCase(); + title = title.replaceAll(" ", "_"); + + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select p.pageID from PageMapLine as p where lower(p.name) = :pName"; + Iterator<Integer> results = session.createQuery(sql, Integer.class) + .setParameter("pName", title, StandardBasicTypes.STRING).list().iterator(); + + session.getTransaction().commit(); + + if (!results.hasNext()) { + throw new WikiPageNotFoundException(); + } + List<Integer> resultList = new LinkedList<>(); + while (results.hasNext()) { + resultList.add(results.next()); + } + return resultList; + } + + /** + * Returns the article page for a given discussion page. + * + * @param discussionPage + * the discussion page object + * @return The page object of the article associated with the discussion. If the parameter + * already was an article, it is returned directly. + * @throws WikiApiException + * Thrown if errors occurred. + */ + public Page getArticleForDiscussionPage(Page discussionPage) throws WikiApiException + { + if (discussionPage.isDiscussion()) { + String title = discussionPage.getTitle().getPlainTitle() + .replaceAll(WikiConstants.DISCUSSION_PREFIX, ""); + + if (title.contains("/")) { + // If we have a discussion archive + // TODO This does not support articles that contain slashes- + // However, the rest of the API cannot cope with that as well, so this should not be + // any extra trouble + title = title.split("/")[0]; + } + return getPage(title); + } + else { + return discussionPage; + } + + } + + /** + * Gets the discussion page for an article page with the given pageId. + * + * @param articlePageId + * The id of the page. + * @return The page object for a given pageId. + * @throws WikiApiException + * Thrown if errors occurred. + */ + public Page getDiscussionPage(int articlePageId) throws WikiApiException + { + // Retrieve discussion page with article title + // TODO not the prettiest solution, but currently discussions are only marked in the title + return getDiscussionPage(getPage(articlePageId)); + } + + /** + * Gets the discussion page for the page with the given title. The page retrieval works as + * defined in {@link #getPage(String title)} + * + * @param title + * The title of the page for which the discussions should be retrieved. + * @return The page object for the discussion page. + * @throws WikiApiException + * If no page or redirect with this title exists or title could not be properly + * parsed. + */ + public Page getDiscussionPage(String title) throws WikiApiException + { + return getDiscussionPage(getPage(title)); + } + + /** + * Gets the discussion page for the given article page The provided page must not be a + * discussion page + * + * @param articlePage + * the article page for which a discussion page should be retrieved + * @return The discussion page object for the given article page object + * @throws WikiApiException + * If no page or redirect with this title exists or title could not be properly + * parsed. 
+ */ + public Page getDiscussionPage(Page articlePage) throws WikiApiException + { + String articleTitle = articlePage.getTitle().toString(); + if (articleTitle.startsWith(WikiConstants.DISCUSSION_PREFIX)) { + return articlePage; + } + else { + return new Page(this, WikiConstants.DISCUSSION_PREFIX + articleTitle); + } + } + + /** + * Returns an iterable containing all archived discussion pages for the page with the given + * title String. <br> + * The page retrieval works as defined in {@link #getPage(int)}. <br> + * The most recent discussion page is NOT included here! It can be obtained with + * {@link #getDiscussionPage(Page)}. + * + * @param articlePageId + * The id of the page for which to the the discussion archives + * @return The page object for the discussion page. + * @throws WikiApiException + * If no page or redirect with this title exists or title could not be properly + * parsed. + */ + public Iterable<Page> getDiscussionArchives(int articlePageId) throws WikiApiException + { + // Retrieve discussion archive pages with page id + return getDiscussionArchives(getPage(articlePageId)); + } + + /** + * Returns an iterable containing all archived discussion pages for the page with the given + * title String. <br> + * The page retrieval works as defined in {@link #getPage(String title)}.<br> + * The most recent discussion page is NOT included here! It can be obtained with + * {@link #getDiscussionPage(Page)}. + * + * @param title + * The title of the page for which the discussions should be retrieved. + * @return The page object for the discussion page. + * @throws WikiApiException + * If no page or redirect with this title exists or title could not be properly + * parsed. + * @deprecated Use {@link #getDiscussionArchives(int)} or {@link #getDiscussionArchives(Page)} + * instead. + */ + @Deprecated(since = "2.0.0", forRemoval = true) + public Iterable<Page> getDiscussionArchives(String title) throws WikiApiException + { + // Retrieve discussion archive pages with page title + return getDiscussionArchives(getPage(title)); + } + + /** + * Return an iterable containing all archived discussion pages for the given article page. The + * most recent discussion page is not included. The most recent discussion page can be obtained + * with {@link #getDiscussionPage(Page)}. <br> + * The provided page Object must not be a discussion page itself! If it is a discussion page, is + * returned unchanged. + * + * @param articlePage + * the article page for which a discussion archives should be retrieved + * @return An iterable with the discussion archive page objects for the given article page + * object + * @throws WikiApiException + * If no page or redirect with this title exists or title could not be properly + * parsed. 
+ */ + public Iterable<Page> getDiscussionArchives(Page articlePage) throws WikiApiException + { + String articleTitle = articlePage.getTitle().getWikiStyleTitle(); + if (!articleTitle.startsWith(WikiConstants.DISCUSSION_PREFIX)) { + articleTitle = WikiConstants.DISCUSSION_PREFIX + articleTitle; + } + + Session session = this.__getHibernateSession(); + session.beginTransaction(); + + List<Page> discussionArchives = new LinkedList<>(); + + String sql = "SELECT pageID FROM PageMapLine where name like :name"; + Iterator<Integer> results = session.createQuery(sql, Integer.class) + .setParameter("name", articleTitle + "/%", StandardBasicTypes.STRING).list() + .iterator(); + + session.getTransaction().commit(); + + while (results.hasNext()) { + int pageID = results.next(); + discussionArchives.add(getPage(pageID)); + } + return discussionArchives; + } + + /** + * Gets the pages or redirects with a name similar to the pattern. Calling this method is quite + * costly, as similarity is computed for all names. + * + * @param pPattern + * The pattern. + * @param pSize + * The maximum size of the result list. Only the most similar results will be + * included. + * @return A map of pages with names similar to the pattern and their distance values. Smaller + * distances are more similar. + * @throws WikiApiException + * Thrown if errors occurred. + */ + //// I do not want to make this public at the moment (TZ, March, 2007) + protected Map<Page, Double> getSimilarPages(String pPattern, int pSize) throws WikiApiException + { + Title title = new Title(pPattern); + String pattern = title.getWikiStyleTitle(); + + // a mapping of the most similar pages and their similarity values + // It is returned by this method. + Map<Page, Double> pageMap = new HashMap<>(); + + // holds a mapping of the best distance values to page IDs + Map<Integer, Double> distanceMap = new HashMap<>(); + + Session session = this.__getHibernateSession(); + session.beginTransaction(); + for (Object o : session.createQuery("select pml.pageID, pml.name from PageMapLine as pml") + .list()) { + Object[] row = (Object[]) o; + int pageID = (Integer) row[0]; + String pageName = (String) row[1]; + + // this returns a similarity - if we want to use it, we have to change the semantics the + // ordering of the results + // double distance = new Levenshtein().getSimilarity(pageName, pPattern); + double distance = new LevenshteinStringDistance().distance(pageName, pattern); + + distanceMap.put(pageID, distance); + + // if there are more than "pSize" entries in the map remove the last one (it has the + // biggest distance) + if (distanceMap.size() > pSize) { + Set<Entry<Integer, Double>> valueSortedSet = new TreeSet<>(new ValueComparator()); + valueSortedSet.addAll(distanceMap.entrySet()); + Iterator<Entry<Integer, Double>> it = valueSortedSet.iterator(); + // remove the first element + if (it.hasNext()) { + // get the id of this entry and remove it in the distanceMap + distanceMap.remove(it.next().getKey()); + } + } + } + session.getTransaction().commit(); + + for (int pageID : distanceMap.keySet()) { + Page page = null; + try { + page = this.getPage(pageID); + } + catch (WikiPageNotFoundException e) { + logger.error("Page with pageID " + pageID + + " could not be found. Fatal error. Terminating."); + e.printStackTrace(); + System.exit(1); + } + pageMap.put(page, distanceMap.get(pageID)); + } + + return pageMap; + } + + /** + * Gets the category for a given title. 
If the {@link Category} title start with a lowercase + * letter it converts it to an uppercase letter, as each Wikipedia category title starts with an + * uppercase letter. Spaces in the title are converted to underscores, as this is a convention + * for Wikipedia category titles. + * <p> + * For example, the (possible) category "Famous steamboats" could be queried with - "Famous + * steamboats" - "Famous_steamboats" - "famous steamboats" - "famous_steamboats" + * + * @param title + * The title of the category. + * @return The category object with the given title. + * @throws WikiApiException + * If no category with the given title exists. + */ + public Category getCategory(String title) throws WikiApiException + { + return new Category(this, title); + } + + /** + * Gets the category for a given pageId. + * + * @param pageId + * The id of the {@link Category}. + * @return The category object or null if no category with this pageId exists. + */ + public Category getCategory(int pageId) + { + long hibernateId = __getCategoryHibernateId(pageId); + if (hibernateId == -1) { + return null; + } + + try { + return new Category(this, hibernateId); + } + catch (WikiPageNotFoundException e) { + return null; + } + } + + /** + * This returns an iterable over all {@link Category categories}, as returning all category + * objects would be much too expensive. + * + * @return An iterable over all categories. + */ + public Iterable<Category> getCategories() + { + return new CategoryIterable(this); + } + + /** + * Gets the {@link Category categories} for a given {@link Page} identified by its + * {@code pageTitle}. + * + * @param pageTitle + * The title of a {@link Page}, not a category. + * @return The category objects which are associated with the given {@code pageTitle}. + * @throws WikiPageNotFoundException + * Thrown if no {@link Page} exists for the given {@code pageTitle}. + */ + public Set<Category> getCategories(String pageTitle) throws WikiPageNotFoundException + { + if (pageTitle == null || pageTitle.length() == 0) { + throw new WikiPageNotFoundException(); + } + + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select c from Page p left join p.categories c where p.name = :pageTitle"; + List<Integer> categoryHibernateIds = session.createQuery(sql, Integer.class) + .setParameter("pageTitle", pageTitle).list(); + session.getTransaction().commit(); + + Set<Category> categorySet = new HashSet<>(categoryHibernateIds.size()); + for (int hibernateId : categoryHibernateIds) { + try { + categorySet.add(new Category(this, hibernateId)); + } + catch (WikiPageNotFoundException e) { + logger.warn("Could not load Category by it's HibernateId = '" + hibernateId + "'"); + } + } + return categorySet; + } + + /** + * Get all wikipedia {@link Category categories}. Returns only an iterable, as a collection may + * not fit into memory for a large wikipedia. + * + * @param bufferSize + * The size of the internal page buffer. + * @return An iterable over all categories. + */ + protected Iterable<Category> getCategories(int bufferSize) + { + return new CategoryIterable(this, bufferSize); + } + + /** + * Protected method that is much faster than the public version, but exposes too much + * implementation details. Get a set with all category pageIDs. Returning all category objects + * is much too expensive. 
+ * + * @return A set with all category pageIDs + */ + // TODO this should be replaced with the buffered category iterator, as it might produce an + // HeapSpace Overflow, if there are too many categories. + protected Set<Integer> __getCategories() + { + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select cat.pageId from Category as cat"; + List<Integer> idList = session.createQuery(sql, Integer.class).list(); + session.getTransaction().commit(); + + return new HashSet<>(idList); + } + + /** + * Get all wikipedia pages. Does not include redirects, as they are only pointers to real pages. + * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. + * + * @return An iterable over all pages. + */ + public Iterable<Page> getPages() + { + return new PageIterable(this, false); + } + + /** + * Get all wikipedia pages. Does not include redirects, as they are only pointers to real pages. + * Returns only an iterable, as a collection may not fit into memory for a large wikipedia. + * + * @param bufferSize + * The size of the internal page buffer. + * @return An iterable over all pages. + */ + protected Iterable<Page> getPages(int bufferSize) + { + return new PageIterable(this, false, bufferSize); + } + + /** + * Protected method that is much faster than the public version, but exposes too much + * implementation details. Get a set with all {@code pageIDs}. Returning all page objects is + * much too expensive. Does not include redirects, as they are only pointers to real pages. + * <p> + * As ids can be useful for several application (e.g. in combination with the RevisionMachine, + * they have been made publicly available via {@link #getPageIds()}. + * + * @return A set with all {@code pageIDs}. Returning all pages is much to expensive. + */ + protected Set<Integer> __getPages() + { + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select page.pageId from Page as page"; + List<Integer> idList = session.createQuery(sql, Integer.class).list(); + session.getTransaction().commit(); + + return new HashSet<>(idList); + } + + /** + * @return an iterable over all {@code pageIDs} (without redirects) + */ + public Iterable<Integer> getPageIds() + { + return this.__getPages(); + } + + /** + * Get the pages that match the given query. Does not include redirects, as they are only + * pointers to real pages. Attention: may be running very slow, depending on the size of the + * Wikipedia! + * + * @param query + * A query object containing the query conditions. + * @return A set of pages that match the given query. + * @throws WikiApiException + * Thrown if errors occurred. + */ + public Iterable<Page> getPages(PageQuery query) throws WikiApiException + { + return new PageQueryIterable(this, query); + } + + /** + * Get all articles (pages MINUS disambiguationPages MINUS redirects). Returns only an iterable, + * as a collection may not fit into memory for a large wikipedia. + * + * @return An iterable of all article pages. + */ + public Iterable<Page> getArticles() + { + return new PageIterable(this, true); + } + + /** + * Get all titles including disambiguation pages and redirects). Returns only an iterable, as a + * collection may not fit into memory for a large wikipedia. + * + * @return An iterable of all article pages. + */ + public Iterable<Title> getTitles() + { + return new TitleIterable(this); + } + + /** + * @return The {@link Language} of this Wikipedia. 
+ */ + public Language getLanguage() + { + return this.language; + } + + /** + * Tests, whether a page or redirect with the given title exists. Trying to retrieve a page that + * does not exist in Wikipedia throws an exception. You may catch the exception or use this + * test, depending on your task. + * + * @param title + * The title of the page. + * @return {@code True}, if a page or redirect with that title exits, {@code false} otherwise. + */ + public boolean existsPage(String title) + { + + if (title == null || title.isEmpty()) { + return false; + } + Title t; + try { + t = new Title(title); + } + catch (WikiTitleParsingException e) { + return false; + } + String encodedTitle = t.getWikiStyleTitle(); + + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String query = "select p.id from PageMapLine as p where p.name = :pName"; + if (dbConfig.supportsCollation()) { + query += SQL_COLLATION; + } + Object returnValue = session.createNativeQuery(query) + .setParameter("pName", encodedTitle, StandardBasicTypes.STRING).uniqueResult(); + session.getTransaction().commit(); + + return returnValue != null; + } + + /** + * Tests, whether a page with the given pageID exists. Trying to retrieve a pageID that does not + * exist in Wikipedia throws an exception. + * + * @param pageID + * A pageID. + * @return {@code True}, if a page with that pageID exits, {@code false} otherwise. + */ + public boolean existsPage(int pageID) + { + + // This is a hack to provide a much quicker way to test whether a page exists. + // Encoding the title in this way surpasses the normal way of creating a title first. + // Anyway, I do not like this hack :-| + if (pageID < 0) { + return false; + } + + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select p.id from PageMapLine as p where p.pageID = :pageId"; + Long returnValue = session.createNativeQuery(sql, Long.class) + .setParameter("pageId", pageID, StandardBasicTypes.INTEGER).uniqueResult(); + session.getTransaction().commit(); + + return returnValue != null; + } + + /** + * Get the hibernate ID to a given pageID of a page. We need different methods for pages and + * categories here, as a page and a category can have the same ID. + * + * @param pageID + * A pageID that should be mapped to the corresponding hibernate ID. + * @return The hibernateID of the page with pageID or -1, if the pageID is not valid + */ + protected long __getPageHibernateId(int pageID) + { + long hibernateID = -1; + + // first look in the id mapping cache + if (idMapPages.containsKey(pageID)) { + return idMapPages.get(pageID); + } + + // The id was not found in the id mapping cache. + // It may not be in the cahe or may not exist at all. + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select page.id from Page as page where page.pageId = :pageId"; + Long retObjectPage = session.createQuery(sql, Long.class) + .setParameter("pageId", pageID, StandardBasicTypes.INTEGER).uniqueResult(); + session.getTransaction().commit(); + if (retObjectPage != null) { + hibernateID = retObjectPage; + // add it to the cache + idMapPages.put(pageID, hibernateID); + return hibernateID; + } + + return hibernateID; + } + + /** + * Get the hibernate ID to a given pageID of a category. We need different methods for pages and + * categories here, as a page and a category can have the same ID. + * + * @param pageID + * A pageID that should be mapped to the corresponding hibernate ID. 
+ * @return The hibernateID of the page with pageID or -1, if the pageID is not valid + */ + protected long __getCategoryHibernateId(int pageID) + { + long hibernateID = -1; + + // first look in the id mapping cache + if (idMapCategories.containsKey(pageID)) { + return idMapCategories.get(pageID); + } + + // The id was not found in the id mapping cache. + // It may not be in the cahe or may not exist at all. + Session session = this.__getHibernateSession(); + session.beginTransaction(); + String sql = "select cat.id from Category as cat where cat.pageId = :pageId"; + Long retObjectPage = session.createQuery(sql, Long.class) + .setParameter("pageId", pageID, StandardBasicTypes.INTEGER).uniqueResult(); + session.getTransaction().commit(); + if (retObjectPage != null) { + hibernateID = retObjectPage; + // add it to the cache + idMapCategories.put(pageID, hibernateID); + } + + return hibernateID; + } + + /** + * @return A {@link MetaData} object containing all meta data about this instance of Wikipedia. + */ + public MetaData getMetaData() + { + return this.metaData; + } + + /** + * @return The {@link DatabaseConfiguration} object that was used to create the Wikipedia + * object. + */ + public DatabaseConfiguration getDatabaseConfiguration() + { + return this.dbConfig; + } + + /** + * @return Shortcut for getting a hibernate session. + */ + protected Session __getHibernateSession() + { + return WikiHibernateUtil.getSessionFactory(this.dbConfig).getCurrentSession(); + } + + /** + * The ID consists of the host, the database, and the language. This should be unique in most + * cases. + * + * @return Returns a unique ID for this Wikipedia object. + */ + public String getWikipediaId() + { + StringBuilder sb = new StringBuilder(); + sb.append(this.getDatabaseConfiguration().getHost()); + sb.append("_"); + sb.append(this.getDatabaseConfiguration().getDatabase()); + sb.append("_"); + sb.append(this.getDatabaseConfiguration().getLanguage()); + return sb.toString(); + } } -class ValueComparator implements Comparator<Entry<Integer,Double>> { +class ValueComparator + implements Comparator<Entry<Integer, Double>> +{ - @Override - public int compare(Entry<Integer, Double> e1, Entry<Integer, Double> e2) { - return Double.compare(e2.getValue(), e1.getValue()); - } + @Override + public int compare(Entry<Integer, Double> e1, Entry<Integer, Double> e2) + { + return Double.compare(e2.getValue(), e1.getValue()); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java index 27b5dc13..407eec43 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/WikipediaInfo.java @@ -34,389 +34,443 @@ import org.slf4j.LoggerFactory; /** - * Holds numerous information on a given subset (that may also be - * the whole Wikipedia) of Wikipedia nodes. + * Holds numerous information on a given subset (that may also be the whole Wikipedia) of Wikipedia + * nodes. 
*/ -public class WikipediaInfo { +public class WikipediaInfo +{ - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); - private Iterable<Page> pages; - private double averageFanOut; + private Iterable<Page> pages; + private double averageFanOut; - private int numberOfPages; + private int numberOfPages; - private Map<Integer, Integer> degreeDistribution; - private Set<Integer> categorizedArticleSet; + private Map<Integer, Integer> degreeDistribution; + private Set<Integer> categorizedArticleSet; - private Wikipedia wiki; + private Wikipedia wiki; - /** - * Get infos for the whole wikipedia. - * - * @param pWiki The wiki object. - */ - public WikipediaInfo(Wikipedia pWiki) throws WikiApiException { - this.wiki = pWiki; - new WikipediaInfo(this.wiki.getPages()); + /** + * Get infos for the whole wikipedia. + * + * @param pWiki + * The wiki object. + */ + public WikipediaInfo(Wikipedia pWiki) throws WikiApiException + { + this.wiki = pWiki; + new WikipediaInfo(this.wiki.getPages()); - } - - - /** - * Get infos only for a subset of articles. - * - * @param pPages A set of pages. Only this subset of wiki pages is used in the info object. - */ - public WikipediaInfo(Iterable<Page> pPages) throws WikiApiException { - if (pPages == null) { - throw new WikiApiException("The page set has to be initialized."); } - pages = pPages; - averageFanOut = -1.0; // lazy initialization => it is computed and stored when it is accessed + /** + * Get infos only for a subset of articles. + * + * @param pPages + * A set of pages. Only this subset of wiki pages is used in the info object. + */ + public WikipediaInfo(Iterable<Page> pPages) throws WikiApiException + { + if (pPages == null) { + throw new WikiApiException("The page set has to be initialized."); + } - degreeDistribution = new HashMap<>(); - categorizedArticleSet = new HashSet<>(); + pages = pPages; + averageFanOut = -1.0; // lazy initialization => it is computed and stored when it is + // accessed - // get number of pages - numberOfPages = 0; - while (pages.iterator().hasNext()) { - numberOfPages++; - pages.iterator().next(); - } + degreeDistribution = new HashMap<>(); + categorizedArticleSet = new HashSet<>(); - } + // get number of pages + numberOfPages = 0; + while (pages.iterator().hasNext()) { + numberOfPages++; + pages.iterator().next(); + } + } - /** - * Computes the average fan out of the page set. - * Fan out is the number of outgoing links per page. - * - * @param pages The pages in an iterable form. - * @return The average fan out. - */ - private double computeAverageFanOut(Iterable<Page> pages) { + /** + * Computes the average fan out of the page set. Fan out is the number of outgoing links per + * page. + * + * @param pages + * The pages in an iterable form. + * @return The average fan out. 
+ */ + private double computeAverageFanOut(Iterable<Page> pages) + { + + Set<Integer> pageIDs = new HashSet<>(); + while (pages.iterator().hasNext()) { + pageIDs.add(pages.iterator().next().getPageId()); + } - Set<Integer> pageIDs = new HashSet<>(); - while (pages.iterator().hasNext()) { - pageIDs.add(pages.iterator().next().getPageId()); - } + if (pageIDs.isEmpty()) { + logger.warn("Cannot compute average fan-out of an empty page set."); + return 0.0; + } - if (pageIDs.isEmpty()) { - logger.warn("Cannot compute average fan-out of an empty page set."); - return 0.0; - } + int fanOutCounter = 0; - int fanOutCounter = 0; + Session session = this.wiki.__getHibernateSession(); + session.beginTransaction(); + for (Object o : session.createQuery("select page.outLinks, page.pageId from Page as page") + .list()) { + Object[] row = (Object[]) o; + Set outLinks = (Set) row[0]; + Integer pageId = (Integer) row[1]; - Session session = this.wiki.__getHibernateSession(); - session.beginTransaction(); - for (Object o : session.createQuery("select page.outLinks, page.pageId from Page as page").list()) { - Object[] row = (Object[]) o; - Set outLinks = (Set) row[0]; - Integer pageId = (Integer) row[1]; + // if the current page ID is in the desired result set => add outlink value + if (pageIDs.contains(pageId)) { + fanOutCounter += outLinks.size(); + } + } + session.getTransaction().commit(); - // if the current page ID is in the desired result set => add outlink value - if (pageIDs.contains(pageId)) { - fanOutCounter += outLinks.size(); - } + return (double) fanOutCounter / this.getNumberOfPages(); } - session.getTransaction().commit(); - return (double) fanOutCounter / this.getNumberOfPages(); - } + /** + * @return Returns the averageFanOut. + */ + public double getAverageFanOut() + { + if (averageFanOut < 0) { // not yet initialized + averageFanOut = computeAverageFanOut(this.pages); + } - /** - * @return Returns the averageFanOut. - */ - public double getAverageFanOut() { - if (averageFanOut < 0) { // not yet initialized - averageFanOut = computeAverageFanOut(this.pages); + return averageFanOut; } - return averageFanOut; - } - - /** - * @return Returns the numberOfPages. - */ - public int getNumberOfPages() { - return numberOfPages; - } - - /** - * Building a mapping from categories to article sets. - * - * @param pWiki The wikipedia object. - * @param pNodes The category nodes that should be used to build the map. - * @return A mapping from categories to article sets. - * @throws WikiPageNotFoundException - */ - private Map<Integer, Set<Integer>> getCategoryArticleMap(Wikipedia pWiki, Set<Integer> pNodes) throws WikiPageNotFoundException { - Map<Integer, Set<Integer>> categoryArticleMap = new HashMap<>(); - - int progress = 0; - for (int node : pNodes) { - progress++; - ApiUtilities.printProgressInfo(progress, pNodes.size(), 10, ApiUtilities.ProgressInfoMode.TEXT, "Getting category-article map."); - - Category cat = pWiki.getCategory(node); - if (cat != null) { - Set<Integer> pages = new HashSet<>(cat.__getPages()); - categoryArticleMap.put(node, pages); - } else { - logger.info("{} is not a category.", node); - } + /** + * @return Returns the numberOfPages. + */ + public int getNumberOfPages() + { + return numberOfPages; } - return categoryArticleMap; - } - - /** - * Get various graph parameters like diameter, average out-degree etc. of the category graph. - * - * @param catGraph The category graph. 
- */ - public void getGraphParameters(CategoryGraph catGraph) { - double startTime = System.currentTimeMillis(); - logger.error(catGraph.getGraphInfo()); - double endTime = (System.currentTimeMillis() - startTime) / 1000.0; - logger.error(endTime + "s"); - } - - /** - * Articles in wikipedia may be tagged with multiple categories. - * It may be interesting to know how many articles have at least one category in common. - * Such articles would have a very high semantic relatedness even if they share a quite secondary category. - * - * @param pWiki The wikipedia object. - * @param catGraph The category graph. - * @throws WikiApiException - */ - public void getOverlapping(Wikipedia pWiki, CategoryGraph catGraph) throws WikiApiException { - double startTime = System.currentTimeMillis(); - - int articlesWithOverlappingCategories = getArticlesWithOverlappingCategories(pWiki, catGraph); - double overlappingCategoriesRatio = (double) articlesWithOverlappingCategories / (double) pWiki.getMetaData().getNumberOfPages(); - logger.info(articlesWithOverlappingCategories + " - " + pWiki.getMetaData().getNumberOfPages() + " - " + overlappingCategoriesRatio); - - double endTime = (System.currentTimeMillis() - startTime) / 1000.0; - logger.debug("{} ms", endTime); - } - - - /** - * Articles in wikipedia may be tagged with multiple categories. - * It may be interesting to know how many articles have at least one category in common. - * Such articles would have a very high semantic relatedness even if they share a quite secondary category. - * - * @param pWiki The wikipedia object. - * @param pGraph The category graph. - * @return The number of articles that have at least one category in common. - * @throws WikiPageNotFoundException - */ - private int getArticlesWithOverlappingCategories(Wikipedia pWiki, CategoryGraph pGraph) throws WikiPageNotFoundException { - Set<Integer> overlappingArticles = new HashSet<>(); - - // iterate over all node pairs - Set<Integer> nodes = pGraph.getGraph().vertexSet(); - - Map<Integer, Set<Integer>> categoryArticleMap = getCategoryArticleMap(pWiki, nodes); - - // sort the Array so we can use a simple iteration with two for loops to access all pairs - Object[] nodeArray = nodes.toArray(); - Arrays.sort(nodeArray); - - int progress = 0; - for (int i = 0; i < nodes.size(); i++) { - progress++; - ApiUtilities.printProgressInfo(progress, nodes.size(), 100, ApiUtilities.ProgressInfoMode.TEXT, ""); - - int outerNode = (Integer) nodeArray[i]; - - for (int j = i + 1; j < nodes.size(); j++) { - int innerNode = (Integer) nodeArray[j]; - - // test whether the categories have pages in common - Set<Integer> outerPages = categoryArticleMap.get(outerNode); - Set<Integer> innerPages = categoryArticleMap.get(innerNode); - - for (int outerPage : outerPages) { - if (innerPages.contains(outerPage)) { - if (!overlappingArticles.contains(outerPage)) { - overlappingArticles.add(outerPage); + /** + * Building a mapping from categories to article sets. + * + * @param pWiki + * The wikipedia object. + * @param pNodes + * The category nodes that should be used to build the map. + * @return A mapping from categories to article sets. 
+ * @throws WikiPageNotFoundException + */ + private Map<Integer, Set<Integer>> getCategoryArticleMap(Wikipedia pWiki, Set<Integer> pNodes) + throws WikiPageNotFoundException + { + Map<Integer, Set<Integer>> categoryArticleMap = new HashMap<>(); + + int progress = 0; + for (int node : pNodes) { + progress++; + ApiUtilities.printProgressInfo(progress, pNodes.size(), 10, + ApiUtilities.ProgressInfoMode.TEXT, "Getting category-article map."); + + Category cat = pWiki.getCategory(node); + if (cat != null) { + Set<Integer> pages = new HashSet<>(cat.__getPages()); + categoryArticleMap.put(node, pages); + } + else { + logger.info("{} is not a category.", node); } - } } - } + return categoryArticleMap; + } + + /** + * Get various graph parameters like diameter, average out-degree etc. of the category graph. + * + * @param catGraph + * The category graph. + */ + public void getGraphParameters(CategoryGraph catGraph) + { + double startTime = System.currentTimeMillis(); + logger.error(catGraph.getGraphInfo()); + double endTime = (System.currentTimeMillis() - startTime) / 1000.0; + logger.error(endTime + "s"); + } + + /** + * Articles in wikipedia may be tagged with multiple categories. It may be interesting to know + * how many articles have at least one category in common. Such articles would have a very high + * semantic relatedness even if they share a quite secondary category. + * + * @param pWiki + * The wikipedia object. + * @param catGraph + * The category graph. + * @throws WikiApiException + */ + public void getOverlapping(Wikipedia pWiki, CategoryGraph catGraph) throws WikiApiException + { + double startTime = System.currentTimeMillis(); + + int articlesWithOverlappingCategories = getArticlesWithOverlappingCategories(pWiki, + catGraph); + double overlappingCategoriesRatio = (double) articlesWithOverlappingCategories + / (double) pWiki.getMetaData().getNumberOfPages(); + logger.info(articlesWithOverlappingCategories + " - " + + pWiki.getMetaData().getNumberOfPages() + " - " + overlappingCategoriesRatio); + + double endTime = (System.currentTimeMillis() - startTime) / 1000.0; + logger.debug("{} ms", endTime); } - return overlappingArticles.size(); - } + /** + * Articles in wikipedia may be tagged with multiple categories. It may be interesting to know + * how many articles have at least one category in common. Such articles would have a very high + * semantic relatedness even if they share a quite secondary category. + * + * @param pWiki + * The wikipedia object. + * @param pGraph + * The category graph. + * @return The number of articles that have at least one category in common. 
+ * @throws WikiPageNotFoundException + */ + private int getArticlesWithOverlappingCategories(Wikipedia pWiki, CategoryGraph pGraph) + throws WikiPageNotFoundException + { + Set<Integer> overlappingArticles = new HashSet<>(); + + // iterate over all node pairs + Set<Integer> nodes = pGraph.getGraph().vertexSet(); + + Map<Integer, Set<Integer>> categoryArticleMap = getCategoryArticleMap(pWiki, nodes); + + // sort the Array so we can use a simple iteration with two for loops to access all pairs + Object[] nodeArray = nodes.toArray(); + Arrays.sort(nodeArray); + + int progress = 0; + for (int i = 0; i < nodes.size(); i++) { + progress++; + ApiUtilities.printProgressInfo(progress, nodes.size(), 100, + ApiUtilities.ProgressInfoMode.TEXT, ""); + + int outerNode = (Integer) nodeArray[i]; + + for (int j = i + 1; j < nodes.size(); j++) { + int innerNode = (Integer) nodeArray[j]; + + // test whether the categories have pages in common + Set<Integer> outerPages = categoryArticleMap.get(outerNode); + Set<Integer> innerPages = categoryArticleMap.get(innerNode); + + for (int outerPage : outerPages) { + if (innerPages.contains(outerPage)) { + if (!overlappingArticles.contains(outerPage)) { + overlappingArticles.add(outerPage); + } + } + } - public void getCategorizedArticles(Wikipedia pWiki, CategoryGraph catGraph) throws WikiApiException { - double startTime = System.currentTimeMillis(); + } + } - int numberOfCategorizedArticles = getNumberOfCategorizedArticles(pWiki, catGraph); - double categorizedArticlesRatio = (double) numberOfCategorizedArticles / (double) pWiki.getMetaData().getNumberOfPages(); + return overlappingArticles.size(); + } - logger.info("Categorized articles: {}", numberOfCategorizedArticles); - logger.info("All articles: {}", pWiki.getMetaData().getNumberOfPages()); - logger.info("Ratio: {}", categorizedArticlesRatio); + public void getCategorizedArticles(Wikipedia pWiki, CategoryGraph catGraph) + throws WikiApiException + { + double startTime = System.currentTimeMillis(); - double endTime = (System.currentTimeMillis() - startTime) / 1000.0; - logger.debug("{}ms", endTime); - } + int numberOfCategorizedArticles = getNumberOfCategorizedArticles(pWiki, catGraph); + double categorizedArticlesRatio = (double) numberOfCategorizedArticles + / (double) pWiki.getMetaData().getNumberOfPages(); - public double getAveragePathLengthFromRoot(Wikipedia pWiki, CategoryGraph connectedCatGraph) throws WikiApiException { - // get root node - Category rootCategory = pWiki.getMetaData().getMainCategory(); - int root = rootCategory.getPageId(); + logger.info("Categorized articles: {}", numberOfCategorizedArticles); + logger.info("All articles: {}", pWiki.getMetaData().getNumberOfPages()); + logger.info("Ratio: {}", categorizedArticlesRatio); - int pathLengthSum = computeShortestPathLenghts(root, connectedCatGraph); + double endTime = (System.currentTimeMillis() - startTime) / 1000.0; + logger.debug("{}ms", endTime); + } - return (double) pathLengthSum / (connectedCatGraph.getGraph().vertexSet().size() - 1); - } + public double getAveragePathLengthFromRoot(Wikipedia pWiki, CategoryGraph connectedCatGraph) + throws WikiApiException + { + // get root node + Category rootCategory = pWiki.getMetaData().getMainCategory(); + int root = rootCategory.getPageId(); + int pathLengthSum = computeShortestPathLenghts(root, connectedCatGraph); - /** - * If the return value has been already computed, it is returned, else it is computed at retrieval time. - * - * @param pWiki The wikipedia object. 
- * @param catGraph The category graph. - * @return The number of categorized articles, i.e. articles that have at least one category. - */ - public int getNumberOfCategorizedArticles(Wikipedia pWiki, CategoryGraph catGraph) throws WikiApiException { - if (categorizedArticleSet == null) { // has not been initialized yet - iterateCategoriesGetArticles(pWiki, catGraph); + return (double) pathLengthSum / (connectedCatGraph.getGraph().vertexSet().size() - 1); } - return categorizedArticleSet.size(); - } - - /** - * Computes the distribution of the number of articles per category. - * If the return value has been already computed, it is returned, else it is computed at retrieval time. - * - * @param pWiki The wikipedia object. - * @param catGraph The category graph. - * @return A map containing the distribution mapping from a degree to the number of times this degree is found in the category graph. - * @throws WikiPageNotFoundException - */ - public Map<Integer, Integer> getDistributionOfArticlesByCategory(Wikipedia pWiki, CategoryGraph catGraph) throws WikiPageNotFoundException { - if (degreeDistribution == null) { // has not been initialized yet - iterateCategoriesGetArticles(pWiki, catGraph); + + /** + * If the return value has been already computed, it is returned, else it is computed at + * retrieval time. + * + * @param pWiki + * The wikipedia object. + * @param catGraph + * The category graph. + * @return The number of categorized articles, i.e. articles that have at least one category. + */ + public int getNumberOfCategorizedArticles(Wikipedia pWiki, CategoryGraph catGraph) + throws WikiApiException + { + if (categorizedArticleSet == null) { // has not been initialized yet + iterateCategoriesGetArticles(pWiki, catGraph); + } + return categorizedArticleSet.size(); } - return degreeDistribution; - } - - /** - * Methods computing stuff that have to iterate over all categories and access category articles can plug-in here. - * Recently plugin-in: - * numberOfCategorizedArticles - * distributionOfArticlesByCategory - * - * @param pWiki The wikipedia object. - * @param catGraph The category graph. - * @throws WikiPageNotFoundException - */ - private void iterateCategoriesGetArticles(Wikipedia pWiki, CategoryGraph catGraph) throws WikiPageNotFoundException { - Map<Integer, Integer> localDegreeDistribution = new HashMap<>(); - Set<Integer> localCategorizedArticleSet = new HashSet<>(); - Set<Integer> categoryNodes = catGraph.getGraph().vertexSet(); - // iterate over all categories - int progress = 0; - for (int node : categoryNodes) { - progress++; - ApiUtilities.printProgressInfo(progress, categoryNodes.size(), 100, ApiUtilities.ProgressInfoMode.TEXT, "iterate over categories"); - - // get the category - Category cat = pWiki.getCategory(node); - if (cat != null) { - Set<Integer> pages = new HashSet<>(cat.__getPages()); - - // update degree distribution map - int numberOfArticles = pages.size(); - if (localDegreeDistribution.containsKey(numberOfArticles)) { - int count = localDegreeDistribution.get(numberOfArticles); - count++; - localDegreeDistribution.put(numberOfArticles, count); - } else { - localDegreeDistribution.put(numberOfArticles, 1); + + /** + * Computes the distribution of the number of articles per category. If the return value has + * been already computed, it is returned, else it is computed at retrieval time. + * + * @param pWiki + * The wikipedia object. + * @param catGraph + * The category graph. 
+ * @return A map containing the distribution mapping from a degree to the number of times this + * degree is found in the category graph. + * @throws WikiPageNotFoundException + */ + public Map<Integer, Integer> getDistributionOfArticlesByCategory(Wikipedia pWiki, + CategoryGraph catGraph) + throws WikiPageNotFoundException + { + if (degreeDistribution == null) { // has not been initialized yet + iterateCategoriesGetArticles(pWiki, catGraph); } + return degreeDistribution; + } - // add the page to the categorized articles set, if it is to already in it - for (int page : pages) { - if (!localCategorizedArticleSet.contains(page)) { - localCategorizedArticleSet.add(page); - } + /** + * Methods computing stuff that have to iterate over all categories and access category articles + * can plug-in here. Recently plugin-in: numberOfCategorizedArticles + * distributionOfArticlesByCategory + * + * @param pWiki + * The wikipedia object. + * @param catGraph + * The category graph. + * @throws WikiPageNotFoundException + */ + private void iterateCategoriesGetArticles(Wikipedia pWiki, CategoryGraph catGraph) + throws WikiPageNotFoundException + { + Map<Integer, Integer> localDegreeDistribution = new HashMap<>(); + Set<Integer> localCategorizedArticleSet = new HashSet<>(); + Set<Integer> categoryNodes = catGraph.getGraph().vertexSet(); + // iterate over all categories + int progress = 0; + for (int node : categoryNodes) { + progress++; + ApiUtilities.printProgressInfo(progress, categoryNodes.size(), 100, + ApiUtilities.ProgressInfoMode.TEXT, "iterate over categories"); + + // get the category + Category cat = pWiki.getCategory(node); + if (cat != null) { + Set<Integer> pages = new HashSet<>(cat.__getPages()); + + // update degree distribution map + int numberOfArticles = pages.size(); + if (localDegreeDistribution.containsKey(numberOfArticles)) { + int count = localDegreeDistribution.get(numberOfArticles); + count++; + localDegreeDistribution.put(numberOfArticles, count); + } + else { + localDegreeDistribution.put(numberOfArticles, 1); + } + + // add the page to the categorized articles set, if it is to already in it + for (int page : pages) { + if (!localCategorizedArticleSet.contains(page)) { + localCategorizedArticleSet.add(page); + } + } + } + else { + logger.info("{} is not a category.", node); + } } - } else { - logger.info("{} is not a category.", node); - } + this.degreeDistribution = localDegreeDistribution; + this.categorizedArticleSet = localCategorizedArticleSet; } - this.degreeDistribution = localDegreeDistribution; - this.categorizedArticleSet = localCategorizedArticleSet; - } - - /** - * Computes the shortest path from node to all other nodes. - * As the JGraphT BreadthFirstIterator does not provide information about - * the distance to the start node in each step, we will use our own BFS implementation. - * - * @param pStartNode The start node of the search. - * @param catGraph The category graph. - * @return An array of double values. 
- */ - private int computeShortestPathLenghts(int pStartNode, CategoryGraph catGraph) { - int shortestPathLengthSum = 0; - - // a set of nodes that have already been expanded -> algorithm should expand nodes monotonically and not go back - Set<Integer> alreadyExpanded = new HashSet<>(); - - // a queue holding the newly discovered nodes with their and their distance to the start node - List<int[]> queue = new ArrayList<>(); - - // initialize queue with start node - int[] innerList = new int[2]; - innerList[0] = pStartNode; // the node - innerList[1] = 0; // the distance to the start node - queue.add(innerList); - - // while the queue is not empty - while (!queue.isEmpty()) { - // remove first element from queue - int[] queueElement = queue.get(0); - int currentNode = queueElement[0]; - int distance = queueElement[1]; - queue.remove(0); - - // if the node was not already expanded - if (!alreadyExpanded.contains(currentNode)) { - // the node gets expanded now - alreadyExpanded.add(currentNode); - - // add the distance of this node to shortestPathLengthSum - shortestPathLengthSum += distance; - - // get the neighbors of the queue element - Set<Integer> neighbors = catGraph.getNeighbors(currentNode); - - // iterate over all neighbors - for (int neighbor : neighbors) { - // if the node was not already expanded - if (!alreadyExpanded.contains(neighbor)) { - // add the node to the queue, increase node distance by one - int[] tmpList = new int[2]; - tmpList[0] = neighbor; - tmpList[1] = (distance + 1); - queue.add(tmpList); - } + + /** + * Computes the shortest path from node to all other nodes. As the JGraphT BreadthFirstIterator + * does not provide information about the distance to the start node in each step, we will use + * our own BFS implementation. + * + * @param pStartNode + * The start node of the search. + * @param catGraph + * The category graph. + * @return An array of double values. 
+ */ + private int computeShortestPathLenghts(int pStartNode, CategoryGraph catGraph) + { + int shortestPathLengthSum = 0; + + // a set of nodes that have already been expanded -> algorithm should expand nodes + // monotonically and not go back + Set<Integer> alreadyExpanded = new HashSet<>(); + + // a queue holding the newly discovered nodes with their and their distance to the start + // node + List<int[]> queue = new ArrayList<>(); + + // initialize queue with start node + int[] innerList = new int[2]; + innerList[0] = pStartNode; // the node + innerList[1] = 0; // the distance to the start node + queue.add(innerList); + + // while the queue is not empty + while (!queue.isEmpty()) { + // remove first element from queue + int[] queueElement = queue.get(0); + int currentNode = queueElement[0]; + int distance = queueElement[1]; + queue.remove(0); + + // if the node was not already expanded + if (!alreadyExpanded.contains(currentNode)) { + // the node gets expanded now + alreadyExpanded.add(currentNode); + + // add the distance of this node to shortestPathLengthSum + shortestPathLengthSum += distance; + + // get the neighbors of the queue element + Set<Integer> neighbors = catGraph.getNeighbors(currentNode); + + // iterate over all neighbors + for (int neighbor : neighbors) { + // if the node was not already expanded + if (!alreadyExpanded.contains(neighbor)) { + // add the node to the queue, increase node distance by one + int[] tmpList = new int[2]; + tmpList[0] = neighbor; + tmpList[1] = (distance + 1); + queue.add(tmpList); + } + } + } } - } + return shortestPathLengthSum; } - return shortestPathLengthSum; - } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiApiException.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiApiException.java index 0efd9d6a..7d5f38d6 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiApiException.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiApiException.java @@ -17,25 +17,30 @@ */ package org.dkpro.jwpl.api.exception; - -public class WikiApiException extends WikiException { - - private static final long serialVersionUID = 4780158247277092677L; - - public WikiApiException() { - super(); - } - - public WikiApiException(String txt) { - super(txt); - } - - public WikiApiException(String message, Throwable cause) { - super(message, cause); - } - - public WikiApiException(Throwable cause) { - super(cause); - } +public class WikiApiException + extends WikiException +{ + + private static final long serialVersionUID = 4780158247277092677L; + + public WikiApiException() + { + super(); + } + + public WikiApiException(String txt) + { + super(txt); + } + + public WikiApiException(String message, Throwable cause) + { + super(message, cause); + } + + public WikiApiException(Throwable cause) + { + super(cause); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiException.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiException.java index 4609de3b..a242cd0a 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiException.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiException.java @@ -17,23 +17,29 @@ */ package org.dkpro.jwpl.api.exception; -public class WikiException extends Exception { +public class WikiException + extends Exception +{ - private static final long serialVersionUID = 3891003920835683241L; + private static final long serialVersionUID = 3891003920835683241L; - 
public WikiException() { - super(); - } + public WikiException() + { + super(); + } - public WikiException(String txt) { - super(txt); - } + public WikiException(String txt) + { + super(txt); + } - public WikiException(String message, Throwable cause) { - super(message, cause); - } + public WikiException(String message, Throwable cause) + { + super(message, cause); + } - public WikiException(Throwable cause) { - super(cause); - } + public WikiException(Throwable cause) + { + super(cause); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiInitializationException.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiInitializationException.java index 608061f2..7d965054 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiInitializationException.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiInitializationException.java @@ -17,28 +17,32 @@ */ package org.dkpro.jwpl.api.exception; - /** * Thrown, when the Wikipedia object could not be properly initialized. */ -public class WikiInitializationException extends WikiApiException { - - - private static final long serialVersionUID = 7240072132466204183L; - - public WikiInitializationException() { - super(); - } - - public WikiInitializationException(String txt) { - super(txt); - } - - public WikiInitializationException(String message, Throwable cause) { - super(message, cause); - } - - public WikiInitializationException(Throwable cause) { - super(cause); - } +public class WikiInitializationException + extends WikiApiException +{ + + private static final long serialVersionUID = 7240072132466204183L; + + public WikiInitializationException() + { + super(); + } + + public WikiInitializationException(String txt) + { + super(txt); + } + + public WikiInitializationException(String message, Throwable cause) + { + super(message, cause); + } + + public WikiInitializationException(Throwable cause) + { + super(cause); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiPageNotFoundException.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiPageNotFoundException.java index 026be6e0..39faaff8 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiPageNotFoundException.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiPageNotFoundException.java @@ -17,28 +17,32 @@ */ package org.dkpro.jwpl.api.exception; - /** * Thrown when a requested page or category could not be found in Wikipedia. 
*/ -public class WikiPageNotFoundException extends WikiApiException { - - - private static final long serialVersionUID = -3676016515948761351L; - - public WikiPageNotFoundException() { - super(); - } - - public WikiPageNotFoundException(String txt) { - super(txt); - } - - public WikiPageNotFoundException(String message, Throwable cause) { - super(message, cause); - } - - public WikiPageNotFoundException(Throwable cause) { - super(cause); - } +public class WikiPageNotFoundException + extends WikiApiException +{ + + private static final long serialVersionUID = -3676016515948761351L; + + public WikiPageNotFoundException() + { + super(); + } + + public WikiPageNotFoundException(String txt) + { + super(txt); + } + + public WikiPageNotFoundException(String message, Throwable cause) + { + super(message, cause); + } + + public WikiPageNotFoundException(Throwable cause) + { + super(cause); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiTitleParsingException.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiTitleParsingException.java index 3480601a..f5b4ba11 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiTitleParsingException.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/exception/WikiTitleParsingException.java @@ -17,27 +17,32 @@ */ package org.dkpro.jwpl.api.exception; - /** * Thrown when an exceptional situation occurs during parsing a page title to create a Title object. */ -public class WikiTitleParsingException extends WikiApiException { +public class WikiTitleParsingException + extends WikiApiException +{ - private static final long serialVersionUID = 7152744066557304950L; + private static final long serialVersionUID = 7152744066557304950L; - public WikiTitleParsingException() { - super(); - } + public WikiTitleParsingException() + { + super(); + } - public WikiTitleParsingException(String txt) { - super(txt); - } + public WikiTitleParsingException(String txt) + { + super(txt); + } - public WikiTitleParsingException(String message, Throwable cause) { - super(message, cause); - } + public WikiTitleParsingException(String message, Throwable cause) + { + super(message, cause); + } - public WikiTitleParsingException(Throwable cause) { - super(cause); - } + public WikiTitleParsingException(Throwable cause) + { + super(cause); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Category.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Category.java index 4448f525..733e9298 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Category.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Category.java @@ -20,67 +20,80 @@ import java.util.HashSet; import java.util.Set; -public class Category { - private long id; - private int pageId; - private String name; - private Set<Integer> inLinks = new HashSet<>(); - private Set<Integer> outLinks = new HashSet<>(); - private Set<Integer> pages = new HashSet<>(); - - /** - * A no argument constructor as required by Hibernate. 
- */ - public Category() { - } - - - public long getId() { - return id; - } - - @SuppressWarnings("unused") - private void setId(long id) { - this.id = id; - } - - public int getPageId() { - return pageId; - } - - public void setPageId(int pageId) { - this.pageId = pageId; - } - - public Set<Integer> getInLinks() { - return inLinks; - } - - public void setInLinks(Set<Integer> inLinks) { - this.inLinks = inLinks; - } - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public Set<Integer> getOutLinks() { - return outLinks; - } - - public void setOutLinks(Set<Integer> outLinks) { - this.outLinks = outLinks; - } - - public Set<Integer> getPages() { - return pages; - } - - public void setPages(Set<Integer> pages) { - this.pages = pages; - } +public class Category +{ + private long id; + private int pageId; + private String name; + private Set<Integer> inLinks = new HashSet<>(); + private Set<Integer> outLinks = new HashSet<>(); + private Set<Integer> pages = new HashSet<>(); + + /** + * A no argument constructor as required by Hibernate. + */ + public Category() + { + } + + public long getId() + { + return id; + } + + @SuppressWarnings("unused") + private void setId(long id) + { + this.id = id; + } + + public int getPageId() + { + return pageId; + } + + public void setPageId(int pageId) + { + this.pageId = pageId; + } + + public Set<Integer> getInLinks() + { + return inLinks; + } + + public void setInLinks(Set<Integer> inLinks) + { + this.inLinks = inLinks; + } + + public String getName() + { + return name; + } + + public void setName(String name) + { + this.name = name; + } + + public Set<Integer> getOutLinks() + { + return outLinks; + } + + public void setOutLinks(Set<Integer> outLinks) + { + this.outLinks = outLinks; + } + + public Set<Integer> getPages() + { + return pages; + } + + public void setPages(Set<Integer> pages) + { + this.pages = pages; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/CategoryDAO.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/CategoryDAO.java index 86a7d5f2..d2f1442e 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/CategoryDAO.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/CategoryDAO.java @@ -29,47 +29,57 @@ * @author Hibernate Tools * @see org.dkpro.jwpl.api.Category */ -public class CategoryDAO extends GenericDAO<Category> { +public class CategoryDAO + extends GenericDAO<Category> +{ - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); - public CategoryDAO(Wikipedia pWiki) { - super(pWiki, Category.class); - } + public CategoryDAO(Wikipedia pWiki) + { + super(pWiki, Category.class); + } - @Override - public void persist(Category transientInstance) { - logger.debug("persisting Category instance"); - super.persist(transientInstance); - } + @Override + public void persist(Category transientInstance) + { + logger.debug("persisting Category instance"); + super.persist(transientInstance); + } - @Override - public void attachDirty(Category instance) { - logger.debug("attaching dirty Category instance"); - super.attachDirty(instance); - } + @Override + public void attachDirty(Category instance) + { + logger.debug("attaching dirty Category instance"); + super.attachDirty(instance); + } - @Override - public void attachClean(Category instance) { - 
logger.debug("attaching clean Category instance"); - super.attachClean(instance); - } + @Override + public void attachClean(Category instance) + { + logger.debug("attaching clean Category instance"); + super.attachClean(instance); + } - @Override - public void delete(Category persistentInstance) { - logger.debug("deleting Category instance"); - super.delete(persistentInstance); - } + @Override + public void delete(Category persistentInstance) + { + logger.debug("deleting Category instance"); + super.delete(persistentInstance); + } - @Override - public Category merge(Category detachedInstance) { - logger.debug("merging Category instance"); - return super.merge(detachedInstance); - } + @Override + public Category merge(Category detachedInstance) + { + logger.debug("merging Category instance"); + return super.merge(detachedInstance); + } - @Override - public Category findById(java.lang.Long id) { - logger.debug("getting Category instance with id: " + id); - return super.findById(id); - } + @Override + public Category findById(java.lang.Long id) + { + logger.debug("getting Category instance with id: " + id); + return super.findById(id); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/GenericDAO.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/GenericDAO.java index 3273bf60..4595a302 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/GenericDAO.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/GenericDAO.java @@ -29,103 +29,124 @@ /** * A common base class for DAO classes. * - * @param <T> The entity type to provide persistence features for. + * @param <T> + * The entity type to provide persistence features for. */ -public abstract class GenericDAO<T> { +public abstract class GenericDAO<T> +{ - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); - private final Wikipedia wiki; - private final SessionFactory sessionFactory; + private final Wikipedia wiki; + private final SessionFactory sessionFactory; - private final String entityClass; + private final String entityClass; - GenericDAO(Wikipedia wiki, Class<?> entityClass) { - this.wiki = wiki; - this.entityClass = entityClass.getName(); - this.sessionFactory = initializeSessionFactory(); - } + GenericDAO(Wikipedia wiki, Class<?> entityClass) + { + this.wiki = wiki; + this.entityClass = entityClass.getName(); + this.sessionFactory = initializeSessionFactory(); + } - private SessionFactory initializeSessionFactory() { - try { - return WikiHibernateUtil.getSessionFactory(wiki.getDatabaseConfiguration()); - } catch (Exception e) { - throw new IllegalStateException("Could not locate SessionFactory in JNDI", e); + private SessionFactory initializeSessionFactory() + { + try { + return WikiHibernateUtil.getSessionFactory(wiki.getDatabaseConfiguration()); + } + catch (Exception e) { + throw new IllegalStateException("Could not locate SessionFactory in JNDI", e); + } } - } - - private SessionFactory getSessionFactory() { - return sessionFactory; - } - - protected Session getSession() { - return getSessionFactory().getCurrentSession(); - } - - public void persist(T transientInstance) { - logger.debug("persisting MetaData instance"); - try { - getSession().persist(transientInstance); - logger.trace("persist successful"); - } catch (RuntimeException re) { - logger.error("Failed persisting " + entityClass + " instance", re); - 
throw re; + + private SessionFactory getSessionFactory() + { + return sessionFactory; } - } - - public void delete(T persistentInstance) { - try { - getSession().remove(persistentInstance); - logger.trace("delete successful"); - } catch (RuntimeException re) { - logger.error("Failed deleting {} instance", entityClass, re); - throw re; + + protected Session getSession() + { + return getSessionFactory().getCurrentSession(); } - } - - public T merge(T detachedInstance) { - try { - T result = (T) getSession().merge(detachedInstance); - logger.trace("merge successful"); - return result; - } catch (RuntimeException re) { - logger.error("Failed merging " + entityClass + " instance", re); - throw re; + + public void persist(T transientInstance) + { + logger.debug("persisting MetaData instance"); + try { + getSession().persist(transientInstance); + logger.trace("persist successful"); + } + catch (RuntimeException re) { + logger.error("Failed persisting " + entityClass + " instance", re); + throw re; + } } - } - - public void attachClean(T instance) { - try { - getSession().buildLockRequest(LockOptions.NONE).lock(instance); - logger.trace("attach successful"); - } catch (RuntimeException re) { - logger.error("Failed attaching " + entityClass + " instance", re); - throw re; + + public void delete(T persistentInstance) + { + try { + getSession().remove(persistentInstance); + logger.trace("delete successful"); + } + catch (RuntimeException re) { + logger.error("Failed deleting {} instance", entityClass, re); + throw re; + } } - } - - public void attachDirty(T instance) { - try { - getSession().merge(instance); - logger.trace("attach successful"); - } catch (RuntimeException re) { - logger.error("attach failed", re); - throw re; + + public T merge(T detachedInstance) + { + try { + T result = (T) getSession().merge(detachedInstance); + logger.trace("merge successful"); + return result; + } + catch (RuntimeException re) { + logger.error("Failed merging " + entityClass + " instance", re); + throw re; + } } - } - - public T findById(Long id) { - try { - T instance = (T) getSession().get(entityClass, id); - if (instance == null) { - logger.trace("get successful, no " + entityClass + " instance found"); - } else { - logger.trace("get successful, instance found"); - } - return instance; - } catch (RuntimeException re) { - logger.error("Failed finding " + entityClass + " instance by id", re); - throw re; + + public void attachClean(T instance) + { + try { + getSession().buildLockRequest(LockOptions.NONE).lock(instance); + logger.trace("attach successful"); + } + catch (RuntimeException re) { + logger.error("Failed attaching " + entityClass + " instance", re); + throw re; + } + } + + public void attachDirty(T instance) + { + try { + getSession().merge(instance); + logger.trace("attach successful"); + } + catch (RuntimeException re) { + logger.error("attach failed", re); + throw re; + } + } + + public T findById(Long id) + { + try { + T instance = (T) getSession().get(entityClass, id); + if (instance == null) { + logger.trace("get successful, no " + entityClass + " instance found"); + } + else { + logger.trace("get successful, instance found"); + } + return instance; + } + catch (RuntimeException re) { + logger.error("Failed finding " + entityClass + " instance by id", re); + throw re; + } } - } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaData.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaData.java index 87663d02..70854505 100644 --- 
a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaData.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaData.java @@ -17,95 +17,115 @@ */ package org.dkpro.jwpl.api.hibernate; -public class MetaData { - - private long id; - - private String language; - private String disambiguationCategory; - private String mainCategory; - private String version; - - private long nrofPages; - private long nrofRedirects; - private long nrofDisambiguationPages; - private long nrofCategories; - - /** - * A no argument constructor as required by Hibernate. - */ - public MetaData() { - } - - public String getDisambiguationCategory() { - return disambiguationCategory; - } - - public void setDisambiguationCategory(String disambiguationCategory) { - this.disambiguationCategory = disambiguationCategory; - } - - public String getMainCategory() { - return mainCategory; - } - - public void setMainCategory(String mainCategory) { - this.mainCategory = mainCategory; - } - - public long getNrofCategories() { - return nrofCategories; - } - - public long getNrofDisambiguationPages() { - return nrofDisambiguationPages; - } - - public long getNrofPages() { - return nrofPages; - } - - public long getNrofRedirects() { - return nrofRedirects; - } - - public long getId() { - return id; - } - - public void setId(long id) { - this.id = id; - } - - public String getLanguage() { - return language; - } - - public void setLanguage(String language) { - this.language = language; - } - - public void setNrofCategories(long nrofCategories) { - this.nrofCategories = nrofCategories; - } - - public void setNrofDisambiguationPages(long nrofDisambiguationPages) { - this.nrofDisambiguationPages = nrofDisambiguationPages; - } - - public void setNrofPages(long nrofPages) { - this.nrofPages = nrofPages; - } - - public void setNrofRedirects(long nrofRedirects) { - this.nrofRedirects = nrofRedirects; - } - - public String getVersion() { - return version; - } - - public void setVersion(String version) { - this.version = version; - } +public class MetaData +{ + + private long id; + + private String language; + private String disambiguationCategory; + private String mainCategory; + private String version; + + private long nrofPages; + private long nrofRedirects; + private long nrofDisambiguationPages; + private long nrofCategories; + + /** + * A no argument constructor as required by Hibernate. 
+ */ + public MetaData() + { + } + + public String getDisambiguationCategory() + { + return disambiguationCategory; + } + + public void setDisambiguationCategory(String disambiguationCategory) + { + this.disambiguationCategory = disambiguationCategory; + } + + public String getMainCategory() + { + return mainCategory; + } + + public void setMainCategory(String mainCategory) + { + this.mainCategory = mainCategory; + } + + public long getNrofCategories() + { + return nrofCategories; + } + + public long getNrofDisambiguationPages() + { + return nrofDisambiguationPages; + } + + public long getNrofPages() + { + return nrofPages; + } + + public long getNrofRedirects() + { + return nrofRedirects; + } + + public long getId() + { + return id; + } + + public void setId(long id) + { + this.id = id; + } + + public String getLanguage() + { + return language; + } + + public void setLanguage(String language) + { + this.language = language; + } + + public void setNrofCategories(long nrofCategories) + { + this.nrofCategories = nrofCategories; + } + + public void setNrofDisambiguationPages(long nrofDisambiguationPages) + { + this.nrofDisambiguationPages = nrofDisambiguationPages; + } + + public void setNrofPages(long nrofPages) + { + this.nrofPages = nrofPages; + } + + public void setNrofRedirects(long nrofRedirects) + { + this.nrofRedirects = nrofRedirects; + } + + public String getVersion() + { + return version; + } + + public void setVersion(String version) + { + this.version = version; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaDataDAO.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaDataDAO.java index a9003dda..0d0687f2 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaDataDAO.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/MetaDataDAO.java @@ -30,47 +30,58 @@ * @author Hibernate Tools * @see org.dkpro.jwpl.api.MetaData */ -public class MetaDataDAO extends GenericDAO<MetaData> implements WikiConstants { +public class MetaDataDAO + extends GenericDAO<MetaData> + implements WikiConstants +{ - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); - public MetaDataDAO(Wikipedia wiki) { - super(wiki, MetaData.class); - } + public MetaDataDAO(Wikipedia wiki) + { + super(wiki, MetaData.class); + } - @Override - public void persist(MetaData transientInstance) { - logger.debug("persisting MetaData instance"); - super.persist(transientInstance); - } + @Override + public void persist(MetaData transientInstance) + { + logger.debug("persisting MetaData instance"); + super.persist(transientInstance); + } - @Override - public void attachDirty(MetaData instance) { - logger.debug("attaching dirty MetaData instance"); - super.attachDirty(instance); - } + @Override + public void attachDirty(MetaData instance) + { + logger.debug("attaching dirty MetaData instance"); + super.attachDirty(instance); + } - @Override - public void attachClean(MetaData instance) { - logger.debug("attaching clean MetaData instance"); - super.attachClean(instance); - } + @Override + public void attachClean(MetaData instance) + { + logger.debug("attaching clean MetaData instance"); + super.attachClean(instance); + } - @Override - public void delete(MetaData persistentInstance) { - logger.debug("deleting MetaData instance"); - super.delete(persistentInstance); - } + @Override + public void 
delete(MetaData persistentInstance) + { + logger.debug("deleting MetaData instance"); + super.delete(persistentInstance); + } - @Override - public MetaData merge(MetaData detachedInstance) { - logger.debug("merging MetaData instance"); - return super.merge(detachedInstance); - } + @Override + public MetaData merge(MetaData detachedInstance) + { + logger.debug("merging MetaData instance"); + return super.merge(detachedInstance); + } - @Override - public MetaData findById(java.lang.Long id) { - logger.debug("getting MetaData instance with id: " + id); - return super.findById(id); - } + @Override + public MetaData findById(java.lang.Long id) + { + logger.debug("getting MetaData instance with id: " + id); + return super.findById(id); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Page.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Page.java index 4a25ac2e..aa8f416b 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Page.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/Page.java @@ -21,104 +21,125 @@ import java.util.Set; /** - * The page class that is actually persisted by Hibernate. - * It is accessed via a equally named class in the api package to hide session management from the user. + * The page class that is actually persisted by Hibernate. It is accessed via a equally named class + * in the api package to hide session management from the user. */ -public class Page { - - private long id; - private int pageId; - private String name; - private String text; - private boolean isDisambiguation; - private Set<Integer> inLinks = new HashSet<>(); - private Set<Integer> outLinks = new HashSet<>(); - private Set<Integer> categories = new HashSet<>(); - private Set<String> redirects = new HashSet<>(); - - /** - * A no argument constructor as required by Hibernate. 
- */ - public Page() { - } - - public long getId() { - return id; - } - - @SuppressWarnings("unused") - private void setId(long id) { - this.id = id; - } - - public int getPageId() { - return pageId; - } - - public void setPageId(int pageId) { - this.pageId = pageId; - } - - public Set<Integer> getCategories() { - return categories; - } - - public void setCategories(Set<Integer> categories) { - this.categories = categories; - } - - public Set<Integer> getInLinks() { - return inLinks; - } - - public void setInLinks(Set<Integer> inLinks) { - this.inLinks = inLinks; - } - - public String getName() { - return name; - } - - public void setName(String name) { - this.name = name; - } - - public Set<Integer> getOutLinks() { - return outLinks; - } - - public int getOutDegree() { - return outLinks.size(); - } - - public void setOutLinks(Set<Integer> outLinks) { - this.outLinks = outLinks; - } - - public Set<String> getRedirects() { - return redirects; - } - - public void setRedirects(Set<String> redirects) { - this.redirects = redirects; - } - - public String getText() { - return text; - } - - public void setText(String text) { - this.text = text; - } - - public boolean getIsDisambiguation() { - return isDisambiguation; - } - - public void setIsDisambiguation(Boolean isDisambiguation) { - if (isDisambiguation == null) { - isDisambiguation = false; - } - this.isDisambiguation = isDisambiguation; - } +public class Page +{ + + private long id; + private int pageId; + private String name; + private String text; + private boolean isDisambiguation; + private Set<Integer> inLinks = new HashSet<>(); + private Set<Integer> outLinks = new HashSet<>(); + private Set<Integer> categories = new HashSet<>(); + private Set<String> redirects = new HashSet<>(); + + /** + * A no argument constructor as required by Hibernate. 
+ */ + public Page() + { + } + + public long getId() + { + return id; + } + + @SuppressWarnings("unused") + private void setId(long id) + { + this.id = id; + } + + public int getPageId() + { + return pageId; + } + + public void setPageId(int pageId) + { + this.pageId = pageId; + } + + public Set<Integer> getCategories() + { + return categories; + } + + public void setCategories(Set<Integer> categories) + { + this.categories = categories; + } + + public Set<Integer> getInLinks() + { + return inLinks; + } + + public void setInLinks(Set<Integer> inLinks) + { + this.inLinks = inLinks; + } + + public String getName() + { + return name; + } + + public void setName(String name) + { + this.name = name; + } + + public Set<Integer> getOutLinks() + { + return outLinks; + } + + public int getOutDegree() + { + return outLinks.size(); + } + + public void setOutLinks(Set<Integer> outLinks) + { + this.outLinks = outLinks; + } + + public Set<String> getRedirects() + { + return redirects; + } + + public void setRedirects(Set<String> redirects) + { + this.redirects = redirects; + } + + public String getText() + { + return text; + } + + public void setText(String text) + { + this.text = text; + } + + public boolean getIsDisambiguation() + { + return isDisambiguation; + } + + public void setIsDisambiguation(Boolean isDisambiguation) + { + if (isDisambiguation == null) { + isDisambiguation = false; + } + this.isDisambiguation = isDisambiguation; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageDAO.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageDAO.java index dfaf4d29..b583aadc 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageDAO.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageDAO.java @@ -29,47 +29,57 @@ * @author Hibernate Tools * @see org.dkpro.jwpl.api.Page */ -public class PageDAO extends GenericDAO<Page> { +public class PageDAO + extends GenericDAO<Page> +{ - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); - public PageDAO(Wikipedia pWiki) { - super(pWiki, Page.class); - } + public PageDAO(Wikipedia pWiki) + { + super(pWiki, Page.class); + } - @Override - public void persist(Page transientInstance) { - logger.debug("persisting Page instance"); - super.persist(transientInstance); - } + @Override + public void persist(Page transientInstance) + { + logger.debug("persisting Page instance"); + super.persist(transientInstance); + } - @Override - public void attachDirty(Page instance) { - logger.debug("attaching dirty Page instance"); - super.attachDirty(instance); - } + @Override + public void attachDirty(Page instance) + { + logger.debug("attaching dirty Page instance"); + super.attachDirty(instance); + } - @Override - public void attachClean(Page instance) { - logger.debug("attaching clean Page instance"); - super.attachClean(instance); - } + @Override + public void attachClean(Page instance) + { + logger.debug("attaching clean Page instance"); + super.attachClean(instance); + } - @Override - public void delete(Page persistentInstance) { - logger.debug("deleting Page instance"); - super.delete(persistentInstance); - } + @Override + public void delete(Page persistentInstance) + { + logger.debug("deleting Page instance"); + super.delete(persistentInstance); + } - @Override - public Page merge(Page detachedInstance) { - logger.debug("merging Page 
instance"); - return super.merge(detachedInstance); - } + @Override + public Page merge(Page detachedInstance) + { + logger.debug("merging Page instance"); + return super.merge(detachedInstance); + } - @Override - public Page findById(java.lang.Long id) { - logger.debug("getting Page instance with id: " + id); - return super.findById(id); - } + @Override + public Page findById(java.lang.Long id) + { + logger.debug("getting Page instance with id: " + id); + return super.findById(id); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageMapLine.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageMapLine.java index db933e98..41ff5c78 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageMapLine.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/PageMapLine.java @@ -17,56 +17,68 @@ */ package org.dkpro.jwpl.api.hibernate; -public class PageMapLine { - private long id; - private String name; - private int pageID; - private String stem; - private String lemma; +public class PageMapLine +{ + private long id; + private String name; + private int pageID; + private String stem; + private String lemma; - /** - * A no argument constructor as required by Hibernate. - */ - public PageMapLine() { - } + /** + * A no argument constructor as required by Hibernate. + */ + public PageMapLine() + { + } - public long getId() { - return id; - } + public long getId() + { + return id; + } - public void setId(long id) { - this.id = id; - } + public void setId(long id) + { + this.id = id; + } - public String getName() { - return name; - } + public String getName() + { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) + { + this.name = name; + } - public int getPageID() { - return pageID; - } + public int getPageID() + { + return pageID; + } - public void setPageID(int pageID) { - this.pageID = pageID; - } + public void setPageID(int pageID) + { + this.pageID = pageID; + } - public String getLemma() { - return lemma; - } + public String getLemma() + { + return lemma; + } - public void setLemma(String lemma) { - this.lemma = lemma; - } + public void setLemma(String lemma) + { + this.lemma = lemma; + } - public String getStem() { - return stem; - } + public String getStem() + { + return stem; + } - public void setStem(String stem) { - this.stem = stem; - } + public void setStem(String stem) + { + this.stem = stem; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/WikiHibernateUtil.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/WikiHibernateUtil.java index 4e310e71..a736e92e 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/WikiHibernateUtil.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/hibernate/WikiHibernateUtil.java @@ -27,41 +27,48 @@ import org.hibernate.boot.registry.StandardServiceRegistryBuilder; import org.hibernate.cfg.Configuration; -public class WikiHibernateUtil implements WikiConstants { +public class WikiHibernateUtil + implements WikiConstants +{ private static final Map<String, SessionFactory> sessionFactoryMap = new HashMap<>(); - public static SessionFactory getSessionFactory(DatabaseConfiguration config) { + public static SessionFactory getSessionFactory(DatabaseConfiguration config) + { if (config.getLanguage() == null) { - throw new ExceptionInInitializerError("Database configuration error. 
'Language' is empty."); + throw new ExceptionInInitializerError( + "Database configuration error. 'Language' is empty."); } else if (config.getHost() == null) { throw new ExceptionInInitializerError("Database configuration error. 'Host' is empty."); } else if (config.getDatabase() == null) { - throw new ExceptionInInitializerError("Database configuration error. 'Database' is empty."); + throw new ExceptionInInitializerError( + "Database configuration error. 'Database' is empty."); } - String uniqueSessionKey = config.getLanguage().toString() + config.getHost() + config.getDatabase(); + String uniqueSessionKey = config.getLanguage().toString() + config.getHost() + + config.getDatabase(); if (!sessionFactoryMap.containsKey(uniqueSessionKey)) { - Configuration configuration = getConfiguration(config); - StandardServiceRegistryBuilder ssrb = new StandardServiceRegistryBuilder().applySettings(configuration.getProperties()); + Configuration configuration = getConfiguration(config); + StandardServiceRegistryBuilder ssrb = new StandardServiceRegistryBuilder() + .applySettings(configuration.getProperties()); SessionFactory sessionFactory = configuration.buildSessionFactory(ssrb.build()); sessionFactoryMap.put(uniqueSessionKey, sessionFactory); } return sessionFactoryMap.get(uniqueSessionKey); } - - private static Properties getProperties(DatabaseConfiguration config) { - String user = config.getUser(); + private static Properties getProperties(DatabaseConfiguration config) + { + String user = config.getUser(); String password = config.getPassword(); /* * Ensures explicit DBMS type specific configuration for hsqldb from junit tests context */ - String jdbcURL = config.getJdbcURL(); + String jdbcURL = config.getJdbcURL(); String databaseDriverClass = config.getDatabaseDriver(); Properties p = new Properties(); @@ -69,81 +76,85 @@ private static Properties getProperties(DatabaseConfiguration config) { boolean useMariaDB = false; boolean useHSQL = false; // XXX other dialects might be interesting here as well... - if(jdbcURL.toLowerCase().contains("mysql")) { + if (jdbcURL.toLowerCase().contains("mysql")) { useMySQL = true; - } else if(jdbcURL.toLowerCase().contains("mariadb")) { + } + else if (jdbcURL.toLowerCase().contains("mariadb")) { useMariaDB = true; - } else if(jdbcURL.toLowerCase().contains("hsql")) { + } + else if (jdbcURL.toLowerCase().contains("hsql")) { useHSQL = true; } // SQL dialect - if(useMySQL) { - p.setProperty("hibernate.dialect","org.hibernate.dialect.MySQLDialect"); - } else if(useMariaDB) { - p.setProperty("hibernate.dialect","org.hibernate.dialect.MariaDBDialect"); - } else if(useHSQL) { - p.setProperty("hibernate.dialect","org.hibernate.dialect.HSQLDialect"); + if (useMySQL) { + p.setProperty("hibernate.dialect", "org.hibernate.dialect.MySQLDialect"); + } + else if (useMariaDB) { + p.setProperty("hibernate.dialect", "org.hibernate.dialect.MariaDBDialect"); + } + else if (useHSQL) { + p.setProperty("hibernate.dialect", "org.hibernate.dialect.HSQLDialect"); } // Database connection settings p.setProperty("hibernate.connection.driver_class", databaseDriverClass); p.setProperty("hibernate.connection.url", jdbcURL); /* - * Needed to ensure working hsqldb queries - don't remove it...! + * Needed to ensure working hsqldb queries - don't remove it...! 
*/ - p.setProperty("hibernate.connection.useUnicode","true"); + p.setProperty("hibernate.connection.useUnicode", "true"); p.setProperty("hibernate.connection.characterEncoding", "UTF-8"); p.setProperty("hibernate.connection.username", user); p.setProperty("hibernate.connection.password", password); // JDBC connection pool (use the built-in) --> - p.setProperty("hibernate.connection.pool_size","5"); + p.setProperty("hibernate.connection.pool_size", "5"); // Enable Hibernate's automatic session context management - p.setProperty("hibernate.current_session_context_class","thread"); + p.setProperty("hibernate.current_session_context_class", "thread"); // Disable the second-level cache - p.setProperty("hibernate.cache.provider_class","org.hibernate.cache.NoCacheProvider"); + p.setProperty("hibernate.cache.provider_class", "org.hibernate.cache.NoCacheProvider"); // Echo all executed SQL to stdout - p.setProperty("hibernate.show_sql","false"); + p.setProperty("hibernate.show_sql", "false"); // Do only update schema on changes - if(useMySQL || useMariaDB) { - p.setProperty("hibernate.hbm2ddl.auto","validate"); + if (useMySQL || useMariaDB) { + p.setProperty("hibernate.hbm2ddl.auto", "validate"); } - if(useHSQL) { - p.setProperty("hibernate.hbm2ddl.auto","none"); + if (useHSQL) { + p.setProperty("hibernate.hbm2ddl.auto", "none"); } // Avoid long running connection acquisition: - // Important performance fix to obtain jdbc connections a lot faster by avoiding metadata fetching - p.setProperty("hibernate.temp.use_jdbc_metadata_defaults","false"); + // Important performance fix to obtain jdbc connections a lot faster by avoiding metadata + // fetching + p.setProperty("hibernate.temp.use_jdbc_metadata_defaults", "false"); - if(useMySQL) { + if (useMySQL) { // Set C3P0 Connection Pool in case somebody wants to use it in production settings - // if no C3P0 is available at runtime, related warnings can be ignored safely as the built-in CP will be used. - p.setProperty("hibernate.c3p0.acquire_increment","3"); - p.setProperty("hibernate.c3p0.idle_test_period","300"); - p.setProperty("hibernate.c3p0.min_size","3"); - p.setProperty("hibernate.c3p0.max_size","15"); - p.setProperty("hibernate.c3p0.max_statements","100"); - p.setProperty("hibernate.c3p0.timeout","1000"); + // if no C3P0 is available at runtime, related warnings can be ignored safely as the + // built-in CP will be used. 
+ p.setProperty("hibernate.c3p0.acquire_increment", "3"); + p.setProperty("hibernate.c3p0.idle_test_period", "300"); + p.setProperty("hibernate.c3p0.min_size", "3"); + p.setProperty("hibernate.c3p0.max_size", "15"); + p.setProperty("hibernate.c3p0.max_statements", "100"); + p.setProperty("hibernate.c3p0.timeout", "1000"); } return p; } - private static Configuration getConfiguration(DatabaseConfiguration config) { - Configuration cfg = new Configuration() - .addClass(Category.class) - .addClass(MetaData.class) - .addClass(Page.class) - .addClass(PageMapLine.class) -// .addClass(RelatednessCacheLine.class) - .addProperties(getProperties(config)); + private static Configuration getConfiguration(DatabaseConfiguration config) + { + Configuration cfg = new Configuration().addClass(Category.class).addClass(MetaData.class) + .addClass(Page.class).addClass(PageMapLine.class) + // .addClass(RelatednessCacheLine.class) + .addProperties(getProperties(config)); return cfg; } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/PlainTextConverter.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/PlainTextConverter.java index 38c4222e..7b810805 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/PlainTextConverter.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/PlainTextConverter.java @@ -76,521 +76,580 @@ import de.fau.cs.osr.utils.StringTools; /** - * A visitor to convert an article AST into a plain text representation. To - * better understand the visitor pattern as implemented by the Visitor class, - * please take a look at the following resources: + * A visitor to convert an article AST into a plain text representation. To better understand the + * visitor pattern as implemented by the Visitor class, please take a look at the following + * resources: * <ul> - * <li><a href="http://en.wikipedia.org/wiki/Visitor_pattern">Visitor Pattern (classic pattern)</a></li> - * <li><a href="http://www.javaworld.com/javaworld/javatips/jw-javatip98.html">the version we use here</a></li> + * <li><a href="http://en.wikipedia.org/wiki/Visitor_pattern">Visitor Pattern (classic + * pattern)</a></li> + * <li><a href="http://www.javaworld.com/javaworld/javatips/jw-javatip98.html">the version we use + * here</a></li> * </ul> * <p> - * The methods needed to descend into an AST and visit the children of a given - * node <code>n</code> are + * The methods needed to descend into an AST and visit the children of a given node <code>n</code> + * are * <ul> * <li><code>dispatch(n)</code> - visit node <code>n</code>,</li> - * <li><code>iterate(n)</code> - visit the <b>children</b> of node - * <code>n</code>,</li> - * <li><code>map(n)</code> - visit the <b>children</b> of node <code>n</code> - * and gather the return values of the <code>visit()</code> calls in a list,</li> - * <li><code>mapInPlace(n)</code> - visit the <b>children</b> of node - * <code>n</code> and replace each child node <code>c</code> with the return - * value of the call to <code>visit(c)</code>.</li> + * <li><code>iterate(n)</code> - visit the <b>children</b> of node <code>n</code>,</li> + * <li><code>map(n)</code> - visit the <b>children</b> of node <code>n</code> and gather the return + * values of the <code>visit()</code> calls in a list,</li> + * <li><code>mapInPlace(n)</code> - visit the <b>children</b> of node <code>n</code> and replace + * each child node <code>c</code> with the return value of the call to <code>visit(c)</code>.</li> * </ul> * * @author Open Source Research Group, University 
of Erlangen-Nürnberg */ -public class PlainTextConverter extends AstVisitor<WtNode> { - - private static final Pattern ws = Pattern.compile("\\s+"); - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private final WikiConfig config; - - private final int wrapCol; - - private StringBuilder sb; - - private StringBuilder line; - - private boolean pastBod; - - private int needNewlines; - - private boolean needSpace; - private boolean noWrap; - private final boolean enumerateSections; - - private LinkedList<Integer> sections; - - /* Things needed for processing tables */ - private List<List<String>> rows; - private List<String> currentRow; - private StringBuilder currentCell; - private String currentLinkTitleInCell; - - // ========================================================================= - - - /** - * Creates a new visitor that produces a plain text String representation - * of a parsed Wikipedia article. - * s - */ - public PlainTextConverter() { - this(DefaultConfigEnWp.generate(), false, Integer.MAX_VALUE); //no fixed textwidth - } - - /** - * Creates a new visitor that produces a plain text String representation - * of a parsed Wikipedia article. - * - * @param enumerateSection {@code True}, if sections should be enumerated in the output, {@code false} otherwise. - */ - public PlainTextConverter(boolean enumerateSection) { - this(DefaultConfigEnWp.generate(), enumerateSection, Integer.MAX_VALUE); //no fixed textwidth - } - - /** - * Creates a new visitor that produces a plain text String representation - * of a parsed Wikipedia article. - * - * @param config A valid {@link WikiConfig} instance. Must not be {@code null}. - * @param enumerateSections {@code True}, if sections should be enumerated in the output, {@code false} otherwise. - * @param wrapCol Defines the max length of a line. longer lines will be broken. - */ - public PlainTextConverter(WikiConfig config, boolean enumerateSections, int wrapCol) { - this.config = config; - this.wrapCol = wrapCol; - this.enumerateSections = enumerateSections; - } - - @Override - protected WtNode before(WtNode node) { - // This method is called by go() before visitation starts - sb = new StringBuilder(); - line = new StringBuilder(); - pastBod = false; - needNewlines = 0; - needSpace = false; - noWrap = false; - sections = new LinkedList<>(); - rows = new ArrayList<>(); - return super.before(node); - } - - @Override - protected Object after(WtNode node, Object result) { - finishLine(); - - // This method is called by go() after visitation has finished - // The return value will be passed to go() which passes it to the caller - return sb.toString(); - } - - // ========================================================================= - - /* - * We CAN NOT allow this method being implemented here, as it will clash with - * 'visit(de.fau.cs.osr.ptk.common.ast.AstText)' otherwise at runtime. - * Thus, we are ignoring it for now. 
(see #160) - * - public void visit(WtNode n) - { - // Fallback for all nodes that are not explicitly handled below -// write("<"); -// write(n.getNodeName()); -// write(" />"); - } - */ - - public void visit(WtNodeList n) { - iterate(n); - } - - public void visit(WtPage p) { - iterate(p); - } - - public void visit(AstText text) { - if (currentCell != null) { - // handles table cell content - currentCell.append(text.getContent()); - } else { - // regular case for all nodes that are not explicitly handled below - write(text.getContent()); - } - - } - - public void visit(WtWhitespace w) { - write(" "); - } - - public void visit(WtBold b) { - //write("**"); - iterate(b); - //write("**"); - } - - public void visit(WtItalics i) { - //write("//"); - iterate(i); - //write("//"); - } - - public void visit(WtXmlCharRef cr) { - write(Character.toChars(cr.getCodePoint())); - } - - public void visit(WtXmlEntityRef er) { - - String ch = er.getResolved(); - if (ch == null) { - write('&'); - write(er.getName()); - write(';'); - } else { - write(ch); - } - } - - public void visit(WtUrl url) { - write(url.getProtocol()); - write(':'); - write(url.getPath()); - } - - public void visit(WtExternalLink link) { - //TODO How should we represent external links in the plain text output? - write('['); - iterate(link.getTitle()); - write(']'); - } - - public void visit(WtInternalLink link) { - currentLinkTitleInCell = null; - try { - PageTitle page = PageTitle.make(config, link.getTarget().getAsString()); - if (page.getNamespace().equals(config.getNamespace("Category"))) { - return; - } - } catch (LinkTargetException e) { - logger.warn(e.getLocalizedMessage()); - } - - write(link.getPrefix()); - WtLinkTitle pageTitle = link.getTitle(); - - if (pageTitle == null || pageTitle.isEmpty()) { - // remember this as it could be needed to process table rows correctly - currentLinkTitleInCell = link.getTarget().getAsString(); - if (currentLinkTitleInCell.contains("#")) { - // only take the first part of the string, no anchors on pages (divided by '#' symbols) - currentLinkTitleInCell = currentLinkTitleInCell.split(Pattern.quote("#"), 2)[0]; - } - // for regular cases: just write the original value here - if (currentCell == null) { - write(link.getTarget().getAsString()); - } - } else { - iterate(link.getTitle()); - } - write(link.getPostfix()); - } - - public void visit(WtSection s) { - finishLine(); - StringBuilder saveSb = sb; - boolean saveNoWrap = noWrap; - - sb = new StringBuilder(); - noWrap = true; - - iterate(s.getHeading()); - finishLine(); - String title = sb.toString().trim(); - - sb = saveSb; - - if (s.getLevel() >= 1) { - while (sections.size() > s.getLevel()) { - sections.removeLast(); - } - while (sections.size() < s.getLevel()) { - sections.add(1); - } - - if (enumerateSections) { - StringBuilder sb2 = new StringBuilder(); - for (int i = 0; i < sections.size(); ++i) { - if (i < 1) { - continue; - } - - sb2.append(sections.get(i)); - sb2.append('.'); +public class PlainTextConverter + extends AstVisitor<WtNode> +{ + + private static final Pattern ws = Pattern.compile("\\s+"); + + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + private final WikiConfig config; + + private final int wrapCol; + + private StringBuilder sb; + + private StringBuilder line; + + private boolean pastBod; + + private int needNewlines; + + private boolean needSpace; + private boolean noWrap; + private final boolean enumerateSections; + + private LinkedList<Integer> sections; + + /* 
Things needed for processing tables */ + private List<List<String>> rows; + private List<String> currentRow; + private StringBuilder currentCell; + private String currentLinkTitleInCell; + + // ========================================================================= + + /** + * Creates a new visitor that produces a plain text String representation of a parsed Wikipedia + * article. s + */ + public PlainTextConverter() + { + this(DefaultConfigEnWp.generate(), false, Integer.MAX_VALUE); // no fixed textwidth + } + + /** + * Creates a new visitor that produces a plain text String representation of a parsed Wikipedia + * article. + * + * @param enumerateSection + * {@code True}, if sections should be enumerated in the output, {@code false} + * otherwise. + */ + public PlainTextConverter(boolean enumerateSection) + { + this(DefaultConfigEnWp.generate(), enumerateSection, Integer.MAX_VALUE); // no fixed + // textwidth + } + + /** + * Creates a new visitor that produces a plain text String representation of a parsed Wikipedia + * article. + * + * @param config + * A valid {@link WikiConfig} instance. Must not be {@code null}. + * @param enumerateSections + * {@code True}, if sections should be enumerated in the output, {@code false} + * otherwise. + * @param wrapCol + * Defines the max length of a line. longer lines will be broken. + */ + public PlainTextConverter(WikiConfig config, boolean enumerateSections, int wrapCol) + { + this.config = config; + this.wrapCol = wrapCol; + this.enumerateSections = enumerateSections; + } + + @Override + protected WtNode before(WtNode node) + { + // This method is called by go() before visitation starts + sb = new StringBuilder(); + line = new StringBuilder(); + pastBod = false; + needNewlines = 0; + needSpace = false; + noWrap = false; + sections = new LinkedList<>(); + rows = new ArrayList<>(); + return super.before(node); + } + + @Override + protected Object after(WtNode node, Object result) + { + finishLine(); + + // This method is called by go() after visitation has finished + // The return value will be passed to go() which passes it to the caller + return sb.toString(); + } + + // ========================================================================= + + /* + * We CAN NOT allow this method being implemented here, as it will clash with + * 'visit(de.fau.cs.osr.ptk.common.ast.AstText)' otherwise at runtime. Thus, we are ignoring it + * for now. 
(see #160) + * + * public void visit(WtNode n) { // Fallback for all nodes that are not explicitly handled below + * // write("<"); // write(n.getNodeName()); // write(" />"); } + */ + + public void visit(WtNodeList n) + { + iterate(n); + } + + public void visit(WtPage p) + { + iterate(p); + } + + public void visit(AstText text) + { + if (currentCell != null) { + // handles table cell content + currentCell.append(text.getContent()); + } + else { + // regular case for all nodes that are not explicitly handled below + write(text.getContent()); } - if (sb2.length() > 0) { - sb2.append(' '); + } + + public void visit(WtWhitespace w) + { + write(" "); + } + + public void visit(WtBold b) + { + // write("**"); + iterate(b); + // write("**"); + } + + public void visit(WtItalics i) + { + // write("//"); + iterate(i); + // write("//"); + } + + public void visit(WtXmlCharRef cr) + { + write(Character.toChars(cr.getCodePoint())); + } + + public void visit(WtXmlEntityRef er) + { + + String ch = er.getResolved(); + if (ch == null) { + write('&'); + write(er.getName()); + write(';'); } - sb2.append(title); - title = sb2.toString(); - } - } - - newline(1); - write(title); - newline(1); -// write(StringUtils.strrep('-', title.length())); -// newline(1); - - noWrap = saveNoWrap; - - iterate(s.getBody()); - - while (sections.size() > s.getLevel()) { - sections.removeLast(); - } - sections.add(sections.removeLast() + 1); - } - - public void visit(WtParagraph p) { - iterate(p); - newline(1); - } - - public void visit(WtHorizontalRule hr) { - newline(1); -// write(StringUtils.strrep('-', wrapCol)); -// newline(1); - } - - public void visit(WtXmlElement e) { - if (e.getName().equalsIgnoreCase("br")) { - newline(1); - } else { - iterate(e.getBody()); - } - } - - public void visit(WtXmlEndTag t) { - iterate(t); - } - - public void visit(WtXmlAttribute n) { - // ignore formatting information from xml attributes as the result is expected in plain text - } - - public void visit(WtListItem n) { - iterate(n); - } - - /** - * Called when a {@link WtTable table structure} is about to be processed. - * - * @param n A node representing a table. - */ - public void visit(WtTable n) { - iterate(n); - } - - /** - * Called when an inner {@link WtTableImplicitTableBody table body} is about to be processed. - * - * @param n A node representing a table body. - */ - public void visit(WtTableImplicitTableBody n) { - iterate(n); - } - - /** - * Called when a {@link WtTableCaption table caption} is about to be processed. - * - * @param n A node representing a table caption. - */ - public void visit(WtTableCaption n) { - iterate(n); - } - - /** - * Called when a {@link WtTableRow table row} is about to be processed. - * - * @param n A node representing a table row. - */ - public void visit(WtTableRow n) { - if (currentRow == null) { - currentRow = new ArrayList<>(); - iterate(n); - if (currentRow.size() > 0) { - rows.add(currentRow); - } - if (currentRow.size() == n.getBody().size()) { - StringBuilder tableRowFormatted = new StringBuilder(); - for (int i = 0; i < currentRow.size(); i++) { - tableRowFormatted.append(currentRow.get(i)); - if (i + 1 < currentRow.size()) { - // appending a separator char only in between cells here - tableRowFormatted.append('|'); - } + else { + write(ch); } - writeWord(tableRowFormatted.toString()); - } - currentRow = null; - } - } - - /** - * Called when a header {@link WtTableHeader cell} is about to be processed. - * - * @param n A node representing a table header cell. 
- */ - public void visit(WtTableHeader n) { - processCellContent(n); - } - - /** - * Called when a regular {@link WtTableCell cell} is about to be processed. - * - * @param n A node representing a table header cell. - */ - public void visit(WtTableCell n) { - processCellContent(n); - } - - private void processCellContent(WtInnerNode2 n) { - if (currentRow != null) { - currentCell = new StringBuilder(); - iterate(n); - String cellValue = currentCell.toString().trim(); - if (currentLinkTitleInCell != null) { - cellValue = currentLinkTitleInCell + " " + cellValue; + } + + public void visit(WtUrl url) + { + write(url.getProtocol()); + write(':'); + write(url.getPath()); + } + + public void visit(WtExternalLink link) + { + // TODO How should we represent external links in the plain text output? + write('['); + iterate(link.getTitle()); + write(']'); + } + + public void visit(WtInternalLink link) + { currentLinkTitleInCell = null; - } - currentRow.add(cellValue); - currentCell = null; + try { + PageTitle page = PageTitle.make(config, link.getTarget().getAsString()); + if (page.getNamespace().equals(config.getNamespace("Category"))) { + return; + } + } + catch (LinkTargetException e) { + logger.warn(e.getLocalizedMessage()); + } + + write(link.getPrefix()); + WtLinkTitle pageTitle = link.getTitle(); + + if (pageTitle == null || pageTitle.isEmpty()) { + // remember this as it could be needed to process table rows correctly + currentLinkTitleInCell = link.getTarget().getAsString(); + if (currentLinkTitleInCell.contains("#")) { + // only take the first part of the string, no anchors on pages (divided by '#' + // symbols) + currentLinkTitleInCell = currentLinkTitleInCell.split(Pattern.quote("#"), 2)[0]; + } + // for regular cases: just write the original value here + if (currentCell == null) { + write(link.getTarget().getAsString()); + } + } + else { + iterate(link.getTitle()); + } + write(link.getPostfix()); } - } + public void visit(WtSection s) + { + finishLine(); + StringBuilder saveSb = sb; + boolean saveNoWrap = noWrap; + + sb = new StringBuilder(); + noWrap = true; + + iterate(s.getHeading()); + finishLine(); + String title = sb.toString().trim(); + + sb = saveSb; + + if (s.getLevel() >= 1) { + while (sections.size() > s.getLevel()) { + sections.removeLast(); + } + while (sections.size() < s.getLevel()) { + sections.add(1); + } + + if (enumerateSections) { + StringBuilder sb2 = new StringBuilder(); + for (int i = 0; i < sections.size(); ++i) { + if (i < 1) { + continue; + } + + sb2.append(sections.get(i)); + sb2.append('.'); + } + + if (sb2.length() > 0) { + sb2.append(' '); + } + sb2.append(title); + title = sb2.toString(); + } + } + + newline(1); + write(title); + newline(1); + // write(StringUtils.strrep('-', title.length())); + // newline(1); - // ========================================================================= - // Stuff we want to hide + noWrap = saveNoWrap; - public void visit(WtImageLink n) { - } + iterate(s.getBody()); + + while (sections.size() > s.getLevel()) { + sections.removeLast(); + } + sections.add(sections.removeLast() + 1); + } + + public void visit(WtParagraph p) + { + iterate(p); + newline(1); + } + + public void visit(WtHorizontalRule hr) + { + newline(1); + // write(StringUtils.strrep('-', wrapCol)); + // newline(1); + } + + public void visit(WtXmlElement e) + { + if (e.getName().equalsIgnoreCase("br")) { + newline(1); + } + else { + iterate(e.getBody()); + } + } - public void visit(WtIllegalCodePoint n) { - } + public void visit(WtXmlEndTag t) + { + iterate(t); 
+ } - public void visit(WtXmlComment n) { - } + public void visit(WtXmlAttribute n) + { + // ignore formatting information from xml attributes as the result is expected in plain text + } - public void visit(WtTemplate n) { - } + public void visit(WtListItem n) + { + iterate(n); + } - public void visit(WtTemplateArgument n) { - } + /** + * Called when a {@link WtTable table structure} is about to be processed. + * + * @param n + * A node representing a table. + */ + public void visit(WtTable n) + { + iterate(n); + } - public void visit(WtTemplateParameter n) { - } + /** + * Called when an inner {@link WtTableImplicitTableBody table body} is about to be processed. + * + * @param n + * A node representing a table body. + */ + public void visit(WtTableImplicitTableBody n) + { + iterate(n); + } - public void visit(WtTagExtension n) { - } + /** + * Called when a {@link WtTableCaption table caption} is about to be processed. + * + * @param n + * A node representing a table caption. + */ + public void visit(WtTableCaption n) + { + iterate(n); + } + /** + * Called when a {@link WtTableRow table row} is about to be processed. + * + * @param n + * A node representing a table row. + */ + public void visit(WtTableRow n) + { + if (currentRow == null) { + currentRow = new ArrayList<>(); + iterate(n); + if (currentRow.size() > 0) { + rows.add(currentRow); + } + if (currentRow.size() == n.getBody().size()) { + StringBuilder tableRowFormatted = new StringBuilder(); + for (int i = 0; i < currentRow.size(); i++) { + tableRowFormatted.append(currentRow.get(i)); + if (i + 1 < currentRow.size()) { + // appending a separator char only in between cells here + tableRowFormatted.append('|'); + } + } + writeWord(tableRowFormatted.toString()); + } + currentRow = null; + } + } - // ========================================================================= + /** + * Called when a header {@link WtTableHeader cell} is about to be processed. + * + * @param n + * A node representing a table header cell. + */ + public void visit(WtTableHeader n) + { + processCellContent(n); + } - private void newline(int num) { - if (pastBod) { - if (num > needNewlines) { - needNewlines = num; - } + /** + * Called when a regular {@link WtTableCell cell} is about to be processed. + * + * @param n + * A node representing a table header cell. 
+ */ + public void visit(WtTableCell n) + { + processCellContent(n); } - } - private void wantSpace() { - if (pastBod) { - needSpace = true; + private void processCellContent(WtInnerNode2 n) + { + if (currentRow != null) { + currentCell = new StringBuilder(); + iterate(n); + String cellValue = currentCell.toString().trim(); + if (currentLinkTitleInCell != null) { + cellValue = currentLinkTitleInCell + " " + cellValue; + currentLinkTitleInCell = null; + } + currentRow.add(cellValue); + currentCell = null; + } } - } - private void finishLine() { - sb.append(line.toString()); - line.setLength(0); - } + // ========================================================================= + // Stuff we want to hide - private void writeNewlines(int num) { - finishLine(); - sb.append(StringTools.strrep('\n', num)); - needNewlines = 0; - needSpace = false; - } + public void visit(WtImageLink n) + { + } + + public void visit(WtIllegalCodePoint n) + { + } - private void writeWord(String s) { - int length = s.length(); - if (length == 0) { - return; + public void visit(WtXmlComment n) + { } - if (!noWrap && needNewlines <= 0) { - if (needSpace) { - length += 1; - } + public void visit(WtTemplate n) + { + } - if (line.length() + length >= wrapCol && line.length() > 0) { - writeNewlines(1); - } + public void visit(WtTemplateArgument n) + { } - if (needSpace && needNewlines <= 0) { - line.append(' '); + public void visit(WtTemplateParameter n) + { } - if (needNewlines > 0) { - writeNewlines(needNewlines); + public void visit(WtTagExtension n) + { } - needSpace = false; - pastBod = true; - line.append(s); - } + // ========================================================================= - private void write(String s) { - if (s.isEmpty()) { - return; + private void newline(int num) + { + if (pastBod) { + if (num > needNewlines) { + needNewlines = num; + } + } } - if (Character.isSpaceChar(s.charAt(0))) { - wantSpace(); + private void wantSpace() + { + if (pastBod) { + needSpace = true; + } } - String[] words = ws.split(s); - for (int i = 0; i < words.length; ) { - writeWord(words[i]); - if (++i < words.length) { - wantSpace(); - } + private void finishLine() + { + sb.append(line.toString()); + line.setLength(0); } - char charAtEnd = s.charAt(s.length() - 1); - if ('\n' == charAtEnd) { - writeNewlines(1); + private void writeNewlines(int num) + { + finishLine(); + sb.append(StringTools.strrep('\n', num)); + needNewlines = 0; + needSpace = false; + } + + private void writeWord(String s) + { + int length = s.length(); + if (length == 0) { + return; + } + + if (!noWrap && needNewlines <= 0) { + if (needSpace) { + length += 1; + } + + if (line.length() + length >= wrapCol && line.length() > 0) { + writeNewlines(1); + } + } + + if (needSpace && needNewlines <= 0) { + line.append(' '); + } + + if (needNewlines > 0) { + writeNewlines(needNewlines); + } + + needSpace = false; + pastBod = true; + line.append(s); } - if (Character.isSpaceChar(charAtEnd)) { - wantSpace(); + + private void write(String s) + { + if (s.isEmpty()) { + return; + } + + if (Character.isSpaceChar(s.charAt(0))) { + wantSpace(); + } + + String[] words = ws.split(s); + for (int i = 0; i < words.length;) { + writeWord(words[i]); + if (++i < words.length) { + wantSpace(); + } + } + + char charAtEnd = s.charAt(s.length() - 1); + if ('\n' == charAtEnd) { + writeNewlines(1); + } + if (Character.isSpaceChar(charAtEnd)) { + wantSpace(); + } } - } - private void write(char[] cs) { - write(String.valueOf(cs)); - } + private void write(char[] cs) + { + 
write(String.valueOf(cs)); + } - private void write(char ch) { - writeWord(String.valueOf(ch)); - } + private void write(char ch) + { + writeWord(String.valueOf(ch)); + } - private void write(int num) { - writeWord(String.valueOf(num)); - } + private void write(int num) + { + writeWord(String.valueOf(num)); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/TemplateNameExtractor.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/TemplateNameExtractor.java index 89881f5b..2f4e1503 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/TemplateNameExtractor.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/sweble/TemplateNameExtractor.java @@ -41,66 +41,73 @@ /** * A visitor that extracts template names (no parameters) from an article AST. */ -public class TemplateNameExtractor extends AstVisitor<WtNode> { - private final WikiConfig config; - - - private List<String> templates; - - // ========================================================================= - - - /** - * Creates a new visitor that extracts anchors of internal links from a - * parsed Wikipedia article using the default Sweble config as defined - * in WikiConstants.SWEBLE_CONFIG. - */ - public TemplateNameExtractor() { - this.config = DefaultConfigEnWp.generate(); - } - - /** - * Creates a new visitor that extracts anchors of internal links from a - * parsed Wikipedia article. - * - * @param config the Sweble configuration - */ - public TemplateNameExtractor(WikiConfig config) { - this.config = config; - } - - @Override - protected WtNode before(WtNode node) { - // This method is called by go() before visitation starts - templates = new LinkedList<>(); - return super.before(node); - } - - @Override - protected Object after(WtNode node, Object result) { - return templates; - } - - // ========================================================================= - - public void visit(WtNode n) { - iterate(n); - } - - public void visit(WtTemplate tmpl) throws IOException { - for (AstNode n : tmpl.getName()) { - if (n instanceof AstText) { - add(((AstText) n).getContent()); - } +public class TemplateNameExtractor + extends AstVisitor<WtNode> +{ + private final WikiConfig config; + + private List<String> templates; + + // ========================================================================= + + /** + * Creates a new visitor that extracts anchors of internal links from a parsed Wikipedia article + * using the default Sweble config as defined in WikiConstants.SWEBLE_CONFIG. + */ + public TemplateNameExtractor() + { + this.config = DefaultConfigEnWp.generate(); } - } - private void add(String s) { - s = s.replace("\n", "").replace("\r", ""); - if (s.trim().isEmpty()) { - return; + /** + * Creates a new visitor that extracts anchors of internal links from a parsed Wikipedia + * article. 
+ * + * @param config + * the Sweble configuration + */ + public TemplateNameExtractor(WikiConfig config) + { + this.config = config; + } + + @Override + protected WtNode before(WtNode node) + { + // This method is called by go() before visitation starts + templates = new LinkedList<>(); + return super.before(node); + } + + @Override + protected Object after(WtNode node, Object result) + { + return templates; + } + + // ========================================================================= + + public void visit(WtNode n) + { + iterate(n); + } + + public void visit(WtTemplate tmpl) throws IOException + { + for (AstNode n : tmpl.getName()) { + if (n instanceof AstText) { + add(((AstText) n).getContent()); + } + } + } + + private void add(String s) + { + s = s.replace("\n", "").replace("\r", ""); + if (s.trim().isEmpty()) { + return; + } + templates.add(s); } - templates.add(s); - } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/GraphSerialization.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/GraphSerialization.java index 5303152c..2e6dca23 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/GraphSerialization.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/GraphSerialization.java @@ -33,87 +33,108 @@ * Utility for serializing and deserializing {@link DefaultDirectedGraph} objects, that are<br> * wrapped into {@link SerializableDirectedGraph} objects. */ -public final class GraphSerialization { +public final class GraphSerialization +{ - /** - * This class cannot be instantiated. - */ - private GraphSerialization() { - } + /** + * This class cannot be instantiated. + */ + private GraphSerialization() + { + } - /** - * Serializes the given {@link DefaultDirectedGraph} object to the given location. - * - * @param graph Must not be {@code null}. - * @param location Must not be {@code null} and a valid file path. - * @throws IOException Thrown if errors occurred on the IO level. - */ - public static void saveGraph(DefaultDirectedGraph<Integer, DefaultEdge> graph, String location) throws IOException { - File file = new File(location); - file.createNewFile(); - if (!file.canWrite()) { - throw new IOException("Cannot write to file " + location); + /** + * Serializes the given {@link DefaultDirectedGraph} object to the given location. + * + * @param graph + * Must not be {@code null}. + * @param location + * Must not be {@code null} and a valid file path. + * @throws IOException + * Thrown if errors occurred on the IO level. + */ + public static void saveGraph(DefaultDirectedGraph<Integer, DefaultEdge> graph, String location) + throws IOException + { + File file = new File(location); + file.createNewFile(); + if (!file.canWrite()) { + throw new IOException("Cannot write to file " + location); + } + GraphSerialization.saveGraph(graph, file); } - GraphSerialization.saveGraph(graph, file); - } - /** - * Serializes the given {@link DefaultDirectedGraph} object to the given location. - * - * @param graph Must not be {@code null}. - * @param file Must not be {@code null} and valid {@link File}. - * @throws IOException Thrown if errors occurred on the IO level. 
- */ - public static void saveGraph(DefaultDirectedGraph<Integer, DefaultEdge> graph, File file) throws IOException { - SerializableDirectedGraph serialGraph = new SerializableDirectedGraph(graph); - BufferedOutputStream fos; - ObjectOutputStream out; - fos = new BufferedOutputStream(new FileOutputStream(file)); - out = new ObjectOutputStream(fos); - out.writeObject(serialGraph); - out.close(); + /** + * Serializes the given {@link DefaultDirectedGraph} object to the given location. + * + * @param graph + * Must not be {@code null}. + * @param file + * Must not be {@code null} and valid {@link File}. + * @throws IOException + * Thrown if errors occurred on the IO level. + */ + public static void saveGraph(DefaultDirectedGraph<Integer, DefaultEdge> graph, File file) + throws IOException + { + SerializableDirectedGraph serialGraph = new SerializableDirectedGraph(graph); + BufferedOutputStream fos; + ObjectOutputStream out; + fos = new BufferedOutputStream(new FileOutputStream(file)); + out = new ObjectOutputStream(fos); + out.writeObject(serialGraph); + out.close(); - } + } - /** - * Deserializes a {@link SerializableDirectedGraph} object that is stored in the - * given location. This method returns the {@link DefaultDirectedGraph} object, that is wrapped - * in the {@link SerializableDirectedGraph}. - * - * @param location Must not be {@code null} and a valid file path. - * @return The {@link DefaultDirectedGraph} object, that is wrapped in the - * {@link SerializableDirectedGraph}. - * @throws IOException Thrown if errors occurred on the IO level. - * @throws ClassNotFoundException Thrown if a class could not be find while deserialization. - */ - public static DefaultDirectedGraph<Integer, DefaultEdge> loadGraph(String location) - throws IOException, ClassNotFoundException { - File file = new File(location); - if (!file.canWrite()) { - throw new IOException("Cannot read from file " + location); + /** + * Deserializes a {@link SerializableDirectedGraph} object that is stored in the given location. + * This method returns the {@link DefaultDirectedGraph} object, that is wrapped in the + * {@link SerializableDirectedGraph}. + * + * @param location + * Must not be {@code null} and a valid file path. + * @return The {@link DefaultDirectedGraph} object, that is wrapped in the + * {@link SerializableDirectedGraph}. + * @throws IOException + * Thrown if errors occurred on the IO level. + * @throws ClassNotFoundException + * Thrown if a class could not be find while deserialization. + */ + public static DefaultDirectedGraph<Integer, DefaultEdge> loadGraph(String location) + throws IOException, ClassNotFoundException + { + File file = new File(location); + if (!file.canWrite()) { + throw new IOException("Cannot read from file " + location); + } + return GraphSerialization.loadGraph(file); } - return GraphSerialization.loadGraph(file); - } - /** - * Deserializes a {@link SerializableDirectedGraph} object that is stored in the - * given location. This method returns the {@link DefaultDirectedGraph} object, that is wrapped - * in the {@link SerializableDirectedGraph}. - * - * @param file Must not be {@code null} and valid {@link File}. - * @return The {@link DefaultDirectedGraph} object, that is wrapped in the - * {@link SerializableDirectedGraph}. - * @throws IOException Thrown if errors occurred on the IO level. - * @throws ClassNotFoundException Thrown if a class could not be find while deserialization. 
- */ - public static DefaultDirectedGraph<Integer, DefaultEdge> loadGraph(File file) throws IOException, ClassNotFoundException { - SerializableDirectedGraph serialGraph; - BufferedInputStream fin; - ObjectInputStream in; - fin = new BufferedInputStream(new FileInputStream(file)); - in = new ObjectInputStream(fin); - serialGraph = (SerializableDirectedGraph) in.readObject(); - in.close(); - return serialGraph.getGraph(); - } + /** + * Deserializes a {@link SerializableDirectedGraph} object that is stored in the given location. + * This method returns the {@link DefaultDirectedGraph} object, that is wrapped in the + * {@link SerializableDirectedGraph}. + * + * @param file + * Must not be {@code null} and valid {@link File}. + * @return The {@link DefaultDirectedGraph} object, that is wrapped in the + * {@link SerializableDirectedGraph}. + * @throws IOException + * Thrown if errors occurred on the IO level. + * @throws ClassNotFoundException + * Thrown if a class could not be find while deserialization. + */ + public static DefaultDirectedGraph<Integer, DefaultEdge> loadGraph(File file) + throws IOException, ClassNotFoundException + { + SerializableDirectedGraph serialGraph; + BufferedInputStream fin; + ObjectInputStream in; + fin = new BufferedInputStream(new FileInputStream(file)); + in = new ObjectInputStream(fin); + serialGraph = (SerializableDirectedGraph) in.readObject(); + in.close(); + return serialGraph.getGraph(); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/SerializableDirectedGraph.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/SerializableDirectedGraph.java index 65be4715..be7b6c98 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/SerializableDirectedGraph.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/api/util/SerializableDirectedGraph.java @@ -26,32 +26,36 @@ * Serializable Wrapper for a DirectedGraph object, that has Integer objects as vertices and * {@link DefaultEdge} objects as edges.<br> * - * There is no need in this case to serializable vertices and edges separately, - * because they already implement the interface Serializable. + * There is no need in this case to serializable vertices and edges separately, because they already + * implement the interface Serializable. */ -public final class SerializableDirectedGraph implements Serializable { +public final class SerializableDirectedGraph + implements Serializable +{ - /** - * Generated serial ID. - */ - private static final long serialVersionUID = -192220033577521277L; + /** + * Generated serial ID. + */ + private static final long serialVersionUID = -192220033577521277L; - private final DefaultDirectedGraph<Integer, DefaultEdge> graph; + private final DefaultDirectedGraph<Integer, DefaultEdge> graph; - /** - * This Constructor is intended to be used before the serialization of the <br> - * directed graph. - * - * @param graph - */ - public SerializableDirectedGraph(DefaultDirectedGraph<Integer, DefaultEdge> graph) { - this.graph = graph; - } + /** + * This Constructor is intended to be used before the serialization of the <br> + * directed graph. + * + * @param graph + */ + public SerializableDirectedGraph(DefaultDirectedGraph<Integer, DefaultEdge> graph) + { + this.graph = graph; + } - /** - * @return A {@link DefaultDirectedGraph graph} instance. - */ - public DefaultDirectedGraph<Integer, DefaultEdge> getGraph() { - return graph; - } + /** + * @return A {@link DefaultDirectedGraph graph} instance. 
+ */ + public DefaultDirectedGraph<Integer, DefaultEdge> getGraph() + { + return graph; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/ApiUtilities.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/ApiUtilities.java index 0ab784d8..b26ce71b 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/ApiUtilities.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/ApiUtilities.java @@ -22,42 +22,54 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class ApiUtilities { - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - /** - * DOTS - print progress dots. - * TEXT - print a message with progress in percent. - */ - public enum ProgressInfoMode {DOTS, TEXT} - - /** - * Prints a progress counter. - * - * @param counter Indicates the position in the task. - * @param size Size of the overall task. - * @param step How many parts should the progress counter have? - * @param mode Sets the output mode. - * @param text The text that should be print along with the progress indicator. - */ - public static void printProgressInfo(int counter, int size, int step, ProgressInfoMode mode, String text) { - if (size < step) { - return; +public class ApiUtilities +{ + + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + /** + * DOTS - print progress dots. TEXT - print a message with progress in percent. + */ + public enum ProgressInfoMode + { + DOTS, TEXT } - if (counter % (size / step) == 0) { - double progressPercent = counter * 100 / size; - progressPercent = 1 + Math.round(progressPercent * 100) / 100.0; - if (mode.equals(ApiUtilities.ProgressInfoMode.TEXT)) { - logger.info(text + ": " + progressPercent + " - " + OS.getUsedMemory() + " MB"); - } else if (mode.equals(ApiUtilities.ProgressInfoMode.DOTS)) { - System.out.print("."); - if (progressPercent >= 100) { - System.out.println(); + /** + * Prints a progress counter. + * + * @param counter + * Indicates the position in the task. + * @param size + * Size of the overall task. + * @param step + * How many parts should the progress counter have? + * @param mode + * Sets the output mode. + * @param text + * The text that should be print along with the progress indicator. + */ + public static void printProgressInfo(int counter, int size, int step, ProgressInfoMode mode, + String text) + { + if (size < step) { + return; + } + + if (counter % (size / step) == 0) { + double progressPercent = counter * 100 / size; + progressPercent = 1 + Math.round(progressPercent * 100) / 100.0; + if (mode.equals(ApiUtilities.ProgressInfoMode.TEXT)) { + logger.info(text + ": " + progressPercent + " - " + OS.getUsedMemory() + " MB"); + } + else if (mode.equals(ApiUtilities.ProgressInfoMode.DOTS)) { + System.out.print("."); + if (progressPercent >= 100) { + System.out.println(); + } + } } - } } - } - + } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/CommonUtilities.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/CommonUtilities.java index ef4e6674..e2420882 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/CommonUtilities.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/CommonUtilities.java @@ -21,49 +21,55 @@ import java.util.Map; import java.util.Set; -public class CommonUtilities { +public class CommonUtilities +{ - /** - * Debug output an internal set structure. - * - * @param s Must not be {@code null}. - * @return The resulting String of the contents of {@code s}. 
- * @deprecated To be removed without replacement. - */ - @Deprecated(since="2.0.0", forRemoval=true) - public static String getSetContents(Set<?> s) { - Object[] sortedArray = s.toArray(); - Arrays.sort(sortedArray); + /** + * Debug output an internal set structure. + * + * @param s + * Must not be {@code null}. + * @return The resulting String of the contents of {@code s}. + * @deprecated To be removed without replacement. + */ + @Deprecated(since = "2.0.0", forRemoval = true) + public static String getSetContents(Set<?> s) + { + Object[] sortedArray = s.toArray(); + Arrays.sort(sortedArray); - int counter = 0; - int elementsPerRow = 10; - StringBuffer sb = new StringBuffer(1000); - for (Object element : sortedArray) { - sb.append(element.toString()).append(" "); - counter++; - if ((counter % elementsPerRow) == 0) { + int counter = 0; + int elementsPerRow = 10; + StringBuffer sb = new StringBuffer(1000); + for (Object element : sortedArray) { + sb.append(element.toString()).append(" "); + counter++; + if ((counter % elementsPerRow) == 0) { + sb.append(System.getProperty("line.separator")); + } + } sb.append(System.getProperty("line.separator")); - } + return sb.toString(); } - sb.append(System.getProperty("line.separator")); - return sb.toString(); - } - /** - * Debug output an internal map structure as key-value pairs. - * - * @param m Must not be {@code null}. - * @return The resulting String of the contents of {@code m}. - */ - public static String getMapContents(Map<?, ?> m) { - Object[] sortedArray = m.keySet().toArray(); - Arrays.sort(sortedArray); + /** + * Debug output an internal map structure as key-value pairs. + * + * @param m + * Must not be {@code null}. + * @return The resulting String of the contents of {@code m}. + */ + public static String getMapContents(Map<?, ?> m) + { + Object[] sortedArray = m.keySet().toArray(); + Arrays.sort(sortedArray); - StringBuffer sb = new StringBuffer(1000); - for (Object element : sortedArray) { - sb.append(element.toString()).append(" - ").append(m.get(element)).append(System.getProperty("line.separator")); + StringBuffer sb = new StringBuffer(1000); + for (Object element : sortedArray) { + sb.append(element.toString()).append(" - ").append(m.get(element)) + .append(System.getProperty("line.separator")); + } + return sb.toString(); } - return sb.toString(); - } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/DbUtilities.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/DbUtilities.java index 67686561..2e273bd4 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/DbUtilities.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/DbUtilities.java @@ -29,38 +29,43 @@ /** * @deprecated To be removed without replacement. 
*/ -@Deprecated(since="2.0.0", forRemoval=true) -public class DbUtilities { +@Deprecated(since = "2.0.0", forRemoval = true) +public class DbUtilities +{ - private final Connection conn; + private final Connection conn; - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); - public DbUtilities(Connection conn) { - this.conn = conn; - } + public DbUtilities(Connection conn) + { + this.conn = conn; + } - public boolean tableExists(String tableName) { + public boolean tableExists(String tableName) + { - try { - DatabaseMetaData dbmd = conn.getMetaData(); + try { + DatabaseMetaData dbmd = conn.getMetaData(); - // Specify the type of object; in this case we want tables - String[] types = {"TABLE"}; + // Specify the type of object; in this case we want tables + String[] types = { "TABLE" }; - // get all table names - ResultSet resultSet = dbmd.getTables(null, null, "%", types); + // get all table names + ResultSet resultSet = dbmd.getTables(null, null, "%", types); - while (resultSet.next()) { - if (resultSet.getString("TABLE_NAME").equals(tableName)) { - return true; + while (resultSet.next()) { + if (resultSet.getString("TABLE_NAME").equals(tableName)) { + return true; + } + } + } + catch (SQLException e) { + logger.error("Table " + tableName + " does not exist.", new RuntimeException(e)); } - } - } catch (SQLException e) { - logger.error("Table " + tableName + " does not exist.", new RuntimeException(e)); - } - return false; - } + return false; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/GraphUtilities.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/GraphUtilities.java index 386fc172..a87ce74f 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/GraphUtilities.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/GraphUtilities.java @@ -26,62 +26,72 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class GraphUtilities { +public class GraphUtilities +{ - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); - /** - * @deprecated Use {@link #getRandomPageSubset(Set, int)} instead. - */ - @Deprecated(since="2.0.0", forRemoval=true) - public static Set<Integer> getRandomPageSubset(Iterable<Page> pages, int pResultSetSize) { - Set<Integer> pageIDs = new HashSet<>(); - while (pages.iterator().hasNext()) { - pageIDs.add(pages.iterator().next().getPageId()); + /** + * @deprecated Use {@link #getRandomPageSubset(Set, int)} instead. + */ + @Deprecated(since = "2.0.0", forRemoval = true) + public static Set<Integer> getRandomPageSubset(Iterable<Page> pages, int pResultSetSize) + { + Set<Integer> pageIDs = new HashSet<>(); + while (pages.iterator().hasNext()) { + pageIDs.add(pages.iterator().next().getPageId()); + } + return getRandomPageSubset(pageIDs, pResultSetSize); } - return getRandomPageSubset(pageIDs, pResultSetSize); - } - /** - * Get a random subset (of size pSize) of the page set passed to the method. - * - * @param pPageIDs The pages. - * @param pResultSetSize The size of the result set. - * @return A random subset of the original page set of the given size or null, if the requested subset size is larger than the original page set. 
- */ - public static Set<Integer> getRandomPageSubset(Set<Integer> pPageIDs, int pResultSetSize) { + /** + * Get a random subset (of size pSize) of the page set passed to the method. + * + * @param pPageIDs + * The pages. + * @param pResultSetSize + * The size of the result set. + * @return A random subset of the original page set of the given size or null, if the requested + * subset size is larger than the original page set. + */ + public static Set<Integer> getRandomPageSubset(Set<Integer> pPageIDs, int pResultSetSize) + { - Set<Integer> uniqueRandomSet = new HashSet<>(); + Set<Integer> uniqueRandomSet = new HashSet<>(); - if (pPageIDs.size() < pResultSetSize) { - logger.warn("Requested subset size is larger than the original page set size."); - return null; - } + if (pPageIDs.size() < pResultSetSize) { + logger.warn("Requested subset size is larger than the original page set size."); + return null; + } - Random rand = new Random(); + Random rand = new Random(); - Object[] pageIdArray = pPageIDs.toArray(); + Object[] pageIdArray = pPageIDs.toArray(); - // If pSize is quite close to the size of the original pageSet the probability of generating the offset of the last - // missing pageIDs is quite low, with the consequence of unpredictable run-time. - // => if more than the half of pages should be included in the result set, better remove random numbers than adding them - if (pResultSetSize > (pPageIDs.size() / 2)) { - uniqueRandomSet.addAll(pPageIDs); - while (uniqueRandomSet.size() > pResultSetSize) { - int randomOffset = rand.nextInt(pPageIDs.size()); - if (uniqueRandomSet.contains(pageIdArray[randomOffset])) { - uniqueRandomSet.remove(pageIdArray[randomOffset]); + // If pSize is quite close to the size of the original pageSet the probability of generating + // the offset of the last + // missing pageIDs is quite low, with the consequence of unpredictable run-time. + // => if more than the half of pages should be included in the result set, better remove + // random numbers than adding them + if (pResultSetSize > (pPageIDs.size() / 2)) { + uniqueRandomSet.addAll(pPageIDs); + while (uniqueRandomSet.size() > pResultSetSize) { + int randomOffset = rand.nextInt(pPageIDs.size()); + if (uniqueRandomSet.contains(pageIdArray[randomOffset])) { + uniqueRandomSet.remove(pageIdArray[randomOffset]); + } + } } - } - } else { - while (uniqueRandomSet.size() < pResultSetSize) { - int randomOffset = rand.nextInt(pPageIDs.size()); - if (!uniqueRandomSet.contains(pageIdArray[randomOffset])) { - uniqueRandomSet.add((Integer) pageIdArray[randomOffset]); + else { + while (uniqueRandomSet.size() < pResultSetSize) { + int randomOffset = rand.nextInt(pPageIDs.size()); + if (!uniqueRandomSet.contains(pageIdArray[randomOffset])) { + uniqueRandomSet.add((Integer) pageIdArray[randomOffset]); + } + } } - } - } - return uniqueRandomSet; - } + return uniqueRandomSet; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/HibernateUtilities.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/HibernateUtilities.java index 0d8fd3ce..f4c62d01 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/HibernateUtilities.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/HibernateUtilities.java @@ -28,52 +28,59 @@ /** * @deprecated To be removed without replacement. 
*/ -@Deprecated(since="2.0.0", forRemoval=true) -public class HibernateUtilities implements WikiConstants { +@Deprecated(since = "2.0.0", forRemoval = true) +public class HibernateUtilities + implements WikiConstants +{ - private final DatabaseConfiguration dbConfig; + private final DatabaseConfiguration dbConfig; - public HibernateUtilities(Language pLanguage, DatabaseConfiguration dbConfig) { - this.dbConfig = dbConfig; - } + public HibernateUtilities(Language pLanguage, DatabaseConfiguration dbConfig) + { + this.dbConfig = dbConfig; + } - /** - * Hibernate IDs are needed to load an object from the database. - * Internal references are via pageIDs. - * - * @return A mapping of pageIDs to hibernate IDs. - */ - public Map<Integer, Long> getIdMappingPages() { - Map<Integer, Long> idMapping = new HashMap<>(); + /** + * Hibernate IDs are needed to load an object from the database. Internal references are via + * pageIDs. + * + * @return A mapping of pageIDs to hibernate IDs. + */ + public Map<Integer, Long> getIdMappingPages() + { + Map<Integer, Long> idMapping = new HashMap<>(); - Session session = WikiHibernateUtil.getSessionFactory(this.dbConfig).getCurrentSession(); - session.beginTransaction(); - for (Object o : session.createQuery("select page.id, page.pageId from Page as page").list()) { - Object[] row = (Object[]) o; - // put (pageID, id) - idMapping.put((Integer) row[1], (Long) row[0]); + Session session = WikiHibernateUtil.getSessionFactory(this.dbConfig).getCurrentSession(); + session.beginTransaction(); + for (Object o : session.createQuery("select page.id, page.pageId from Page as page") + .list()) { + Object[] row = (Object[]) o; + // put (pageID, id) + idMapping.put((Integer) row[1], (Long) row[0]); + } + session.getTransaction().commit(); + return idMapping; } - session.getTransaction().commit(); - return idMapping; - } - /** - * Hibernate IDs are needed to load an object from the database. - * Internal references are via pageIDs. - * - * @return A mapping of pageIDs to hibernate IDs. - */ - public Map<Integer, Long> getIdMappingCategories() { - Map<Integer, Long> idMapping = new HashMap<>(); + /** + * Hibernate IDs are needed to load an object from the database. Internal references are via + * pageIDs. + * + * @return A mapping of pageIDs to hibernate IDs. 
+ */ + public Map<Integer, Long> getIdMappingCategories() + { + Map<Integer, Long> idMapping = new HashMap<>(); - Session session = WikiHibernateUtil.getSessionFactory(this.dbConfig).getCurrentSession(); - session.beginTransaction(); - for (Object o : session.createQuery("select cat.id, cat.pageId from Category as cat").list()) { - Object[] row = (Object[]) o; - // put (pageID, id) - idMapping.put((Integer) row[1], (Long) row[0]); + Session session = WikiHibernateUtil.getSessionFactory(this.dbConfig).getCurrentSession(); + session.beginTransaction(); + for (Object o : session.createQuery("select cat.id, cat.pageId from Category as cat") + .list()) { + Object[] row = (Object[]) o; + // put (pageID, id) + idMapping.put((Integer) row[1], (Long) row[0]); + } + session.getTransaction().commit(); + return idMapping; } - session.getTransaction().commit(); - return idMapping; - } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/OS.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/OS.java index 30c26f23..9b4c9e32 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/OS.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/OS.java @@ -17,36 +17,40 @@ */ package org.dkpro.jwpl.util; -public class OS { +public class OS +{ - /** - * Tries to determine the type of OS the application is running on. - * At the moment only Windows and Linux are supported. - * - * @return The type of OS the application is running on. Or "unknown" if the system is unknown. - */ - public static String getOsType() { - String osType = "unknown"; - String osName = System.getProperty("os.name"); - if (osName.contains("Windows")) { - osType = "Windows"; - } else if (osName.contains("Linux")) { - osType = "Linux"; + /** + * Tries to determine the type of OS the application is running on. At the moment only Windows + * and Linux are supported. + * + * @return The type of OS the application is running on. Or "unknown" if the system is unknown. + */ + public static String getOsType() + { + String osType = "unknown"; + String osName = System.getProperty("os.name"); + if (osName.contains("Windows")) { + osType = "Windows"; + } + else if (osName.contains("Linux")) { + osType = "Linux"; + } + return osType; } - return osType; - } - /** - * Gets the memory used by the JVM in MB. - * - * @return Returns how much memory (in MB) is used by the JVM at the moment. - */ - public static double getUsedMemory() { - Runtime rt = Runtime.getRuntime(); + /** + * Gets the memory used by the JVM in MB. + * + * @return Returns how much memory (in MB) is used by the JVM at the moment. 
+ */ + public static double getUsedMemory() + { + Runtime rt = Runtime.getRuntime(); - long memLong = rt.totalMemory() - rt.freeMemory(); - double memDouble = memLong / (1024.0 * 1024.0); - memDouble = Math.round(memDouble * 100) / 100.0; - return memDouble; - } + long memLong = rt.totalMemory() - rt.freeMemory(); + double memDouble = memLong / (1024.0 * 1024.0); + memDouble = Math.round(memDouble * 100) / 100.0; + return memDouble; + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/StringUtils.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/StringUtils.java index a09b8cd5..e2d16f7e 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/StringUtils.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/StringUtils.java @@ -20,83 +20,89 @@ import java.util.Collection; import java.util.Iterator; -public class StringUtils { +public class StringUtils +{ - private static final StringBuilder buffer = new StringBuilder(10_000_000); + private static final StringBuilder buffer = new StringBuilder(10_000_000); - /** - * Joins the elements of a collection into a string. - * - * @param c The collection which elements should be joined. - * @param delimiter String that is introduced between two joined elements. - * @return The joined string. - */ - public static String join(Collection<?> c, String delimiter) { - buffer.setLength(0); - Iterator<?> iter = c.iterator(); - while (iter.hasNext()) { - buffer.append(iter.next()); - if (iter.hasNext()) { - buffer.append(delimiter); - } + /** + * Joins the elements of a collection into a string. + * + * @param c + * The collection which elements should be joined. + * @param delimiter + * String that is introduced between two joined elements. + * @return The joined string. + */ + public static String join(Collection<?> c, String delimiter) + { + buffer.setLength(0); + Iterator<?> iter = c.iterator(); + while (iter.hasNext()) { + buffer.append(iter.next()); + if (iter.hasNext()) { + buffer.append(delimiter); + } + } + return buffer.toString(); } - return buffer.toString(); - } - /** - * Replaces all problematic characters from a String with their escaped - * versions to make it SQL conform. - * - * @param str unescaped String - * @return SQL safe escaped String - */ - public static String sqlEscape(String str) { - final int len = str.length(); - buffer.setLength(0); - StringBuilder sql = buffer; + /** + * Replaces all problematic characters from a String with their escaped versions to make it SQL + * conform. 
+ * + * @param str + * unescaped String + * @return SQL safe escaped String + */ + public static String sqlEscape(String str) + { + final int len = str.length(); + buffer.setLength(0); + StringBuilder sql = buffer; - for (int i = 0; i < len; i++) { - char c = str.charAt(i); - switch (c) { - case '\u0000': - sql.append('\\').append('0'); - break; - case '\n': - sql.append('\\').append('n'); - break; - case '\t': - sql.append('\\').append('t'); - break; - case '\r': - sql.append('\\').append('r'); - break; - case '\u001a': - sql.append('\\').append('Z'); - break; - case '\'': - sql.append('\\').append('\''); - break; - case '\"': - sql.append('\\').append('"'); - break; - case '\b': - sql.append('\\').append('b'); - break; - case '\\': - sql.append('\\').append('\\'); - break; - // case '%': - // sql.append('[').append('%').append(']'); - // break; - // case '_': - // sql.append('[').append('_').append(']'); - // break; - default: - sql.append(c); - break; - } + for (int i = 0; i < len; i++) { + char c = str.charAt(i); + switch (c) { + case '\u0000': + sql.append('\\').append('0'); + break; + case '\n': + sql.append('\\').append('n'); + break; + case '\t': + sql.append('\\').append('t'); + break; + case '\r': + sql.append('\\').append('r'); + break; + case '\u001a': + sql.append('\\').append('Z'); + break; + case '\'': + sql.append('\\').append('\''); + break; + case '\"': + sql.append('\\').append('"'); + break; + case '\b': + sql.append('\\').append('b'); + break; + case '\\': + sql.append('\\').append('\\'); + break; + // case '%': + // sql.append('[').append('%').append(']'); + // break; + // case '_': + // sql.append('[').append('_').append(']'); + // break; + default: + sql.append(c); + break; + } + } + return sql.toString(); } - return sql.toString(); - } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/UnmodifiableArraySet.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/UnmodifiableArraySet.java index 626d3fc8..b25200ad 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/UnmodifiableArraySet.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/UnmodifiableArraySet.java @@ -22,99 +22,116 @@ import java.util.Iterator; import java.util.Set; -public class UnmodifiableArraySet<E> implements Set<E> { - private final Object[] data; - - public UnmodifiableArraySet(E[] aData) { - data = new Object[aData.length]; - System.arraycopy(aData, 0, data, 0, data.length); - } - - public UnmodifiableArraySet(Set<E> aData) { - data = new Object[aData.size()]; - System.arraycopy(aData.toArray(), 0, data, 0, data.length); - } - - @Override - public int size() { - return data != null ? data.length : 0; - } - - @Override - public boolean isEmpty() { - return data != null ? 
data.length > 0 : true; - } - - @Override - public boolean contains(Object aO) { - if (data == null) { - return false; +public class UnmodifiableArraySet<E> + implements Set<E> +{ + private final Object[] data; + + public UnmodifiableArraySet(E[] aData) + { + data = new Object[aData.length]; + System.arraycopy(aData, 0, data, 0, data.length); } - for (Object d : data) { - if (d.equals(aO)) { - return true; - } + + public UnmodifiableArraySet(Set<E> aData) + { + data = new Object[aData.size()]; + System.arraycopy(aData.toArray(), 0, data, 0, data.length); } - return false; - } - - @SuppressWarnings("unchecked") - @Override - public Iterator<E> iterator() { - return (Iterator<E>) Arrays.asList(data).iterator(); - } - - @Override - public Object[] toArray() { - return data; - } - - @Override - public <T> T[] toArray(T[] aA) { - if (aA.length != data.length) { - throw new IllegalArgumentException("Target array too small"); + + @Override + public int size() + { + return data != null ? data.length : 0; } - System.arraycopy(data, 0, aA, 0, aA.length); - return aA; - } - - @Override - public boolean add(E aE) { - throw new UnsupportedOperationException("Unmodifiable set"); - } - - @Override - public boolean remove(Object aO) { - throw new UnsupportedOperationException("Unmodifiable set"); - } - - @Override - public boolean containsAll(Collection<?> aC) { - for (Object o : aC) { - if (!contains(o)) { + + @Override + public boolean isEmpty() + { + return data != null ? data.length > 0 : true; + } + + @Override + public boolean contains(Object aO) + { + if (data == null) { + return false; + } + for (Object d : data) { + if (d.equals(aO)) { + return true; + } + } return false; - } } - return true; - } - - @Override - public boolean addAll(Collection<? extends E> aC) { - throw new UnsupportedOperationException("Unmodifiable set"); - } - - @Override - public boolean retainAll(Collection<?> aC) { - throw new UnsupportedOperationException("Unmodifiable set"); - } - - @Override - public boolean removeAll(Collection<?> aC) { - throw new UnsupportedOperationException("Unmodifiable set"); - } - - @Override - public void clear() { - throw new UnsupportedOperationException("Unmodifiable set"); - } + + @SuppressWarnings("unchecked") + @Override + public Iterator<E> iterator() + { + return (Iterator<E>) Arrays.asList(data).iterator(); + } + + @Override + public Object[] toArray() + { + return data; + } + + @Override + public <T> T[] toArray(T[] aA) + { + if (aA.length != data.length) { + throw new IllegalArgumentException("Target array too small"); + } + System.arraycopy(data, 0, aA, 0, aA.length); + return aA; + } + + @Override + public boolean add(E aE) + { + throw new UnsupportedOperationException("Unmodifiable set"); + } + + @Override + public boolean remove(Object aO) + { + throw new UnsupportedOperationException("Unmodifiable set"); + } + + @Override + public boolean containsAll(Collection<?> aC) + { + for (Object o : aC) { + if (!contains(o)) { + return false; + } + } + return true; + } + + @Override + public boolean addAll(Collection<? 
extends E> aC) + { + throw new UnsupportedOperationException("Unmodifiable set"); + } + + @Override + public boolean retainAll(Collection<?> aC) + { + throw new UnsupportedOperationException("Unmodifiable set"); + } + + @Override + public boolean removeAll(Collection<?> aC) + { + throw new UnsupportedOperationException("Unmodifiable set"); + } + + @Override + public void clear() + { + throw new UnsupportedOperationException("Unmodifiable set"); + } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/LevenshteinStringDistance.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/LevenshteinStringDistance.java index 5c6009cd..be32e77e 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/LevenshteinStringDistance.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/LevenshteinStringDistance.java @@ -17,67 +17,71 @@ */ package org.dkpro.jwpl.util.distance; -public class LevenshteinStringDistance implements StringDistance { +public class LevenshteinStringDistance + implements StringDistance +{ - @Override - public double distance(String s, String t) { - int[][] d; // matrix - int n; // length of s - int m; // length of t - int i; // iterates through s - int j; // iterates through t - char s_i; // ith character of s - char t_j; // jth character of t - int cost; // cost + @Override + public double distance(String s, String t) + { + int[][] d; // matrix + int n; // length of s + int m; // length of t + int i; // iterates through s + int j; // iterates through t + char s_i; // ith character of s + char t_j; // jth character of t + int cost; // cost - // Step 1 - n = s.length(); - m = t.length(); - if (n == 0) { - return m; - } - if (m == 0) { - return n; - } - d = new int[n + 1][m + 1]; + // Step 1 + n = s.length(); + m = t.length(); + if (n == 0) { + return m; + } + if (m == 0) { + return n; + } + d = new int[n + 1][m + 1]; - // Step 2 - for (i = 0; i <= n; i++) { - d[i][0] = i; - } - for (j = 0; j <= m; j++) { - d[0][j] = j; - } - // Step 3 - for (i = 1; i <= n; i++) { - s_i = s.charAt(i - 1); - // Step 4 - for (j = 1; j <= m; j++) { - t_j = t.charAt(j - 1); - // Step 5 - if (s_i == t_j) { - cost = 0; - } else { - cost = 1; + // Step 2 + for (i = 0; i <= n; i++) { + d[i][0] = i; } - // Step 6 - d[i][j] = Minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, - d[i - 1][j - 1] + cost); - } + for (j = 0; j <= m; j++) { + d[0][j] = j; + } + // Step 3 + for (i = 1; i <= n; i++) { + s_i = s.charAt(i - 1); + // Step 4 + for (j = 1; j <= m; j++) { + t_j = t.charAt(j - 1); + // Step 5 + if (s_i == t_j) { + cost = 0; + } + else { + cost = 1; + } + // Step 6 + d[i][j] = Minimum(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost); + } + } + // Step 7 + return Integer.valueOf(d[n][m]).doubleValue(); } - // Step 7 - return Integer.valueOf(d[n][m]).doubleValue(); - } - private int Minimum(int a, int b, int c) { - int min; - min = a; - if (b < min) { - min = b; - } - if (c < min) { - min = c; + private int Minimum(int a, int b, int c) + { + int min; + min = a; + if (b < min) { + min = b; + } + if (c < min) { + min = c; + } + return min; } - return min; - } } diff --git a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/StringDistance.java b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/StringDistance.java index c538a265..c5cf6cb8 100644 --- a/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/StringDistance.java +++ b/dkpro-jwpl-api/src/main/java/org/dkpro/jwpl/util/distance/StringDistance.java @@ -17,8 +17,9 @@ */ package 
org.dkpro.jwpl.util.distance; -public interface StringDistance { +public interface StringDistance +{ - double distance(String s1, String s2); + double distance(String s1, String s2); } diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/BaseJWPLTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/BaseJWPLTest.java index 82b8f75d..5080bbba 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/BaseJWPLTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/BaseJWPLTest.java @@ -18,25 +18,26 @@ package org.dkpro.jwpl.api; /** - * Simple test base class to inject the same hsqldb test context into every test - * class to avoid duplicated code and efforts. Also shuts down the - * hibernate/hsqldb context properly. + * Simple test base class to inject the same hsqldb test context into every test class to avoid + * duplicated code and efforts. Also shuts down the hibernate/hsqldb context properly. * * @author mawiesne */ -public abstract class BaseJWPLTest { +public abstract class BaseJWPLTest +{ - protected static Wikipedia wiki; + protected static Wikipedia wiki; - protected static DatabaseConfiguration obtainHSDLDBConfiguration() { - DatabaseConfiguration db = new DatabaseConfiguration(); - db.setDatabase("wikiapi_test"); - db.setHost("localhost"); - db.setUser("sa"); - db.setPassword(""); - db.setLanguage(WikiConstants.Language._test); - db.setJdbcURL("jdbc:hsqldb:file:./src/test/resources/db/wikiapi_test"); - db.setDatabaseDriver("org.hsqldb.jdbcDriver"); - return db; - } + protected static DatabaseConfiguration obtainHSDLDBConfiguration() + { + DatabaseConfiguration db = new DatabaseConfiguration(); + db.setDatabase("wikiapi_test"); + db.setHost("localhost"); + db.setUser("sa"); + db.setPassword(""); + db.setLanguage(WikiConstants.Language._test); + db.setJdbcURL("jdbc:hsqldb:file:./src/test/resources/db/wikiapi_test"); + db.setDatabaseDriver("org.hsqldb.jdbcDriver"); + return db; + } } diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryDescendantsIteratorTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryDescendantsIteratorTest.java index b315516a..5888c39f 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryDescendantsIteratorTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryDescendantsIteratorTest.java @@ -28,40 +28,43 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -public class CategoryDescendantsIteratorTest extends BaseJWPLTest{ +public class CategoryDescendantsIteratorTest + extends BaseJWPLTest +{ /** - * Made this static so that following tests don't run if assumption fails. - * (With AT_Before, tests also would not be executed but marked as passed) - * This could be changed back as soon as JUnit ignored tests after failed - * assumptions + * Made this static so that following tests don't run if assumption fails. (With AT_Before, + * tests also would not be executed but marked as passed) This could be changed back as soon as + * JUnit ignored tests after failed assumptions */ @BeforeAll - public static void setupWikipedia() { + public static void setupWikipedia() + { DatabaseConfiguration db = obtainHSDLDBConfiguration(); try { wiki = new Wikipedia(db); - } catch (Exception e) { - fail("Wikipedia could not be initialized: "+e.getLocalizedMessage()); + } + catch (Exception e) { + fail("Wikipedia could not be initialized: " + e.getLocalizedMessage()); } } - /** - * The category UKP has 9 descendants with pageIds 7-15. 
- */ + * The category UKP has 9 descendants with pageIds 7-15. + */ @Test - public void test_categoryIteratorTest() { + public void test_categoryIteratorTest() + { Category cat = null; try { cat = wiki.getCategory("UKP"); - } catch (WikiApiException e) { + } + catch (WikiApiException e) { e.printStackTrace(); fail("A WikiApiException occurred while getting the category 'UKP'"); } - List<Integer> expectedPageIds = new ArrayList<>(); expectedPageIds.add(7); expectedPageIds.add(8); @@ -74,7 +77,7 @@ public void test_categoryIteratorTest() { expectedPageIds.add(15); List<Integer> isIds = new ArrayList<>(); - for(Category descendant : cat.getDescendants()) { + for (Category descendant : cat.getDescendants()) { isIds.add(descendant.getPageId()); } Collections.sort(expectedPageIds); @@ -86,17 +89,18 @@ public void test_categoryIteratorTest() { * The category UKP has 9 descendants with pageIds 7-15. */ @Test - public void test_categoryIteratorTestBufferSize() { + public void test_categoryIteratorTestBufferSize() + { Category cat = null; try { cat = wiki.getCategory("UKP"); - } catch (WikiApiException e) { + } + catch (WikiApiException e) { e.printStackTrace(); fail("A WikiApiException occurred while getting the category 'UKP'"); } - List<Integer> expectedPageIds = new ArrayList<>(); expectedPageIds.add(7); expectedPageIds.add(8); @@ -108,15 +112,15 @@ public void test_categoryIteratorTestBufferSize() { expectedPageIds.add(14); expectedPageIds.add(15); - for (int bufferSize=1;bufferSize<=100;bufferSize+=5) { - List<Integer> isIds = new ArrayList<>(); - for(Category descendant : cat.getDescendants(bufferSize)) { - isIds.add(descendant.getPageId()); - } - Collections.sort(expectedPageIds); - Collections.sort(isIds); - assertEquals(expectedPageIds, isIds, "descendants"); + for (int bufferSize = 1; bufferSize <= 100; bufferSize += 5) { + List<Integer> isIds = new ArrayList<>(); + for (Category descendant : cat.getDescendants(bufferSize)) { + isIds.add(descendant.getPageId()); + } + Collections.sort(expectedPageIds); + Collections.sort(isIds); + assertEquals(expectedPageIds, isIds, "descendants"); + } } - } } diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryGraphTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryGraphTest.java index 93c419c2..e39bd21c 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryGraphTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryGraphTest.java @@ -27,45 +27,52 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -public class CategoryGraphTest extends BaseJWPLTest{ +public class CategoryGraphTest + extends BaseJWPLTest +{ - private static CategoryGraph catGraph; + private static CategoryGraph catGraph; /** - * Made this static so that following tests don't run if assumption fails. - * (With AT_Before, tests also would not be executed but marked as passed) - * This could be changed back as soon as JUnit ignored tests after failed - * assumptions + * Made this static so that following tests don't run if assumption fails. 
(With AT_Before, + * tests also would not be executed but marked as passed) This could be changed back as soon as + * JUnit ignored tests after failed assumptions */ - @BeforeAll - public static void setupWikipedia() { - DatabaseConfiguration db = obtainHSDLDBConfiguration(); + @BeforeAll + public static void setupWikipedia() + { + DatabaseConfiguration db = obtainHSDLDBConfiguration(); - try { - wiki = new Wikipedia(db); - } catch (Exception e) { - fail("Wikipedia could not be initialized: "+e.getLocalizedMessage()); - } + try { + wiki = new Wikipedia(db); + } + catch (Exception e) { + fail("Wikipedia could not be initialized: " + e.getLocalizedMessage()); + } try { catGraph = CategoryGraphManager.getCategoryGraph(wiki, false); - } catch (WikiApiException e) { - fail("CategoryGraph could not be initialized: "+e.getLocalizedMessage()); + } + catch (WikiApiException e) { + fail("CategoryGraph could not be initialized: " + e.getLocalizedMessage()); } } - @Test - public void testDepth(){ + @Test + public void testDepth() + { try { double depth = catGraph.getDepth(); assertEquals(4, depth, 0.00001); - } catch (WikiApiException e) { + } + catch (WikiApiException e) { fail("Getting depth of the CategoryGraph throws exception."); } } @Test - public void testGetPathLength() throws WikiApiException{ + public void testGetPathLength() throws WikiApiException + { String catString = "UKP"; String neighborCatString = "Projects of UKP"; String twoStepsAwayCatString = "SIR"; @@ -93,8 +100,9 @@ public void testGetPathLength() throws WikiApiException{ @Test // each value within the map must be higher than the number of nodes in the category graph - public void testHyponymCountMap() throws WikiApiException{ - Map<Integer,Integer> hyponymCountMap = catGraph.getHyponymCountMap(); + public void testHyponymCountMap() throws WikiApiException + { + Map<Integer, Integer> hyponymCountMap = catGraph.getHyponymCountMap(); int numberOfNodes = catGraph.getNumberOfNodes(); for (Integer key : hyponymCountMap.keySet()) { assertTrue(hyponymCountMap.get(key) < numberOfNodes); @@ -105,20 +113,20 @@ public void testHyponymCountMap() throws WikiApiException{ } assertEquals(16, hyponymCountMap.get(1).intValue()); - assertEquals(0, hyponymCountMap.get(2).intValue()); + assertEquals(0, hyponymCountMap.get(2).intValue()); assertEquals(10, hyponymCountMap.get(3).intValue()); - assertEquals(1, hyponymCountMap.get(4).intValue()); - assertEquals(5, hyponymCountMap.get(5).intValue()); - assertEquals(9, hyponymCountMap.get(6).intValue()); - assertEquals(2, hyponymCountMap.get(7).intValue()); - assertEquals(4, hyponymCountMap.get(8).intValue()); - assertEquals(0, hyponymCountMap.get(9).intValue()); - assertEquals(0, hyponymCountMap.get(10).intValue()); - assertEquals(0, hyponymCountMap.get(11).intValue()); - assertEquals(0, hyponymCountMap.get(12).intValue()); - assertEquals(0, hyponymCountMap.get(13).intValue()); - assertEquals(0, hyponymCountMap.get(14).intValue()); - assertEquals(0, hyponymCountMap.get(15).intValue()); - assertEquals(0, hyponymCountMap.get(200).intValue()); + assertEquals(1, hyponymCountMap.get(4).intValue()); + assertEquals(5, hyponymCountMap.get(5).intValue()); + assertEquals(9, hyponymCountMap.get(6).intValue()); + assertEquals(2, hyponymCountMap.get(7).intValue()); + assertEquals(4, hyponymCountMap.get(8).intValue()); + assertEquals(0, hyponymCountMap.get(9).intValue()); + assertEquals(0, hyponymCountMap.get(10).intValue()); + assertEquals(0, hyponymCountMap.get(11).intValue()); + assertEquals(0, 
hyponymCountMap.get(12).intValue()); + assertEquals(0, hyponymCountMap.get(13).intValue()); + assertEquals(0, hyponymCountMap.get(14).intValue()); + assertEquals(0, hyponymCountMap.get(15).intValue()); + assertEquals(0, hyponymCountMap.get(200).intValue()); } } diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryIteratorTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryIteratorTest.java index f47ee279..b5d3a72a 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryIteratorTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryIteratorTest.java @@ -25,53 +25,58 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -public class CategoryIteratorTest extends BaseJWPLTest { +public class CategoryIteratorTest + extends BaseJWPLTest +{ - /** - * Made this static so that following tests don't run if assumption fails. - * (With AT_Before, tests also would not be executed but marked as passed) - * This could be changed back as soon as JUnit ignored tests after failed - * assumptions - */ - @BeforeAll - public static void setupWikipedia() { - DatabaseConfiguration db = obtainHSDLDBConfiguration(); - try { - wiki = new Wikipedia(db); - } catch (Exception e) { - fail("Wikipedia could not be initialized: "+e.getLocalizedMessage()); - } - } + /** + * Made this static so that following tests don't run if assumption fails. (With AT_Before, + * tests also would not be executed but marked as passed) This could be changed back as soon as + * JUnit ignored tests after failed assumptions + */ + @BeforeAll + public static void setupWikipedia() + { + DatabaseConfiguration db = obtainHSDLDBConfiguration(); + try { + wiki = new Wikipedia(db); + } + catch (Exception e) { + fail("Wikipedia could not be initialized: " + e.getLocalizedMessage()); + } + } - /** - * The test wikipedia contains 17 categories. - */ - @Test - public void test_categoryIteratorTest() { - int nrOfPages = 0; + /** + * The test wikipedia contains 17 categories. + */ + @Test + public void test_categoryIteratorTest() + { + int nrOfPages = 0; - for (Category c : wiki.getCategories()) { - nrOfPages++; - } - assertEquals(17, nrOfPages, "Number of categories == 17"); + for (Category c : wiki.getCategories()) { + nrOfPages++; + } + assertEquals(17, nrOfPages, "Number of categories == 17"); - } + } - /** - * The test wikipedia contains 17 categories. - */ - @Test - public void test_categoryIteratorTestBufferSize() { + /** + * The test wikipedia contains 17 categories. 
+ */ + @Test + public void test_categoryIteratorTestBufferSize() + { - for (int bufferSize=1;bufferSize<=100;bufferSize+=5) { - Iterator<Category> catIter = wiki.getCategories(bufferSize).iterator(); - int nrOfPages = 0; - while (catIter.hasNext()) { - @SuppressWarnings("unused") - Category c = catIter.next(); - nrOfPages++; - } - assertEquals(17, nrOfPages, "Number of categories == 17"); - } - } + for (int bufferSize = 1; bufferSize <= 100; bufferSize += 5) { + Iterator<Category> catIter = wiki.getCategories(bufferSize).iterator(); + int nrOfPages = 0; + while (catIter.hasNext()) { + @SuppressWarnings("unused") + Category c = catIter.next(); + nrOfPages++; + } + assertEquals(17, nrOfPages, "Number of categories == 17"); + } + } } diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryTest.java index 0e13f7f4..7572cf43 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/CategoryTest.java @@ -36,82 +36,97 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -public class CategoryTest extends BaseJWPLTest { +public class CategoryTest + extends BaseJWPLTest +{ private static final String A_FAMOUS_CATEGORY = "People of UKP"; private static final int A_FAMOUS_PAGE_ID = 8; // Here: ORMs internal object identifier aka Primary Key. private static final long A_FAMOUS_PAGE_OBJECT_ID = 4; - /** - * Made this static so that following tests don't run if assumption fails. - * (With AT_Before, tests also would not be executed but marked as passed) - * This could be changed back as soon as JUnit ignored tests after failed - * assumptions - */ - @BeforeAll - public static void setupWikipedia() { - DatabaseConfiguration db = obtainHSDLDBConfiguration(); - - try { - wiki = new Wikipedia(db); - } catch (Exception e) { + /** + * Made this static so that following tests don't run if assumption fails. 
(With AT_Before, + * tests also would not be executed but marked as passed) This could be changed back as soon as + * JUnit ignored tests after failed assumptions + */ + @BeforeAll + public static void setupWikipedia() + { + DatabaseConfiguration db = obtainHSDLDBConfiguration(); + + try { + wiki = new Wikipedia(db); + } + catch (Exception e) { fail("Wikipedia could not be initialized: " + e.getLocalizedMessage()); - } - } - - @Test - public void testCategoryTitle(){ - Category cat; - try { - cat = wiki.getCategory(A_FAMOUS_CATEGORY); - assertNotNull(cat); - assertEquals("People of UKP", cat.getTitle().toString(), "testing the title"); - } catch (WikiTitleParsingException e) { - fail("A WikiTitleParsingException occurred while testing the title of the category 'People of UKP': " + e.getLocalizedMessage()); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the category 'People of UKP': " + e.getLocalizedMessage()); - } - } - - @Test - public void testCategoryPageId(){ - Category cat; - try { - cat = wiki.getCategory(A_FAMOUS_CATEGORY); - assertNotNull(cat); - assertEquals(8, cat.getPageId(), "testing the pageId"); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the category 'People of UKP': " + e.getLocalizedMessage()); - } - //test the pageId - } - - @Test - public void testCategoryParents(){ - Category cat; - try { - cat = wiki.getCategory(A_FAMOUS_CATEGORY); - assertNotNull(cat); - //test the parents - List<Integer> expectedPageIds = new ArrayList<>(); - expectedPageIds.add(5); - expectedPageIds.add(6); - - List<Integer> isIds = new ArrayList<>(); - for(Category parent : cat.getParents()) { - isIds.add(parent.getPageId()); - } - Collections.sort(expectedPageIds); - Collections.sort(isIds); - assertEquals(expectedPageIds, isIds, "parents"); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the category 'People of UKP': " + e.getLocalizedMessage()); - } - } + } + } @Test - public void testNumberOfCategoryParents(){ + public void testCategoryTitle() + { + Category cat; + try { + cat = wiki.getCategory(A_FAMOUS_CATEGORY); + assertNotNull(cat); + assertEquals("People of UKP", cat.getTitle().toString(), "testing the title"); + } + catch (WikiTitleParsingException e) { + fail("A WikiTitleParsingException occurred while testing the title of the category 'People of UKP': " + + e.getLocalizedMessage()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the category 'People of UKP': " + + e.getLocalizedMessage()); + } + } + + @Test + public void testCategoryPageId() + { + Category cat; + try { + cat = wiki.getCategory(A_FAMOUS_CATEGORY); + assertNotNull(cat); + assertEquals(8, cat.getPageId(), "testing the pageId"); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the category 'People of UKP': " + + e.getLocalizedMessage()); + } + // test the pageId + } + + @Test + public void testCategoryParents() + { + Category cat; + try { + cat = wiki.getCategory(A_FAMOUS_CATEGORY); + assertNotNull(cat); + // test the parents + List<Integer> expectedPageIds = new ArrayList<>(); + expectedPageIds.add(5); + expectedPageIds.add(6); + + List<Integer> isIds = new ArrayList<>(); + for (Category parent : cat.getParents()) { + isIds.add(parent.getPageId()); + } + Collections.sort(expectedPageIds); + Collections.sort(isIds); + assertEquals(expectedPageIds, isIds, "parents"); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while 
getting the category 'People of UKP': " + + e.getLocalizedMessage()); + } + } + + @Test + public void testNumberOfCategoryParents() + { Category cat; try { cat = wiki.getCategory(A_FAMOUS_CATEGORY); @@ -119,19 +134,22 @@ public void testNumberOfCategoryParents(){ int numberOfParents = cat.getNumberOfParents(); // expecting IDs: 5 and 6 to make up for 2 parent categories assertEquals(2, numberOfParents); - - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the category 'People of UKP': " + e.getLocalizedMessage()); + + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the category 'People of UKP': " + + e.getLocalizedMessage()); } } @Test - public void testCategoryDescendants(){ + public void testCategoryDescendants() + { Category cat; try { cat = wiki.getCategory("UKP"); - assertNotNull(cat); - //test the descendants + assertNotNull(cat); + // test the descendants List<Integer> expectedPageIds = new ArrayList<>(); expectedPageIds.add(7); expectedPageIds.add(8); @@ -143,43 +161,49 @@ public void testCategoryDescendants(){ expectedPageIds.add(14); expectedPageIds.add(15); List<Integer> isIds = new ArrayList<>(); - for(Category descendant : cat.getDescendants()) { + for (Category descendant : cat.getDescendants()) { isIds.add(descendant.getPageId()); } Collections.sort(expectedPageIds); Collections.sort(isIds); assertEquals(expectedPageIds, isIds, "descendants"); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the category 'UKP': " + e.getLocalizedMessage()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the category 'UKP': " + + e.getLocalizedMessage()); } } @Test - public void testCategoryChildren(){ - Category cat; - try { - cat = wiki.getCategory(A_FAMOUS_CATEGORY); + public void testCategoryChildren() + { + Category cat; + try { + cat = wiki.getCategory(A_FAMOUS_CATEGORY); assertNotNull(cat); List<Integer> expectedPageIds = new ArrayList<>(); List<Integer> isIds = new ArrayList<>(); - //test the children + // test the children expectedPageIds.add(13); expectedPageIds.add(12); expectedPageIds.add(15); expectedPageIds.add(14); - for(Category child : cat.getChildren()) { + for (Category child : cat.getChildren()) { isIds.add(child.getPageId()); } Collections.sort(expectedPageIds); Collections.sort(isIds); assertEquals(expectedPageIds, isIds, "children"); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the category 'People of UKP': " + e.getLocalizedMessage()); - } - } + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the category 'People of UKP': " + + e.getLocalizedMessage()); + } + } @Test - public void testNumberOfCategoryChildren(){ + public void testNumberOfCategoryChildren() + { Category cat; try { cat = wiki.getCategory(A_FAMOUS_CATEGORY); @@ -188,16 +212,19 @@ public void testNumberOfCategoryChildren(){ int expectedNumberOfChildren = cat.getNumberOfChildren(); assertEquals(4, expectedNumberOfChildren); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the category 'People of UKP': " + e.getLocalizedMessage()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the category 'People of UKP': " + + e.getLocalizedMessage()); } } - @Test - public void testCategoryPages(){ - Category cat; - try { - cat = wiki.getCategory("UKP"); + @Test + public void testCategoryPages() + { + Category cat; + try { + cat = 
wiki.getCategory("UKP"); assertNotNull(cat); List<Integer> expectedPageIds = new ArrayList<>(); expectedPageIds.add(1010); @@ -207,22 +234,27 @@ public void testCategoryPages(){ Set<Page> pages = cat.getArticles(); assertNotNull(pages); assertFalse(pages.isEmpty()); - for(Page p : pages) { + for (Page p : pages) { isIds.add(p.getPageId()); } - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the pages of the category 'People of UKP' for testing: " + e.getLocalizedMessage()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the pages of the category 'People of UKP' for testing: " + + e.getLocalizedMessage()); } Collections.sort(expectedPageIds); Collections.sort(isIds); assertEquals(expectedPageIds, isIds, "page"); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the category 'People of UKP': " + e.getLocalizedMessage(), e); - } - } + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the category 'People of UKP': " + + e.getLocalizedMessage(), e); + } + } @Test - public void testNumberOfCategoryPages(){ + public void testNumberOfCategoryPages() + { Category cat; try { cat = wiki.getCategory("UKP"); @@ -230,135 +262,162 @@ public void testNumberOfCategoryPages(){ int numberOfPages = cat.getNumberOfPages(); assertTrue(numberOfPages > 0); assertEquals(2, numberOfPages); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the category 'People of UKP': " + e.getLocalizedMessage()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the category 'People of UKP': " + + e.getLocalizedMessage()); } } @Test - public void testCreateCategoryByPageID() { + public void testCreateCategoryByPageID() + { try { Category p = new Category(wiki, A_FAMOUS_PAGE_ID); assertNotNull(p); assertEquals(A_FAMOUS_PAGE_ID, p.getPageId()); - } catch (WikiApiException e) { + } + catch (WikiApiException e) { fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); } } @Test - public void testCreateCategoryByPageIDInvalid() { + public void testCreateCategoryByPageIDInvalid() + { try { new Category(wiki, -42); - } catch (WikiPageNotFoundException pnfe) { + } + catch (WikiPageNotFoundException pnfe) { // this is expected behavior here, provoked by the test } } @Test - public void testCreateCategoryByObjectID() { + public void testCreateCategoryByObjectID() + { try { Category p = new Category(wiki, A_FAMOUS_PAGE_OBJECT_ID); assertNotNull(p); assertEquals(A_FAMOUS_PAGE_ID, p.getPageId()); assertEquals(A_FAMOUS_PAGE_OBJECT_ID, p.__getId()); - } catch (WikiApiException e) { + } + catch (WikiApiException e) { fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); } } @Test - public void testCreateCategoryByObjectIDInvalid() { + public void testCreateCategoryByObjectIDInvalid() + { try { long invalidObjectID = -42L; new Category(wiki, invalidObjectID); - } catch (WikiPageNotFoundException pnfe) { + } + catch (WikiPageNotFoundException pnfe) { // this is expected behavior here, provoked by the test } } @Test - public void testCreateCategoryByName() { + public void testCreateCategoryByName() + { try { Category p = new Category(wiki, A_FAMOUS_CATEGORY); assertNotNull(p); assertEquals(A_FAMOUS_CATEGORY, p.getTitle().getPlainTitle()); assertEquals(A_FAMOUS_PAGE_ID, p.getPageId()); - } catch (WikiApiException e) { + } + catch (WikiApiException e) { fail("A WikiApiException occurred creating a 
page: " + e.getLocalizedMessage()); } } @Test - public void testCreateCategoryByNameRandom() { + public void testCreateCategoryByNameRandom() + { try { new Category(wiki, UUID.randomUUID().toString()); - } catch (WikiApiException e) { + } + catch (WikiApiException e) { // this is expected, as a random page title should not be found } } @Test - public void testCreateCategoryByNameEmpty() { + public void testCreateCategoryByNameEmpty() + { try { new Category(wiki, ""); - } catch (WikiPageNotFoundException pnfe) { + } + catch (WikiPageNotFoundException pnfe) { // this is expected behavior here, provoked by the test - } catch (WikiApiException e) { + } + catch (WikiApiException e) { fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); } } @Test - public void testCreateCategoryByNameNull() { + public void testCreateCategoryByNameNull() + { try { new Category(wiki, null); - } catch (WikiPageNotFoundException pnfe) { + } + catch (WikiPageNotFoundException pnfe) { // this is expected behavior here, provoked by the test - } catch (WikiApiException e) { + } + catch (WikiApiException e) { fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); } } @Test - public void testGetCategoryInfo() { + public void testGetCategoryInfo() + { try { Category p = new Category(wiki, "UKP"); assertNotNull(p); String categoryInfo = p.getCategoryInfo(); assertNotNull(categoryInfo); assertTrue(categoryInfo.length() > 0); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the page info: " + e.getLocalizedMessage()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the page info: " + + e.getLocalizedMessage()); } } @Test - public void testCategorySiblings(){ + public void testCategorySiblings() + { Category cat; try { cat = wiki.getCategory(A_FAMOUS_CATEGORY); assertNotNull(cat); Set<Integer> expectedPageIds = new HashSet<>(); Set<Integer> isIds = new HashSet<>(); - //test the children + // test the children expectedPageIds.add(7); expectedPageIds.add(8); expectedPageIds.add(9); Set<Category> siblings = cat.getSiblings(); assertNotNull(siblings); assertTrue(siblings.size() > 0); - for(Category sibling : siblings) { + for (Category sibling : siblings) { isIds.add(sibling.getPageId()); } assertEquals(expectedPageIds, isIds, "siblings"); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the category 'People of UKP': " + e.getLocalizedMessage(), e); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the category 'People of UKP': " + + e.getLocalizedMessage(), e); } } @Test - public void testCategoryTitleComparatorEquality() { + public void testCategoryTitleComparatorEquality() + { Category cat1; Category cat2; try { @@ -375,13 +434,16 @@ public void testCategoryTitleComparatorEquality() { assertEquals(cat1, categories.get(0)); assertEquals(cat2, categories.get(1)); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while comparing the category 'People of UKP': " + e.getLocalizedMessage(), e); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while comparing the category 'People of UKP': " + + e.getLocalizedMessage(), e); } } @Test - public void testCategoryTitleComparatorNewOrder() { + public void testCategoryTitleComparatorNewOrder() + { Category cat1; Category cat2; try { @@ -399,8 +461,10 @@ public void testCategoryTitleComparatorNewOrder() { assertEquals(cat2, categories.get(0)); assertEquals(cat1, 
categories.get(1)); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while comparing the category 'People of UKP': " + e.getLocalizedMessage()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while comparing the category 'People of UKP': " + + e.getLocalizedMessage()); } } } diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/MetaDataTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/MetaDataTest.java index 877b4182..a9e27820 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/MetaDataTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/MetaDataTest.java @@ -29,54 +29,62 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -public class MetaDataTest extends BaseJWPLTest { +public class MetaDataTest + extends BaseJWPLTest +{ // The object under test private MetaData metaData; /** - * Made this static so that following tests don't run if assumption fails. - * (With AT_Before, tests also would not be executed but marked as passed) - * This could be changed back as soon as JUnit ignored tests after failed - * assumptions + * Made this static so that following tests don't run if assumption fails. (With AT_Before, + * tests also would not be executed but marked as passed) This could be changed back as soon as + * JUnit ignored tests after failed assumptions */ @BeforeAll - public static void setupWikipedia() { + public static void setupWikipedia() + { DatabaseConfiguration db = obtainHSDLDBConfiguration(); try { wiki = new Wikipedia(db); - } catch (Exception e) { + } + catch (Exception e) { fail("Wikipedia could not be initialized: " + e.getLocalizedMessage()); } } @BeforeEach - public void setup() { + public void setup() + { metaData = new MetaData(wiki); } @AfterEach - public void tearDown() { + public void tearDown() + { metaData = null; } @Test - public void testGetNumberOfCategories() { + public void testGetNumberOfCategories() + { long numberOfCategories = metaData.getNumberOfCategories(); assertTrue(numberOfCategories > 0); assertEquals(17, numberOfCategories); } @Test - public void testGetNumberOfPages() { + public void testGetNumberOfPages() + { long numberOfPages = metaData.getNumberOfPages(); assertTrue(numberOfPages > 0); assertEquals(36, numberOfPages); } @Test - public void testGetNumberOfDisambiguationPages() { + public void testGetNumberOfDisambiguationPages() + { long numberOfDisambiguationPages = metaData.getNumberOfDisambiguationPages(); assertTrue(numberOfDisambiguationPages > 0); assertEquals(2, numberOfDisambiguationPages); @@ -84,50 +92,61 @@ public void testGetNumberOfDisambiguationPages() { } @Test - public void testGetNumberOfRedirectPages() { + public void testGetNumberOfRedirectPages() + { long numberOfRedirectPages = metaData.getNumberOfRedirectPages(); assertTrue(numberOfRedirectPages > 0); assertEquals(6, numberOfRedirectPages); } @Test - public void testGetMainCategory() { + public void testGetMainCategory() + { try { Category c = metaData.getMainCategory(); assertNotNull(c); assertEquals(1, c.getPageId()); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the main category: " + e.getLocalizedMessage()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the main category: " + + e.getLocalizedMessage()); } } @Test - public void testGetDisambiguationCategory() { + public void testGetDisambiguationCategory() + { try { Category c = metaData.getDisambiguationCategory(); assertNotNull(c); 
assertEquals(200, c.getPageId()); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the disambiguation category: " + e.getLocalizedMessage()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the disambiguation category: " + + e.getLocalizedMessage()); } } @Test - public void testGetLanguage() { + public void testGetLanguage() + { WikiConstants.Language language = metaData.getLanguage(); assertNotNull(language); assertEquals(WikiConstants.Language._test, language); } @Test - public void testGetVersion() { + public void testGetVersion() + { try { String version = metaData.getVersion(); assertNotNull(version); assertFalse(version.isEmpty()); assertEquals("1.0", version); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the disambiguation category: " + e.getLocalizedMessage()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the disambiguation category: " + + e.getLocalizedMessage()); } } } diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageIteratorTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageIteratorTest.java index 9cf00e96..be9dd242 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageIteratorTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageIteratorTest.java @@ -26,69 +26,73 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -public class PageIteratorTest extends BaseJWPLTest { +public class PageIteratorTest + extends BaseJWPLTest +{ - /** - * Made this static so that following tests don't run if assumption fails. - * (With AT_Before, tests also would not be executed but marked as passed) - * This could be changed back as soon as JUnit ignored tests after failed - * assumptions - */ - @BeforeAll - public static void setupWikipedia() { - DatabaseConfiguration db = obtainHSDLDBConfiguration(); - try { - wiki = new Wikipedia(db); - } catch (Exception e) { - fail("Wikipedia could not be initialized: "+e.getLocalizedMessage()); - } - } + /** + * Made this static so that following tests don't run if assumption fails. (With AT_Before, + * tests also would not be executed but marked as passed) This could be changed back as soon as + * JUnit ignored tests after failed assumptions + */ + @BeforeAll + public static void setupWikipedia() + { + DatabaseConfiguration db = obtainHSDLDBConfiguration(); + try { + wiki = new Wikipedia(db); + } + catch (Exception e) { + fail("Wikipedia could not be initialized: " + e.getLocalizedMessage()); + } + } + /** + * The test wikipedia contains 29 articles + 2 disambiguation + 1 discussion pages. + */ + @Test + public void test_pageIteratorTest() + { + int nrOfPages = 0; + int nrOfArticles = 0; - /** - * The test wikipedia contains 29 articles + 2 disambiguation + 1 discussion pages. 
- */ - @Test - public void test_pageIteratorTest() { - int nrOfPages = 0; - int nrOfArticles = 0; - - Iterator<Page> pageIter = wiki.getPages().iterator(); + Iterator<Page> pageIter = wiki.getPages().iterator(); Iterator<Page> articleIter = wiki.getArticles().iterator(); - while (pageIter.hasNext()) { - Page p = pageIter.next(); - assertNotNull(p); - nrOfPages++; - } - assertEquals(33, nrOfPages, "Number of pages == 33"); + while (pageIter.hasNext()) { + Page p = pageIter.next(); + assertNotNull(p); + nrOfPages++; + } + assertEquals(33, nrOfPages, "Number of pages == 33"); + + while (articleIter.hasNext()) { + Page p = articleIter.next(); + assertNotNull(p); + nrOfArticles++; + } - while (articleIter.hasNext()) { - Page p = articleIter.next(); - assertNotNull(p); - nrOfArticles++; - } - - // Assuming 33 is the correct number now - assertEquals(33, nrOfArticles, "Number of articles == 33"); + // Assuming 33 is the correct number now + assertEquals(33, nrOfArticles, "Number of articles == 33"); - } + } - /** - * The test wikipedia contains 30 articles + 2 disambiguation + 1 discussion pages. - */ - @Test - public void test_pageIteratorTestBufferSize() { + /** + * The test wikipedia contains 30 articles + 2 disambiguation + 1 discussion pages. + */ + @Test + public void test_pageIteratorTestBufferSize() + { - for (int bufferSize=1;bufferSize<=100;bufferSize+=5) { - Iterator<Page> pageIter = wiki.getPages(bufferSize).iterator(); - int nrOfPages = 0; - while (pageIter.hasNext()) { - @SuppressWarnings("unused") - Page p = pageIter.next(); - nrOfPages++; - } - assertEquals(33, nrOfPages, "Number of pages == 33"); - } - } + for (int bufferSize = 1; bufferSize <= 100; bufferSize += 5) { + Iterator<Page> pageIter = wiki.getPages(bufferSize).iterator(); + int nrOfPages = 0; + while (pageIter.hasNext()) { + @SuppressWarnings("unused") + Page p = pageIter.next(); + nrOfPages++; + } + assertEquals(33, nrOfPages, "Number of pages == 33"); + } + } } diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageTest.java index 3f617e2a..752112fa 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/PageTest.java @@ -38,419 +38,472 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class PageTest extends BaseJWPLTest { - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private static final String A_FAMOUS_PAGE = "Wikipedia API"; - private static final int A_FAMOUS_PAGE_ID = 1014; - // Here: ORMs internal object identifier aka Primary Key. - private static final long A_FAMOUS_PAGE_OBJECT_ID = 1; - - // The object under test - private Page page; - - /** - * Made this static so that following tests don't run if assumption fails. 
- * (With AT_Before, tests also would not be executed but marked as passed) - * This could be changed back as soon as JUnit ignored tests after failed - * assumptions - */ - @BeforeAll - public static void setupWikipedia() { - DatabaseConfiguration db = obtainHSDLDBConfiguration(); - try { - wiki = new Wikipedia(db); - } catch (Exception e) { - fail("Wikipedia could not be initialized: " + e.getLocalizedMessage()); - } - } - - @BeforeEach - public void setup() { - page = fetchPage(A_FAMOUS_PAGE); - assertNotNull(page); - } - - @AfterEach - public void tearDown() { - page = null; - } - - @Test - public void testGetTitle() throws Exception { - Title t = page.getTitle(); - assertNotNull(t); - assertEquals(A_FAMOUS_PAGE, t.getPlainTitle(), "testing the title"); - assertEquals(A_FAMOUS_PAGE_ID, page.getPageId(), "testing the pageId"); - } - - @Test - public void testGetTitleExact() throws Exception { - // Query page here by its exact title containing '_'. - String title = "Wikipedia_API"; - Page p = fetchPage(title); - assertNotNull(p); - Title t = p.getTitle(); - assertNotNull(t); - assertEquals(title, p.getTitle().getRawTitleText(), "testing the title"); - assertEquals(A_FAMOUS_PAGE_ID, p.getPageId(), "testing the pageId"); - } - - @Test - public void testGetText() { - String expectedMarkupText = "Wikipedia API ist die wichtigste [[Software]] überhaupt.\n" + - "[[JWPL|Wikipedia API]].\n\n" + - "*Nicht zu übertreffen.\n\n" + - "*Unglaublich\n\n" + - "*[[http://www.ukp.tu-darmstadt.de]]\n\n" + - "[[en:Wikipedia API]] [[fi:WikipediaAPI]]"; - try { - String textWithMarkup = page.getText(); - assertNotNull(textWithMarkup); - assertTrue(textWithMarkup.length() > 0); - assertEquals(expectedMarkupText, textWithMarkup); - } catch (RuntimeException e) { - fail("A RuntimeException occurred while accessing the page for its text (markup): " - + e.getLocalizedMessage()); - } - } - - @Test - public void testGetPlainText() { - String expectedPlainText = "Wikipedia API ist die wichtigste Software überhaupt.\n" + - "Wikipedia API.\n" + - "Nicht zu übertreffen.\n" + - "Unglaublich\n" + - "http://www.ukp.tu-darmstadt.de\n" + - "en:Wikipedia API fi:WikipediaAPI"; - try { - assertEquals(expectedPlainText, page.getPlainText()); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while parsing the page for its text (plain via Sweble): " - + e.getLocalizedMessage()); - } - } - - @Test - public void testGetPlainTextWithTable() { - String title = "Humanbiologie"; - String expectedPlainTextOfTable = - "Die Attraktivität der Fachrichtungen Humanbiologie beziehungsweise Biomedizin als " + - "Studienfächer ist in jüngerer Zeit deutlich gestiegen.\n" + - "\n" + - "Studiengang|besteht seit|Abschluss|Hochschule\n" + - "Humanbiologie (Biomedical Science)|1979|Bachelor / Master|Marburg (U)\n" + - "Molekulare Biomedizin|2014|Bachelor|Rheinische Fachhochschule Köln \n" + - "Kategorie:Biologie Kategorie:Medizin Kategorie:Humangenetik Kategorie:Studienfach"; - try { - Page page = fetchPage(title); - assertNotNull(page); - assertEquals(6000, page.getPageId()); - assertEquals(expectedPlainTextOfTable, page.getPlainText()); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while parsing the page for its text (plain with table via Sweble): " - + e.getLocalizedMessage()); - } - } - - @Test - public void testGetPlainTextWithXMLEndTag() { - String title = "Liste_von_Materia_Medica_der_traditionellen_uigurischen_Medizin"; - String expectedPlainText = "Dies ist eine Liste von Materia Medica der 
traditionellen uigurischen Medizin. " + - "Die uigurische Medizin entwickelte sich aus der arabischen Medizin, der antiken griechischen Medizin " + - "und der traditionellen chinesischen Medizin. " + - "Übersicht" + "\n" + - "Quellen: [cintcm.com], [tcm-resources.com]"; - try { - Page page = fetchPage(title); - assertNotNull(page); - assertEquals(6001, page.getPageId()); - assertEquals(expectedPlainText, page.getPlainText()); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while parsing the page for its text (plain with table via Sweble): " - + e.getLocalizedMessage()); - } - } - - @Test - public void testGetNumberOfCategories() { - int categories = page.getNumberOfCategories(); - assertTrue(categories > 0); - assertEquals(2, categories); - } - - @Test - public void testGetCategories() { - Set<Category> categories = page.getCategories(); - assertNotNull(categories); - assertFalse(categories.isEmpty()); - assertEquals(2, categories.size()); - try { - boolean foundSIR = false; - boolean foundDisambiguation = false; - for (Category c : categories) { - String ct = c.getTitle().getPlainTitle(); - assertNotNull(ct); - if ("SIR".equals(ct)) { - foundSIR = true; - } - if ("Disambiguation".equals(ct)) { - foundDisambiguation = true; - } - } - assertTrue(foundSIR); - assertTrue(foundDisambiguation); - } catch (WikiTitleParsingException e) { - fail("A WikiTitleParsingException occurred while accessing the category title of the page: " - + e.getLocalizedMessage()); - } - } - - @Test - public void testGetNumberOfInlinks() { - int inlinks = page.getNumberOfInlinks(); - assertTrue(inlinks > 0); - assertEquals(3, inlinks); - } - - @Test - public void testGetNumberOfInlinksZero() { - int inlinks = fetchPage("Unconnected_page").getNumberOfInlinks(); - assertEquals(0, inlinks); - } - - @Test - public void testGetInlinks() { - Set<Page> inlinks = page.getInlinks(); - assertNotNull(inlinks); - assertFalse(inlinks.isEmpty()); - assertEquals(3, inlinks.size()); - } - - @Test - public void testGetInlinkIDs() { - Set<Integer> inlinkIDs = page.getInlinkIDs(); - assertNotNull(inlinkIDs); - assertFalse(inlinkIDs.isEmpty()); - assertEquals(3, inlinkIDs.size()); - } - - @Test - public void testGetNumberOfOutlinks() { - int outlinks = page.getNumberOfOutlinks(); - assertTrue(outlinks > 0); - assertEquals(1, outlinks); - } - - @Test - public void testGetNumberOfOutlinksZero() { - int outlinks = fetchPage("Unconnected_page").getNumberOfOutlinks(); - assertEquals(0, outlinks); - } - - @Test - public void testGetOutlinks() { - Set<Page> outlinks = page.getOutlinks(); - assertNotNull(outlinks); - assertFalse(outlinks.isEmpty()); - assertEquals(1, outlinks.size()); - Page outlink = outlinks.iterator().next(); - assertNotNull(outlink); - try { - assertEquals("Torsten Zesch", outlink.getTitle().getPlainTitle()); - } catch (WikiTitleParsingException e) { - fail("A WikiTitleParsingException occurred while accessing the title of the page: " - + e.getLocalizedMessage()); - } - } - - @Test - public void testGetOutlinkIDs() { - Set<Integer> outlinkIDs = page.getOutlinkIDs(); - assertNotNull(outlinkIDs); - assertFalse(outlinkIDs.isEmpty()); - assertEquals(1, outlinkIDs.size()); - } - - @Test - public void testGetRedirects() { - Set<String> redirects = page.getRedirects(); - assertNotNull(redirects); - assertTrue(redirects.isEmpty()); - } - - @Test - public void testIsRedirect() { - assertFalse(page.isRedirect()); - } - - @Test - public void testIsRedirectValid() { - Page p = fetchPage("SIR"); - assertNotNull(p); - 
assertTrue(p.isRedirect()); - } - - @Test - public void testIsDisambiguation() { - assertFalse(page.isDisambiguation()); - } - - @Test - public void testIsDiscussion() { - try { - assertFalse(page.isDiscussion()); - } catch (WikiTitleParsingException e) { - fail("A WikiTitleParsingException occurred while accessing the page for discussion: " - + e.getLocalizedMessage()); - } - } - - @Test - public void testGetPageInfo() { - try { - String pageInfo = page.getPageInfo(); - assertNotNull(pageInfo); - assertTrue(pageInfo.length() > 0); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the page info: " + e.getLocalizedMessage()); - } - } - - @Test - public void testCreatePageByPageID() { - try { - Page p = new Page(wiki, A_FAMOUS_PAGE_ID); - assertNotNull(p); - assertEquals(A_FAMOUS_PAGE_ID, p.getPageId()); - } catch (WikiApiException e) { - fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); - } - } - - @Test - public void testCreatePageByPageIDInvalid() { - try { - new Page(wiki, -42); - } catch (WikiPageNotFoundException pnfe) { - // this is expected behavior here, provoked by the test - } catch (WikiApiException e) { - fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); - } - } - - @Test - public void testCreatePageByObjectID() { - try { - Page p = new Page(wiki, A_FAMOUS_PAGE_OBJECT_ID); - assertNotNull(p); - assertEquals(A_FAMOUS_PAGE_ID, p.getPageId()); - assertEquals(A_FAMOUS_PAGE_OBJECT_ID, p.__getId()); - } catch (WikiApiException e) { - fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); - } - } - - @Test - public void testCreatePageByObjectIDInvalid() { - try { - long invalidObjectID = -42L; - new Page(wiki, invalidObjectID); - } catch (WikiPageNotFoundException pnfe) { - // this is expected behavior here, provoked by the test - } catch (WikiApiException e) { - fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); - } - } - - @Test - public void testCreatePageByName() { - try { - Page p = new Page(wiki, A_FAMOUS_PAGE); - assertNotNull(p); - assertEquals(A_FAMOUS_PAGE, p.getTitle().getPlainTitle()); - assertEquals(A_FAMOUS_PAGE_ID, p.getPageId()); - } catch (WikiApiException e) { - fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); - } - } - - @Test - public void testCreatePageByNameEmpty() { - try { - new Page(wiki, ""); - } catch (WikiPageNotFoundException pnfe) { - // this is expected behavior here, provoked by the test - } catch (WikiApiException e) { - fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); - } - } - - @Test - public void testCreatePageByNameNull() { - try { - new Page(wiki, null); - } catch (WikiPageNotFoundException pnfe) { - // this is expected behavior here, provoked by the test - } catch (WikiApiException e) { - fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); - } - } - - @Test - public void testCreatePageByNameExactDiscussion() { - try { - Page p = new Page(wiki, "Discussion:Wikipedia_API", false); - assertNotNull(p); - assertTrue(p.isDiscussion()); - assertEquals(4000, p.getPageId()); - } catch (WikiApiException e) { - fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); - } - } - - @Test - public void testPageTitleComparatorEquality() { - Page page1 = fetchPage(A_FAMOUS_PAGE); - assertNotNull(page1); - Page page2 = fetchPage(A_FAMOUS_PAGE); - assertNotNull(page2); - - List<Page> pages = new 
ArrayList<>(); - pages.add(page1); - pages.add(page2); - pages.sort(new PageTitleComparator()); - - assertEquals(page1, pages.get(0)); - assertEquals(page2, pages.get(1)); - } - - @Test - public void testCategoryTitleComparatorNewOrder() { - Page page1 = fetchPage(A_FAMOUS_PAGE); - assertNotNull(page1); - // this page should be re-ordered before "Wikipedia..." - Page page2 = fetchPage("Unconnected_page"); - assertNotNull(page2); - - List<Page> pages = new ArrayList<>(); - pages.add(page1); - pages.add(page2); - pages.sort(new PageTitleComparator()); - - assertEquals(page2, pages.get(0)); - assertEquals(page1, pages.get(1)); - } - - private Page fetchPage(final String title) { - Page page = null; - try { - page = wiki.getPage(title); - } catch (WikiApiException e) { - logger.error(e.getLocalizedMessage(), e); - fail("A WikiApiException occurred while getting the page: " + e.getLocalizedMessage()); - } - return page; - } +public class PageTest + extends BaseJWPLTest +{ + + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + private static final String A_FAMOUS_PAGE = "Wikipedia API"; + private static final int A_FAMOUS_PAGE_ID = 1014; + // Here: ORMs internal object identifier aka Primary Key. + private static final long A_FAMOUS_PAGE_OBJECT_ID = 1; + + // The object under test + private Page page; + + /** + * Made this static so that following tests don't run if assumption fails. (With AT_Before, + * tests also would not be executed but marked as passed) This could be changed back as soon as + * JUnit ignored tests after failed assumptions + */ + @BeforeAll + public static void setupWikipedia() + { + DatabaseConfiguration db = obtainHSDLDBConfiguration(); + try { + wiki = new Wikipedia(db); + } + catch (Exception e) { + fail("Wikipedia could not be initialized: " + e.getLocalizedMessage()); + } + } + + @BeforeEach + public void setup() + { + page = fetchPage(A_FAMOUS_PAGE); + assertNotNull(page); + } + + @AfterEach + public void tearDown() + { + page = null; + } + + @Test + public void testGetTitle() throws Exception + { + Title t = page.getTitle(); + assertNotNull(t); + assertEquals(A_FAMOUS_PAGE, t.getPlainTitle(), "testing the title"); + assertEquals(A_FAMOUS_PAGE_ID, page.getPageId(), "testing the pageId"); + } + + @Test + public void testGetTitleExact() throws Exception + { + // Query page here by its exact title containing '_'. 
+ String title = "Wikipedia_API"; + Page p = fetchPage(title); + assertNotNull(p); + Title t = p.getTitle(); + assertNotNull(t); + assertEquals(title, p.getTitle().getRawTitleText(), "testing the title"); + assertEquals(A_FAMOUS_PAGE_ID, p.getPageId(), "testing the pageId"); + } + + @Test + public void testGetText() + { + String expectedMarkupText = "Wikipedia API ist die wichtigste [[Software]] überhaupt.\n" + + "[[JWPL|Wikipedia API]].\n\n" + "*Nicht zu übertreffen.\n\n" + "*Unglaublich\n\n" + + "*[[http://www.ukp.tu-darmstadt.de]]\n\n" + + "[[en:Wikipedia API]] [[fi:WikipediaAPI]]"; + try { + String textWithMarkup = page.getText(); + assertNotNull(textWithMarkup); + assertTrue(textWithMarkup.length() > 0); + assertEquals(expectedMarkupText, textWithMarkup); + } + catch (RuntimeException e) { + fail("A RuntimeException occurred while accessing the page for its text (markup): " + + e.getLocalizedMessage()); + } + } + + @Test + public void testGetPlainText() + { + String expectedPlainText = "Wikipedia API ist die wichtigste Software überhaupt.\n" + + "Wikipedia API.\n" + "Nicht zu übertreffen.\n" + "Unglaublich\n" + + "http://www.ukp.tu-darmstadt.de\n" + "en:Wikipedia API fi:WikipediaAPI"; + try { + assertEquals(expectedPlainText, page.getPlainText()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while parsing the page for its text (plain via Sweble): " + + e.getLocalizedMessage()); + } + } + + @Test + public void testGetPlainTextWithTable() + { + String title = "Humanbiologie"; + String expectedPlainTextOfTable = "Die Attraktivität der Fachrichtungen Humanbiologie beziehungsweise Biomedizin als " + + "Studienfächer ist in jüngerer Zeit deutlich gestiegen.\n" + "\n" + + "Studiengang|besteht seit|Abschluss|Hochschule\n" + + "Humanbiologie (Biomedical Science)|1979|Bachelor / Master|Marburg (U)\n" + + "Molekulare Biomedizin|2014|Bachelor|Rheinische Fachhochschule Köln \n" + + "Kategorie:Biologie Kategorie:Medizin Kategorie:Humangenetik Kategorie:Studienfach"; + try { + Page page = fetchPage(title); + assertNotNull(page); + assertEquals(6000, page.getPageId()); + assertEquals(expectedPlainTextOfTable, page.getPlainText()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while parsing the page for its text (plain with table via Sweble): " + + e.getLocalizedMessage()); + } + } + + @Test + public void testGetPlainTextWithXMLEndTag() + { + String title = "Liste_von_Materia_Medica_der_traditionellen_uigurischen_Medizin"; + String expectedPlainText = "Dies ist eine Liste von Materia Medica der traditionellen uigurischen Medizin. " + + "Die uigurische Medizin entwickelte sich aus der arabischen Medizin, der antiken griechischen Medizin " + + "und der traditionellen chinesischen Medizin. 
" + "Übersicht" + "\n" + + "Quellen: [cintcm.com], [tcm-resources.com]"; + try { + Page page = fetchPage(title); + assertNotNull(page); + assertEquals(6001, page.getPageId()); + assertEquals(expectedPlainText, page.getPlainText()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while parsing the page for its text (plain with table via Sweble): " + + e.getLocalizedMessage()); + } + } + + @Test + public void testGetNumberOfCategories() + { + int categories = page.getNumberOfCategories(); + assertTrue(categories > 0); + assertEquals(2, categories); + } + + @Test + public void testGetCategories() + { + Set<Category> categories = page.getCategories(); + assertNotNull(categories); + assertFalse(categories.isEmpty()); + assertEquals(2, categories.size()); + try { + boolean foundSIR = false; + boolean foundDisambiguation = false; + for (Category c : categories) { + String ct = c.getTitle().getPlainTitle(); + assertNotNull(ct); + if ("SIR".equals(ct)) { + foundSIR = true; + } + if ("Disambiguation".equals(ct)) { + foundDisambiguation = true; + } + } + assertTrue(foundSIR); + assertTrue(foundDisambiguation); + } + catch (WikiTitleParsingException e) { + fail("A WikiTitleParsingException occurred while accessing the category title of the page: " + + e.getLocalizedMessage()); + } + } + + @Test + public void testGetNumberOfInlinks() + { + int inlinks = page.getNumberOfInlinks(); + assertTrue(inlinks > 0); + assertEquals(3, inlinks); + } + + @Test + public void testGetNumberOfInlinksZero() + { + int inlinks = fetchPage("Unconnected_page").getNumberOfInlinks(); + assertEquals(0, inlinks); + } + + @Test + public void testGetInlinks() + { + Set<Page> inlinks = page.getInlinks(); + assertNotNull(inlinks); + assertFalse(inlinks.isEmpty()); + assertEquals(3, inlinks.size()); + } + + @Test + public void testGetInlinkIDs() + { + Set<Integer> inlinkIDs = page.getInlinkIDs(); + assertNotNull(inlinkIDs); + assertFalse(inlinkIDs.isEmpty()); + assertEquals(3, inlinkIDs.size()); + } + + @Test + public void testGetNumberOfOutlinks() + { + int outlinks = page.getNumberOfOutlinks(); + assertTrue(outlinks > 0); + assertEquals(1, outlinks); + } + + @Test + public void testGetNumberOfOutlinksZero() + { + int outlinks = fetchPage("Unconnected_page").getNumberOfOutlinks(); + assertEquals(0, outlinks); + } + + @Test + public void testGetOutlinks() + { + Set<Page> outlinks = page.getOutlinks(); + assertNotNull(outlinks); + assertFalse(outlinks.isEmpty()); + assertEquals(1, outlinks.size()); + Page outlink = outlinks.iterator().next(); + assertNotNull(outlink); + try { + assertEquals("Torsten Zesch", outlink.getTitle().getPlainTitle()); + } + catch (WikiTitleParsingException e) { + fail("A WikiTitleParsingException occurred while accessing the title of the page: " + + e.getLocalizedMessage()); + } + } + + @Test + public void testGetOutlinkIDs() + { + Set<Integer> outlinkIDs = page.getOutlinkIDs(); + assertNotNull(outlinkIDs); + assertFalse(outlinkIDs.isEmpty()); + assertEquals(1, outlinkIDs.size()); + } + + @Test + public void testGetRedirects() + { + Set<String> redirects = page.getRedirects(); + assertNotNull(redirects); + assertTrue(redirects.isEmpty()); + } + + @Test + public void testIsRedirect() + { + assertFalse(page.isRedirect()); + } + + @Test + public void testIsRedirectValid() + { + Page p = fetchPage("SIR"); + assertNotNull(p); + assertTrue(p.isRedirect()); + } + + @Test + public void testIsDisambiguation() + { + assertFalse(page.isDisambiguation()); + } + + @Test + public void 
testIsDiscussion() + { + try { + assertFalse(page.isDiscussion()); + } + catch (WikiTitleParsingException e) { + fail("A WikiTitleParsingException occurred while accessing the page for discussion: " + + e.getLocalizedMessage()); + } + } + + @Test + public void testGetPageInfo() + { + try { + String pageInfo = page.getPageInfo(); + assertNotNull(pageInfo); + assertTrue(pageInfo.length() > 0); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the page info: " + + e.getLocalizedMessage()); + } + } + + @Test + public void testCreatePageByPageID() + { + try { + Page p = new Page(wiki, A_FAMOUS_PAGE_ID); + assertNotNull(p); + assertEquals(A_FAMOUS_PAGE_ID, p.getPageId()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); + } + } + + @Test + public void testCreatePageByPageIDInvalid() + { + try { + new Page(wiki, -42); + } + catch (WikiPageNotFoundException pnfe) { + // this is expected behavior here, provoked by the test + } + catch (WikiApiException e) { + fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); + } + } + + @Test + public void testCreatePageByObjectID() + { + try { + Page p = new Page(wiki, A_FAMOUS_PAGE_OBJECT_ID); + assertNotNull(p); + assertEquals(A_FAMOUS_PAGE_ID, p.getPageId()); + assertEquals(A_FAMOUS_PAGE_OBJECT_ID, p.__getId()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); + } + } + + @Test + public void testCreatePageByObjectIDInvalid() + { + try { + long invalidObjectID = -42L; + new Page(wiki, invalidObjectID); + } + catch (WikiPageNotFoundException pnfe) { + // this is expected behavior here, provoked by the test + } + catch (WikiApiException e) { + fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); + } + } + + @Test + public void testCreatePageByName() + { + try { + Page p = new Page(wiki, A_FAMOUS_PAGE); + assertNotNull(p); + assertEquals(A_FAMOUS_PAGE, p.getTitle().getPlainTitle()); + assertEquals(A_FAMOUS_PAGE_ID, p.getPageId()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); + } + } + + @Test + public void testCreatePageByNameEmpty() + { + try { + new Page(wiki, ""); + } + catch (WikiPageNotFoundException pnfe) { + // this is expected behavior here, provoked by the test + } + catch (WikiApiException e) { + fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); + } + } + + @Test + public void testCreatePageByNameNull() + { + try { + new Page(wiki, null); + } + catch (WikiPageNotFoundException pnfe) { + // this is expected behavior here, provoked by the test + } + catch (WikiApiException e) { + fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); + } + } + + @Test + public void testCreatePageByNameExactDiscussion() + { + try { + Page p = new Page(wiki, "Discussion:Wikipedia_API", false); + assertNotNull(p); + assertTrue(p.isDiscussion()); + assertEquals(4000, p.getPageId()); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred creating a page: " + e.getLocalizedMessage()); + } + } + + @Test + public void testPageTitleComparatorEquality() + { + Page page1 = fetchPage(A_FAMOUS_PAGE); + assertNotNull(page1); + Page page2 = fetchPage(A_FAMOUS_PAGE); + assertNotNull(page2); + + List<Page> pages = new ArrayList<>(); + pages.add(page1); + pages.add(page2); + pages.sort(new PageTitleComparator()); + + 
assertEquals(page1, pages.get(0)); + assertEquals(page2, pages.get(1)); + } + + @Test + public void testCategoryTitleComparatorNewOrder() + { + Page page1 = fetchPage(A_FAMOUS_PAGE); + assertNotNull(page1); + // this page should be re-ordered before "Wikipedia..." + Page page2 = fetchPage("Unconnected_page"); + assertNotNull(page2); + + List<Page> pages = new ArrayList<>(); + pages.add(page1); + pages.add(page2); + pages.sort(new PageTitleComparator()); + + assertEquals(page2, pages.get(0)); + assertEquals(page1, pages.get(1)); + } + + private Page fetchPage(final String title) + { + Page page = null; + try { + page = wiki.getPage(title); + } + catch (WikiApiException e) { + logger.error(e.getLocalizedMessage(), e); + fail("A WikiApiException occurred while getting the page: " + e.getLocalizedMessage()); + } + return page; + } } diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleIteratorTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleIteratorTest.java index ecf3545b..25d4fe9b 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleIteratorTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleIteratorTest.java @@ -24,36 +24,39 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -public class TitleIteratorTest extends BaseJWPLTest{ - - /** - * Made this static so that following tests don't run if assumption fails. - * (With AT_Before, tests also would not be executed but marked as passed) - * This could be changed back as soon as JUnit ignored tests after failed - * assumptions - */ - @BeforeAll - public static void setupWikipedia() { - DatabaseConfiguration db = obtainHSDLDBConfiguration(); - try { - wiki = new Wikipedia(db); - } catch (Exception e) { - fail("Wikipedia could not be initialized: "+e.getLocalizedMessage()); - } - } +public class TitleIteratorTest + extends BaseJWPLTest +{ + + /** + * Made this static so that following tests don't run if assumption fails. 
(With AT_Before, + * tests also would not be executed but marked as passed) This could be changed back as soon as + * JUnit ignored tests after failed assumptions + */ + @BeforeAll + public static void setupWikipedia() + { + DatabaseConfiguration db = obtainHSDLDBConfiguration(); + try { + wiki = new Wikipedia(db); + } + catch (Exception e) { + fail("Wikipedia could not be initialized: " + e.getLocalizedMessage()); + } + } + @Test + public void test_titleIteratorTest() + { - @Test - public void test_titleIteratorTest() { + int nrOfTitles = 0; + Iterable<Title> iterable = wiki.getTitles(); + assertNotNull(iterable); + for (Title t : iterable) { + assertNotNull(t); + nrOfTitles++; + } + assertEquals(39, nrOfTitles, "Number of titles == 39"); - int nrOfTitles = 0; - Iterable<Title> iterable = wiki.getTitles(); - assertNotNull(iterable); - for (Title t : iterable) { - assertNotNull(t); - nrOfTitles++; } - assertEquals(39, nrOfTitles, "Number of titles == 39"); - - } } diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleTest.java index 01ea2e99..ca502a2b 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/TitleTest.java @@ -23,102 +23,111 @@ import org.dkpro.jwpl.api.exception.WikiTitleParsingException; import org.junit.jupiter.api.Test; -public class TitleTest { - - @Test - public void titleTest1() throws WikiTitleParsingException { - - Title t = new Title("Car"); - assertEquals("Car", t.getEntity()); - assertEquals("Car", t.getPlainTitle()); - assertEquals("Car", t.getRawTitleText()); - assertNull(t.getDisambiguationText()); - assertNull(t.getSectionText()); - assertEquals("Car", t.getWikiStyleTitle()); - } - - @Test - public void titleTest2() throws WikiTitleParsingException { - - Title t = new Title("Car_(automobile)"); - assertEquals("Car", t.getEntity()); - assertEquals("Car (automobile)", t.getPlainTitle()); - assertEquals("Car_(automobile)", t.getRawTitleText()); - assertEquals("automobile", t.getDisambiguationText()); - assertNull(t.getSectionText()); - assertEquals("Car_(automobile)", t.getWikiStyleTitle()); - } - - @Test - public void titleTest3() throws WikiTitleParsingException { - - Title t = new Title("Car (automobile)"); - assertEquals("Car", t.getEntity()); - assertEquals("Car (automobile)", t.getPlainTitle()); - assertEquals("Car (automobile)", t.getRawTitleText()); - assertEquals("automobile", t.getDisambiguationText()); - assertNull(t.getSectionText()); - assertEquals("Car_(automobile)", t.getWikiStyleTitle()); - } - - @Test - public void titleTest4() throws WikiTitleParsingException { - - Title t = new Title("Car (automobile)#Introduction"); - assertEquals("Car", t.getEntity()); - assertEquals("Car (automobile)", t.getPlainTitle()); - assertEquals("Car (automobile)#Introduction", t.getRawTitleText()); - assertEquals("automobile", t.getDisambiguationText()); - assertEquals("Introduction", t.getSectionText()); - assertEquals("Car_(automobile)", t.getWikiStyleTitle()); - } - - @Test - public void titleTest5() throws WikiTitleParsingException { - - Title t = new Title("Car_(automobile)#Introduction"); - assertEquals("Car", t.getEntity()); - assertEquals("Car (automobile)", t.getPlainTitle()); - assertEquals("Car_(automobile)#Introduction", t.getRawTitleText()); - assertEquals("automobile", t.getDisambiguationText()); - assertEquals("Introduction", t.getSectionText()); - assertEquals("Car_(automobile)", 
t.getWikiStyleTitle()); - } +public class TitleTest +{ @Test - public void titleTest6() throws WikiTitleParsingException { + public void titleTest1() throws WikiTitleParsingException + { + + Title t = new Title("Car"); + assertEquals("Car", t.getEntity()); + assertEquals("Car", t.getPlainTitle()); + assertEquals("Car", t.getRawTitleText()); + assertNull(t.getDisambiguationText()); + assertNull(t.getSectionText()); + assertEquals("Car", t.getWikiStyleTitle()); + } + + @Test + public void titleTest2() throws WikiTitleParsingException + { + + Title t = new Title("Car_(automobile)"); + assertEquals("Car", t.getEntity()); + assertEquals("Car (automobile)", t.getPlainTitle()); + assertEquals("Car_(automobile)", t.getRawTitleText()); + assertEquals("automobile", t.getDisambiguationText()); + assertNull(t.getSectionText()); + assertEquals("Car_(automobile)", t.getWikiStyleTitle()); + } + + @Test + public void titleTest3() throws WikiTitleParsingException + { + + Title t = new Title("Car (automobile)"); + assertEquals("Car", t.getEntity()); + assertEquals("Car (automobile)", t.getPlainTitle()); + assertEquals("Car (automobile)", t.getRawTitleText()); + assertEquals("automobile", t.getDisambiguationText()); + assertNull(t.getSectionText()); + assertEquals("Car_(automobile)", t.getWikiStyleTitle()); + } + + @Test + public void titleTest4() throws WikiTitleParsingException + { + + Title t = new Title("Car (automobile)#Introduction"); + assertEquals("Car", t.getEntity()); + assertEquals("Car (automobile)", t.getPlainTitle()); + assertEquals("Car (automobile)#Introduction", t.getRawTitleText()); + assertEquals("automobile", t.getDisambiguationText()); + assertEquals("Introduction", t.getSectionText()); + assertEquals("Car_(automobile)", t.getWikiStyleTitle()); + } + + @Test + public void titleTest5() throws WikiTitleParsingException + { + + Title t = new Title("Car_(automobile)#Introduction"); + assertEquals("Car", t.getEntity()); + assertEquals("Car (automobile)", t.getPlainTitle()); + assertEquals("Car_(automobile)#Introduction", t.getRawTitleText()); + assertEquals("automobile", t.getDisambiguationText()); + assertEquals("Introduction", t.getSectionText()); + assertEquals("Car_(automobile)", t.getWikiStyleTitle()); + } + + @Test + public void titleTest6() throws WikiTitleParsingException + { Title t = new Title("Car#Introduction"); assertEquals("Car", t.getEntity()); assertEquals("Car", t.getPlainTitle()); assertEquals("Car#Introduction", t.getRawTitleText()); - assertNull(t.getDisambiguationText()); - assertEquals("Introduction", t.getSectionText()); + assertNull(t.getDisambiguationText()); + assertEquals("Introduction", t.getSectionText()); assertEquals("Car", t.getWikiStyleTitle()); } - @Test - public void titleTest7() throws WikiTitleParsingException { - - Title t = new Title("401(k)"); - assertEquals("401(k)", t.getEntity()); - assertEquals("401(k)", t.getPlainTitle()); - assertEquals("401(k)", t.getRawTitleText()); - assertNull(t.getDisambiguationText()); - assertNull(t.getSectionText()); - assertEquals("401(k)", t.getWikiStyleTitle()); - } - - @Test - public void titleTest8() throws WikiTitleParsingException { - - Title t = new Title("Ytterbium(III)_chloride_(data_page)"); - assertEquals("Ytterbium(III) chloride", t.getEntity()); - assertEquals("Ytterbium(III) chloride (data page)", t.getPlainTitle()); - assertEquals("Ytterbium(III)_chloride_(data_page)", t.getRawTitleText()); - assertEquals("data page", t.getDisambiguationText()); - assertNull(t.getSectionText()); - 
assertEquals("Ytterbium(III)_chloride_(data_page)", t.getWikiStyleTitle()); - } + @Test + public void titleTest7() throws WikiTitleParsingException + { + + Title t = new Title("401(k)"); + assertEquals("401(k)", t.getEntity()); + assertEquals("401(k)", t.getPlainTitle()); + assertEquals("401(k)", t.getRawTitleText()); + assertNull(t.getDisambiguationText()); + assertNull(t.getSectionText()); + assertEquals("401(k)", t.getWikiStyleTitle()); + } + + @Test + public void titleTest8() throws WikiTitleParsingException + { + + Title t = new Title("Ytterbium(III)_chloride_(data_page)"); + assertEquals("Ytterbium(III) chloride", t.getEntity()); + assertEquals("Ytterbium(III) chloride (data page)", t.getPlainTitle()); + assertEquals("Ytterbium(III)_chloride_(data_page)", t.getRawTitleText()); + assertEquals("data page", t.getDisambiguationText()); + assertNull(t.getSectionText()); + assertEquals("Ytterbium(III)_chloride_(data_page)", t.getWikiStyleTitle()); + } } diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikiConfigTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikiConfigTest.java index a921f16c..6e5a9e76 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikiConfigTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikiConfigTest.java @@ -22,18 +22,20 @@ import org.junit.jupiter.api.Test; import org.sweble.wikitext.engine.config.WikiConfig; -public class WikiConfigTest { +public class WikiConfigTest +{ @Test - public void testGetWikiConf() { + public void testGetWikiConf() + { WikiConfig portugueseConf = WikiConstants.Language.portuguese.getWikiconfig(); WikiConfig englishConf = WikiConstants.Language.english.getWikiconfig(); WikiConfig testConf = WikiConstants.Language._test.getWikiconfig(); WikiConfig frenchConf = WikiConstants.Language.french.getWikiconfig(); // assertion block - assertSame("pt", portugueseConf.getContentLanguage()); - assertSame("en", englishConf.getContentLanguage()); - assertSame("en", testConf.getContentLanguage()); - assertSame("fr", frenchConf.getContentLanguage()); + assertSame("pt", portugueseConf.getContentLanguage()); + assertSame("en", englishConf.getContentLanguage()); + assertSame("en", testConf.getContentLanguage()); + assertSame("fr", frenchConf.getContentLanguage()); } } diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java index 797dfca2..8b847bd3 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/WikipediaTest.java @@ -37,406 +37,483 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -public class WikipediaTest extends BaseJWPLTest{ - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private static Wikipedia wiki; - - private static final String A_FAMOUS_PAGE = "Exploring_the_Potential_of_Semantic_Relatedness_in_Information_Retrieval"; - private static final String A_FAMOUS_PAGE_CLEAN = "Exploring the Potential of Semantic Relatedness in Information Retrieval"; - private static final int A_FAMOUS_PAGE_ID = 1017; - - /** - * Made this static so that following tests don't run if assumption fails. 
- * (With AT_Before, tests also would not be executed but marked as passed) - * This could be changed back as soon as JUnit ignored tests after failed - * assumptions - */ - @BeforeAll - public static void setupWikipedia() { - DatabaseConfiguration db = obtainHSDLDBConfiguration(); - try { - wiki = new Wikipedia(db); - } catch (Exception e) { - fail("Wikipedia could not be initialized: "+e.getLocalizedMessage()); - } - } - - /* - * We test the returned pages with testing their pageId and their title. - * We also expect a WikiApiException to be thrown when trying to get non existing page. - */ - @Test - public void testGetPageByTitleVariations() { - getExistingPage(A_FAMOUS_PAGE_CLEAN, A_FAMOUS_PAGE_ID); - getExistingPage("Exploring_the_Potential_of_Semantic_Relatedness_in_Information_Retrieval", A_FAMOUS_PAGE_CLEAN, A_FAMOUS_PAGE_ID); - getExistingPage("exploring the Potential of Semantic Relatedness in Information Retrieval", A_FAMOUS_PAGE_CLEAN, A_FAMOUS_PAGE_ID); - getExistingPage("exploring_the_Potential_of_Semantic_Relatedness_in_Information_Retrieval", A_FAMOUS_PAGE_CLEAN, A_FAMOUS_PAGE_ID); - getExistingPage("TK2", 105); - getNotExistingPage("TK2 "); - getNotExistingPage(" TK2"); - getNotExistingPage("TK4"); - getNotExistingPage(""); - getExistingPage("UKP", 1041); - /* - * TODO the following pages should NOT be found. They are found due to case insensitive querying - */ -// getNotExistingPage("Ukp"); -// getNotExistingPage("UkP"); -// getNotExistingPage("uKP"); - } - - @Test - public void testGetPageByID() { - checkPage(A_FAMOUS_PAGE_ID, null); - } - - @Test - public void testGetPageByTitle() { - checkPage(null, A_FAMOUS_PAGE); - } - - @Test - public void testGetPageByExactTitle() { - try { - checkGetPageByExactTitle("UKP"); - } catch (WikiPageNotFoundException nfe) { - fail("Encountered WikiPageNotFoundException: " + nfe.getLocalizedMessage()); - } catch (WikiApiException ae) { - fail("Encountered WikiApiException: " + ae.getLocalizedMessage()); - } - } - - @Test - public void testGetPageByExactTitleNull() { - try { - checkGetPageByExactTitle(null); - } catch (WikiPageNotFoundException nfe) { - // this is expected here - } catch (WikiApiException ae) { - fail("Expected a WikiPageNotFoundException, yet encountered WikiApiException: " - + ae.getLocalizedMessage()); - } - } - - @Test - public void testGetPageByExactTitleEmpty() { - try { - checkGetPageByExactTitle(""); - } catch (WikiPageNotFoundException nfe) { - // this is expected here - } catch (WikiApiException ae) { - fail("Expected a WikiPageNotFoundException, yet encountered WikiApiException: " - + ae.getLocalizedMessage()); - } - } - - @Test - public void testGetPagesByTitle() { - try { - Set<Page> pages = wiki.getPages(A_FAMOUS_PAGE); - assertNotNull(pages); - assertEquals(1, pages.size()); - assertEquals(A_FAMOUS_PAGE_ID, pages.iterator().next().getPageId()); - } catch (WikiApiException e) { - logger.error(e.getLocalizedMessage(), e); - fail("Encountered WikiApiException: " + e.getLocalizedMessage()); - } - } - - @Test - public void testGetPageIdsByTitle() { - try { - List<Integer> pageIDs = wiki.getPageIds(A_FAMOUS_PAGE); - assertNotNull(pageIDs); - assertEquals(1, pageIDs.size()); - assertTrue(pageIDs.contains(A_FAMOUS_PAGE_ID)); - } catch (WikiApiException e) { - logger.error(e.getLocalizedMessage(), e); - fail("Encountered WikiApiException: " + e.getLocalizedMessage()); - } - } - - @Test - public void testGetPageIdsByTitleInvalid() { - try { - wiki.getPageIds(UUID.randomUUID().toString()); - } catch 
(WikiPageNotFoundException wpnfe) { - // this is expected here - } catch (WikiApiException wae) { - fail("Expected a WikiPageNotFoundException, yet encountered WikiApiException: " - + wae.getLocalizedMessage()); - } - } - - @Test - public void testGetPageIdsCaseInsensitive() { - try { - List<Integer> pageIDs = wiki.getPageIdsCaseInsensitive(A_FAMOUS_PAGE); - assertNotNull(pageIDs); - assertEquals(1, pageIDs.size()); - assertTrue(pageIDs.contains(A_FAMOUS_PAGE_ID)); - } catch (WikiApiException e) { - logger.error(e.getLocalizedMessage(), e); - fail("Encountered WikiApiException: " + e.getLocalizedMessage()); - } - } - - @Test - public void testGetPageIdsCaseInsensitiveInvalid() { - try { - wiki.getPageIdsCaseInsensitive(UUID.randomUUID().toString()); - } catch (WikiPageNotFoundException wpnfe) { - // this is expected here - } catch (WikiApiException wae) { - fail("Expected a WikiPageNotFoundException, yet encountered WikiApiException: " - + wae.getLocalizedMessage()); - } - } - - @Test - public void testExistsPageByTitle() { - assertTrue(wiki.existsPage(A_FAMOUS_PAGE)); - } - - @Test - public void testExistsPageByTitleInvalid() { - assertFalse(wiki.existsPage(A_FAMOUS_PAGE+"_")); - assertFalse(wiki.existsPage(A_FAMOUS_PAGE+" (X)")); - assertFalse(wiki.existsPage(" (X)")); - } - - @Test - public void testExistsPageByTitleNullOrEmpty() { - assertFalse(wiki.existsPage(null)); - assertFalse(wiki.existsPage("")); - } - - @Test - public void testExistsPageByID() { - assertTrue(wiki.existsPage(A_FAMOUS_PAGE_ID)); - } - - @Test - public void testExistsPageByIDInvalid1() { - assertFalse(wiki.existsPage(-42)); - } - - @Test - public void testExistsPageByIDInvalid2() { - assertFalse(wiki.existsPage(Integer.MAX_VALUE)); - } - - @Test - public void testGetTitleByID() { - try { - Title title = wiki.getTitle(A_FAMOUS_PAGE_ID); - assertNotNull(title); - assertEquals(A_FAMOUS_PAGE, title.getRawTitleText()); - assertEquals(A_FAMOUS_PAGE_CLEAN, title.getPlainTitle()); - } catch (WikiPageNotFoundException nfe) { - fail("Encountered WikiPageNotFoundException: " + nfe.getLocalizedMessage()); - } catch (WikiApiException ae) { - fail("Encountered WikiApiException: " + ae.getLocalizedMessage()); - } - } - - @Test - public void testGetTitleByIDInvalid() { - try { - wiki.getTitle(-42); - } catch (WikiPageNotFoundException wpnfe) { - // this is expected here - } catch (WikiApiException e) { - fail("Expected a WikiApiException, yet encountered WikiApiException: " - + e.getLocalizedMessage()); - } - } - - @Test - public void testGetPageIds() { - Iterable<Integer> iterable = wiki.getPageIds(); - assertNotNull(iterable); - } - - @Test - public void testGetPagesByPageQuery() { - try { - PageQuery query = new PageQuery(); - // expected: ONE match - query.setTitlePattern(A_FAMOUS_PAGE+"%"); - Iterable<Page> iterable = wiki.getPages(query); - assertNotNull(iterable); - Page page = iterable.iterator().next(); - assertEquals(A_FAMOUS_PAGE_ID, page.getPageId()); - - // expected: ONE match - query.setOnlyArticlePages(true); - iterable = wiki.getPages(query); - assertNotNull(iterable); - page = iterable.iterator().next(); - assertEquals(A_FAMOUS_PAGE_ID, page.getPageId()); - - // expected: NO match - query.setOnlyArticlePages(false); - query.setOnlyDisambiguationPages(true); - iterable = wiki.getPages(query); - assertNotNull(iterable); - assertFalse(iterable.iterator().hasNext()); - - // expected: NO match - query.setTitlePattern(A_FAMOUS_PAGE+"_"); - iterable = wiki.getPages(query); - assertNotNull(iterable); - 
assertFalse(iterable.iterator().hasNext()); - } catch (WikiApiException e) { - fail("Encountered WikiApiException: " + e.getLocalizedMessage()); - } - } - - @Test - public void testGetPageHibernateId() { - long objectID = wiki.__getPageHibernateId(A_FAMOUS_PAGE_ID); - assertTrue(objectID > 0); - - // query a 2nd time to validate caching of IDs - assertEquals(objectID, wiki.__getPageHibernateId(A_FAMOUS_PAGE_ID)); - } - - @Test - public void testGetPageHibernateIdInvalid1() { - long objectID = wiki.__getPageHibernateId(-42); - assertEquals(-1, objectID); - } - - @Test - public void testGetPageHibernateIdInvalid2() { - long objectID = wiki.__getPageHibernateId(Integer.MAX_VALUE); - assertEquals(-1, objectID); - } - - @Test - public void testGetCategoryInvalid1() { - assertNull(wiki.getCategory(-42)); - } - - @Test - public void testGetCategoryInvalid2() { - assertNull(wiki.getCategory(Integer.MAX_VALUE)); - } - - @Test - public void testGetCategoriesByPageTitle() { - int expectedCategoryPageId = 9; - String expectedCategoryTitle = "Publications of UKP"; - try { - Set<Category> categories = wiki.getCategories(A_FAMOUS_PAGE); - assertNotNull(categories); - assertFalse(categories.isEmpty()); - assertEquals(1, categories.size()); - Category c = categories.iterator().next(); - assertNotNull(c); - assertEquals(expectedCategoryPageId, c.getPageId()); - assertEquals(expectedCategoryTitle, c.getTitle().toString()); - } catch (WikiTitleParsingException e) { - fail("A WikiTitleParsingException occurred while getting the categories of a page by its title"); - } catch (WikiPageNotFoundException e) { - fail("A WikiPageNotFoundException occurred while getting the categories of a page by its title"); - } - } - - @Test - public void testGetCategoriesByPageTitleInvalid1() { - try { - wiki.getCategories(""); - } catch (WikiPageNotFoundException wpnfe) { - // this is expected here - } catch (RuntimeException re) { - fail("Expected a WikiPageNotFoundException, yet encountered RuntimeException: " + re.getLocalizedMessage()); - } - } - - @Test - public void testGetCategoriesByPageTitleInvalid2() { - try { - wiki.getCategories(null); - } catch (WikiPageNotFoundException wpnfe) { - // this is expected here - } catch (RuntimeException re) { - fail("Expected a WikiPageNotFoundException, yet encountered RuntimeException: " + re.getLocalizedMessage()); - } - } - - - @Test - public void testGetLanguage() { - assertNotNull(wiki.getLanguage()); - } - - /* INTERNAL TEST HELPER METHODS */ - - private void getNotExistingPage(String title) { - boolean exceptionThrown = false; - try { - wiki.getPage(title); - } catch (WikiApiException e) { - exceptionThrown = true; - } - assertTrue(exceptionThrown, "Testing the WikiApiException for non existing page: " + title); - } - - private void getExistingPage(String title, int pageId) { +public class WikipediaTest + extends BaseJWPLTest +{ + + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + private static Wikipedia wiki; + + private static final String A_FAMOUS_PAGE = "Exploring_the_Potential_of_Semantic_Relatedness_in_Information_Retrieval"; + private static final String A_FAMOUS_PAGE_CLEAN = "Exploring the Potential of Semantic Relatedness in Information Retrieval"; + private static final int A_FAMOUS_PAGE_ID = 1017; + + /** + * Made this static so that following tests don't run if assumption fails. 
(With AT_Before, + * tests also would not be executed but marked as passed) This could be changed back as soon as + * JUnit ignored tests after failed assumptions + */ + @BeforeAll + public static void setupWikipedia() + { + DatabaseConfiguration db = obtainHSDLDBConfiguration(); + try { + wiki = new Wikipedia(db); + } + catch (Exception e) { + fail("Wikipedia could not be initialized: " + e.getLocalizedMessage()); + } + } + + /* + * We test the returned pages with testing their pageId and their title. We also expect a + * WikiApiException to be thrown when trying to get non existing page. + */ + @Test + public void testGetPageByTitleVariations() + { + getExistingPage(A_FAMOUS_PAGE_CLEAN, A_FAMOUS_PAGE_ID); + getExistingPage("Exploring_the_Potential_of_Semantic_Relatedness_in_Information_Retrieval", + A_FAMOUS_PAGE_CLEAN, A_FAMOUS_PAGE_ID); + getExistingPage("exploring the Potential of Semantic Relatedness in Information Retrieval", + A_FAMOUS_PAGE_CLEAN, A_FAMOUS_PAGE_ID); + getExistingPage("exploring_the_Potential_of_Semantic_Relatedness_in_Information_Retrieval", + A_FAMOUS_PAGE_CLEAN, A_FAMOUS_PAGE_ID); + getExistingPage("TK2", 105); + getNotExistingPage("TK2 "); + getNotExistingPage(" TK2"); + getNotExistingPage("TK4"); + getNotExistingPage(""); + getExistingPage("UKP", 1041); + /* + * TODO the following pages should NOT be found. They are found due to case insensitive + * querying + */ + // getNotExistingPage("Ukp"); + // getNotExistingPage("UkP"); + // getNotExistingPage("uKP"); + } + + @Test + public void testGetPageByID() + { + checkPage(A_FAMOUS_PAGE_ID, null); + } + + @Test + public void testGetPageByTitle() + { + checkPage(null, A_FAMOUS_PAGE); + } + + @Test + public void testGetPageByExactTitle() + { + try { + checkGetPageByExactTitle("UKP"); + } + catch (WikiPageNotFoundException nfe) { + fail("Encountered WikiPageNotFoundException: " + nfe.getLocalizedMessage()); + } + catch (WikiApiException ae) { + fail("Encountered WikiApiException: " + ae.getLocalizedMessage()); + } + } + + @Test + public void testGetPageByExactTitleNull() + { + try { + checkGetPageByExactTitle(null); + } + catch (WikiPageNotFoundException nfe) { + // this is expected here + } + catch (WikiApiException ae) { + fail("Expected a WikiPageNotFoundException, yet encountered WikiApiException: " + + ae.getLocalizedMessage()); + } + } + + @Test + public void testGetPageByExactTitleEmpty() + { + try { + checkGetPageByExactTitle(""); + } + catch (WikiPageNotFoundException nfe) { + // this is expected here + } + catch (WikiApiException ae) { + fail("Expected a WikiPageNotFoundException, yet encountered WikiApiException: " + + ae.getLocalizedMessage()); + } + } + + @Test + public void testGetPagesByTitle() + { + try { + Set<Page> pages = wiki.getPages(A_FAMOUS_PAGE); + assertNotNull(pages); + assertEquals(1, pages.size()); + assertEquals(A_FAMOUS_PAGE_ID, pages.iterator().next().getPageId()); + } + catch (WikiApiException e) { + logger.error(e.getLocalizedMessage(), e); + fail("Encountered WikiApiException: " + e.getLocalizedMessage()); + } + } + + @Test + public void testGetPageIdsByTitle() + { + try { + List<Integer> pageIDs = wiki.getPageIds(A_FAMOUS_PAGE); + assertNotNull(pageIDs); + assertEquals(1, pageIDs.size()); + assertTrue(pageIDs.contains(A_FAMOUS_PAGE_ID)); + } + catch (WikiApiException e) { + logger.error(e.getLocalizedMessage(), e); + fail("Encountered WikiApiException: " + e.getLocalizedMessage()); + } + } + + @Test + public void testGetPageIdsByTitleInvalid() + { + try { + 
wiki.getPageIds(UUID.randomUUID().toString()); + } + catch (WikiPageNotFoundException wpnfe) { + // this is expected here + } + catch (WikiApiException wae) { + fail("Expected a WikiPageNotFoundException, yet encountered WikiApiException: " + + wae.getLocalizedMessage()); + } + } + + @Test + public void testGetPageIdsCaseInsensitive() + { + try { + List<Integer> pageIDs = wiki.getPageIdsCaseInsensitive(A_FAMOUS_PAGE); + assertNotNull(pageIDs); + assertEquals(1, pageIDs.size()); + assertTrue(pageIDs.contains(A_FAMOUS_PAGE_ID)); + } + catch (WikiApiException e) { + logger.error(e.getLocalizedMessage(), e); + fail("Encountered WikiApiException: " + e.getLocalizedMessage()); + } + } + + @Test + public void testGetPageIdsCaseInsensitiveInvalid() + { + try { + wiki.getPageIdsCaseInsensitive(UUID.randomUUID().toString()); + } + catch (WikiPageNotFoundException wpnfe) { + // this is expected here + } + catch (WikiApiException wae) { + fail("Expected a WikiPageNotFoundException, yet encountered WikiApiException: " + + wae.getLocalizedMessage()); + } + } + + @Test + public void testExistsPageByTitle() + { + assertTrue(wiki.existsPage(A_FAMOUS_PAGE)); + } + + @Test + public void testExistsPageByTitleInvalid() + { + assertFalse(wiki.existsPage(A_FAMOUS_PAGE + "_")); + assertFalse(wiki.existsPage(A_FAMOUS_PAGE + " (X)")); + assertFalse(wiki.existsPage(" (X)")); + } + + @Test + public void testExistsPageByTitleNullOrEmpty() + { + assertFalse(wiki.existsPage(null)); + assertFalse(wiki.existsPage("")); + } + + @Test + public void testExistsPageByID() + { + assertTrue(wiki.existsPage(A_FAMOUS_PAGE_ID)); + } + + @Test + public void testExistsPageByIDInvalid1() + { + assertFalse(wiki.existsPage(-42)); + } + + @Test + public void testExistsPageByIDInvalid2() + { + assertFalse(wiki.existsPage(Integer.MAX_VALUE)); + } + + @Test + public void testGetTitleByID() + { + try { + Title title = wiki.getTitle(A_FAMOUS_PAGE_ID); + assertNotNull(title); + assertEquals(A_FAMOUS_PAGE, title.getRawTitleText()); + assertEquals(A_FAMOUS_PAGE_CLEAN, title.getPlainTitle()); + } + catch (WikiPageNotFoundException nfe) { + fail("Encountered WikiPageNotFoundException: " + nfe.getLocalizedMessage()); + } + catch (WikiApiException ae) { + fail("Encountered WikiApiException: " + ae.getLocalizedMessage()); + } + } + + @Test + public void testGetTitleByIDInvalid() + { + try { + wiki.getTitle(-42); + } + catch (WikiPageNotFoundException wpnfe) { + // this is expected here + } + catch (WikiApiException e) { + fail("Expected a WikiApiException, yet encountered WikiApiException: " + + e.getLocalizedMessage()); + } + } + + @Test + public void testGetPageIds() + { + Iterable<Integer> iterable = wiki.getPageIds(); + assertNotNull(iterable); + } + + @Test + public void testGetPagesByPageQuery() + { + try { + PageQuery query = new PageQuery(); + // expected: ONE match + query.setTitlePattern(A_FAMOUS_PAGE + "%"); + Iterable<Page> iterable = wiki.getPages(query); + assertNotNull(iterable); + Page page = iterable.iterator().next(); + assertEquals(A_FAMOUS_PAGE_ID, page.getPageId()); + + // expected: ONE match + query.setOnlyArticlePages(true); + iterable = wiki.getPages(query); + assertNotNull(iterable); + page = iterable.iterator().next(); + assertEquals(A_FAMOUS_PAGE_ID, page.getPageId()); + + // expected: NO match + query.setOnlyArticlePages(false); + query.setOnlyDisambiguationPages(true); + iterable = wiki.getPages(query); + assertNotNull(iterable); + assertFalse(iterable.iterator().hasNext()); + + // expected: NO match + 
query.setTitlePattern(A_FAMOUS_PAGE + "_"); + iterable = wiki.getPages(query); + assertNotNull(iterable); + assertFalse(iterable.iterator().hasNext()); + } + catch (WikiApiException e) { + fail("Encountered WikiApiException: " + e.getLocalizedMessage()); + } + } + + @Test + public void testGetPageHibernateId() + { + long objectID = wiki.__getPageHibernateId(A_FAMOUS_PAGE_ID); + assertTrue(objectID > 0); + + // query a 2nd time to validate caching of IDs + assertEquals(objectID, wiki.__getPageHibernateId(A_FAMOUS_PAGE_ID)); + } + + @Test + public void testGetPageHibernateIdInvalid1() + { + long objectID = wiki.__getPageHibernateId(-42); + assertEquals(-1, objectID); + } + + @Test + public void testGetPageHibernateIdInvalid2() + { + long objectID = wiki.__getPageHibernateId(Integer.MAX_VALUE); + assertEquals(-1, objectID); + } + + @Test + public void testGetCategoryInvalid1() + { + assertNull(wiki.getCategory(-42)); + } + + @Test + public void testGetCategoryInvalid2() + { + assertNull(wiki.getCategory(Integer.MAX_VALUE)); + } + + @Test + public void testGetCategoriesByPageTitle() + { + int expectedCategoryPageId = 9; + String expectedCategoryTitle = "Publications of UKP"; + try { + Set<Category> categories = wiki.getCategories(A_FAMOUS_PAGE); + assertNotNull(categories); + assertFalse(categories.isEmpty()); + assertEquals(1, categories.size()); + Category c = categories.iterator().next(); + assertNotNull(c); + assertEquals(expectedCategoryPageId, c.getPageId()); + assertEquals(expectedCategoryTitle, c.getTitle().toString()); + } + catch (WikiTitleParsingException e) { + fail("A WikiTitleParsingException occurred while getting the categories of a page by its title"); + } + catch (WikiPageNotFoundException e) { + fail("A WikiPageNotFoundException occurred while getting the categories of a page by its title"); + } + } + + @Test + public void testGetCategoriesByPageTitleInvalid1() + { + try { + wiki.getCategories(""); + } + catch (WikiPageNotFoundException wpnfe) { + // this is expected here + } + catch (RuntimeException re) { + fail("Expected a WikiPageNotFoundException, yet encountered RuntimeException: " + + re.getLocalizedMessage()); + } + } + + @Test + public void testGetCategoriesByPageTitleInvalid2() + { + try { + wiki.getCategories(null); + } + catch (WikiPageNotFoundException wpnfe) { + // this is expected here + } + catch (RuntimeException re) { + fail("Expected a WikiPageNotFoundException, yet encountered RuntimeException: " + + re.getLocalizedMessage()); + } + } + + @Test + public void testGetLanguage() + { + assertNotNull(wiki.getLanguage()); + } + + /* INTERNAL TEST HELPER METHODS */ + + private void getNotExistingPage(String title) + { + boolean exceptionThrown = false; + try { + wiki.getPage(title); + } + catch (WikiApiException e) { + exceptionThrown = true; + } + assertTrue(exceptionThrown, "Testing the WikiApiException for non existing page: " + title); + } + + private void getExistingPage(String title, int pageId) + { getExistingPage(title, title, pageId); } - private void getExistingPage(String keyword, String title, int pageId) { - Page p = null; - try { - p = wiki.getPage(keyword); - } catch (WikiApiException e) { - fail("A WikiApiException occurred while getting the page: '" + keyword + "'", e); - } - - assertEquals(pageId, p.getPageId(), "testing the pageId of '" + title + "'"); - - try { - assertEquals(title.trim(), p.getTitle().toString(), "testing the title of '" + title + "'"); - } catch (WikiTitleParsingException e) { - fail("A WikiTitleParsingException occurred 
while getting the title of " + title, e); - } - } - - private void checkPage(Integer pageID, String pageTitle) { - Page page; - try { - if(pageID != null) { - page = wiki.getPage(pageID); - } else if(pageTitle!=null) { - page = wiki.getPage(pageTitle); - } else { - throw new WikiApiException("Neither pageId nor pageTitle were used to get a Page?!"); - } - assertNotNull(page); - assertEquals(A_FAMOUS_PAGE_ID, page.getPageId()); - Title title = page.getTitle(); - assertNotNull(title); - assertEquals(A_FAMOUS_PAGE, title.getRawTitleText()); - assertEquals(A_FAMOUS_PAGE_CLEAN, title.getPlainTitle()); - } catch (WikiPageNotFoundException nfe) { - fail("Encountered WikiPageNotFoundException: " + nfe.getLocalizedMessage(), nfe); - } catch (WikiApiException ae) { - fail("Encountered WikiApiException: " + ae.getLocalizedMessage(), ae); - } - } - - private void checkGetPageByExactTitle(String pageTitle) throws WikiApiException { - Page page = wiki.getPageByExactTitle(pageTitle); - assertNotNull(page); - assertEquals(1041, page.getPageId()); - Title title = page.getTitle(); - assertNotNull(title); - assertEquals("UKP", title.getRawTitleText()); - assertEquals("UKP", title.getPlainTitle()); - } + private void getExistingPage(String keyword, String title, int pageId) + { + Page p = null; + try { + p = wiki.getPage(keyword); + } + catch (WikiApiException e) { + fail("A WikiApiException occurred while getting the page: '" + keyword + "'", e); + } + + assertEquals(pageId, p.getPageId(), "testing the pageId of '" + title + "'"); + + try { + assertEquals(title.trim(), p.getTitle().toString(), + "testing the title of '" + title + "'"); + } + catch (WikiTitleParsingException e) { + fail("A WikiTitleParsingException occurred while getting the title of " + title, e); + } + } + + private void checkPage(Integer pageID, String pageTitle) + { + Page page; + try { + if (pageID != null) { + page = wiki.getPage(pageID); + } + else if (pageTitle != null) { + page = wiki.getPage(pageTitle); + } + else { + throw new WikiApiException( + "Neither pageId nor pageTitle were used to get a Page?!"); + } + assertNotNull(page); + assertEquals(A_FAMOUS_PAGE_ID, page.getPageId()); + Title title = page.getTitle(); + assertNotNull(title); + assertEquals(A_FAMOUS_PAGE, title.getRawTitleText()); + assertEquals(A_FAMOUS_PAGE_CLEAN, title.getPlainTitle()); + } + catch (WikiPageNotFoundException nfe) { + fail("Encountered WikiPageNotFoundException: " + nfe.getLocalizedMessage(), nfe); + } + catch (WikiApiException ae) { + fail("Encountered WikiApiException: " + ae.getLocalizedMessage(), ae); + } + } + + private void checkGetPageByExactTitle(String pageTitle) throws WikiApiException + { + Page page = wiki.getPageByExactTitle(pageTitle); + assertNotNull(page); + assertEquals(1041, page.getPageId()); + Title title = page.getTitle(); + assertNotNull(title); + assertEquals("UKP", title.getRawTitleText()); + assertEquals("UKP", title.getPlainTitle()); + } } diff --git a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/util/GraphSerializationTest.java b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/util/GraphSerializationTest.java index 7a4cc8d4..b47449f3 100644 --- a/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/util/GraphSerializationTest.java +++ b/dkpro-jwpl-api/src/test/java/org/dkpro/jwpl/api/util/GraphSerializationTest.java @@ -39,136 +39,140 @@ * Tests for the correctness of the Category graph construction and its serialization<br> * process. 
*/ -public class GraphSerializationTest extends BaseJWPLTest { +public class GraphSerializationTest + extends BaseJWPLTest +{ final String serializationFileName = "testCategoryGraph.ser"; private static Wikipedia wiki; /** - * Made this static so that following tests don't run if assumption fails. - * (With AT_Before, tests also would not be executed but marked as passed) - * This could be changed back as soon as JUnit ignored tests after failed - * assumptions + * Made this static so that following tests don't run if assumption fails. (With AT_Before, + * tests also would not be executed but marked as passed) This could be changed back as soon as + * JUnit ignored tests after failed assumptions */ @BeforeAll - public static void setupWikipedia() { + public static void setupWikipedia() + { DatabaseConfiguration db = obtainHSDLDBConfiguration(); try { wiki = new Wikipedia(db); - } catch (Exception e) { + } + catch (Exception e) { fail("Wikipedia could not be initialized."); } } - @BeforeEach - public void cleanupBeforeTest() { + public void cleanupBeforeTest() + { File serializationFile = new File(serializationFileName); serializationFile.delete(); } @AfterEach - public void cleanupAfterTest() { + public void cleanupAfterTest() + { File serializationFile = new File(serializationFileName); serializationFile.delete(); } - /** * Creates a CategoryGraph object using the Wikipedia object as parameter.<br> * Tests the correctness of the constructed graph. */ @Test - public void testGraphSerialization() { - assertDoesNotThrow(() -> { - CategoryGraph sourceGraph = CategoryGraphManager - .getCategoryGraph(wiki); - testGraph(sourceGraph.getGraph()); - sourceGraph.saveGraph(serializationFileName); - - CategoryGraph loadedGraph = new CategoryGraph(wiki, new File( - serializationFileName)); - testGraph(loadedGraph.getGraph()); + public void testGraphSerialization() + { + assertDoesNotThrow(() -> { + CategoryGraph sourceGraph = CategoryGraphManager.getCategoryGraph(wiki); + testGraph(sourceGraph.getGraph()); + sourceGraph.saveGraph(serializationFileName); + + CategoryGraph loadedGraph = new CategoryGraph(wiki, new File(serializationFileName)); + testGraph(loadedGraph.getGraph()); }); } /** * Compares the given graph with the expected graph. Returns true only if both<br> * graphs are identical. 
+ * * @param graph */ - private void testGraph(DefaultDirectedGraph<Integer,DefaultEdge> graph){ - //make sure all vertices are there - for(int i=1;i<16;i++){ - if(!graph.containsVertex(i)) { + private void testGraph(DefaultDirectedGraph<Integer, DefaultEdge> graph) + { + // make sure all vertices are there + for (int i = 1; i < 16; i++) { + if (!graph.containsVertex(i)) { fail("Graph does not contain vertex " + i); } } - if(!graph.containsVertex(30)) { + if (!graph.containsVertex(30)) { fail("Graph does not contain vertex " + 200); } - if(!graph.containsVertex(200)) { + if (!graph.containsVertex(200)) { fail("Graph does not contain vertex " + 200); } - //make sure there are no supplemental vertices + // make sure there are no supplemental vertices assertEquals(17, graph.vertexSet().size()); - //make sure all edges are there - if(!graph.containsEdge(1,200)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(1,2)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(1,4)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(1,3)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(1,5)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(3,6)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(4,9)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(5,8)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(6,9)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(6,8)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(6,7)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(7,11)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(7,10)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(8,15)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(8,13)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(8,14)) { - fail("Graph does not contain edge"); - } - if(!graph.containsEdge(8,12)) { - fail("Graph does not contain edge"); - } - - //make sure there no supplemental edges + // make sure all edges are there + if (!graph.containsEdge(1, 200)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(1, 2)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(1, 4)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(1, 3)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(1, 5)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(3, 6)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(4, 9)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(5, 8)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(6, 9)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(6, 8)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(6, 7)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(7, 11)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(7, 10)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(8, 15)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(8, 13)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(8, 14)) { + fail("Graph does not contain edge"); + } + if (!graph.containsEdge(8, 12)) { + fail("Graph does not contain edge"); + } + + 
// make sure there no supplemental edges assertEquals(17, graph.edgeSet().size()); } From 6ae5a8d5fc7270e5b72e9d6b8a8217180b490023 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho <richard.eckart@gmail.com> Date: Tue, 31 Oct 2023 14:25:56 +0100 Subject: [PATCH 06/14] #164 - Introduce checkstyle - Auto-format dkpro-jwpl-datamachine --- .../datamachine/domain/DataMachineFiles.java | 326 +++++---- .../domain/DataMachineGenerator.java | 204 +++--- .../datamachine/domain/JWPLDataMachine.java | 97 +-- .../version/SingleDumpVersionJDKGeneric.java | 416 ++++++----- .../SingleDumpVersionJDKIntKeyFactory.java | 25 +- .../SingleDumpVersionJDKLongKeyFactory.java | 27 +- .../SingleDumpVersionJDKStringKeyFactory.java | 27 +- .../version/SingleDumpVersionOriginal.java | 686 +++++++++--------- .../dump/xml/BinaryDumpTableInputStream.java | 26 +- .../dump/xml/DataMachineRevisionParser.java | 26 +- .../dump/xml/SimpleBinaryDumpWriter.java | 235 +++--- .../dump/xml/SimpleXmlDumpReader.java | 59 +- .../jwpl/datamachine/dump/xml/XML2Binary.java | 43 +- .../file/DeleteFilesAtShutdown.java | 73 +- 14 files changed, 1206 insertions(+), 1064 deletions(-) diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineFiles.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineFiles.java index 1ae33455..d472e6b0 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineFiles.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineFiles.java @@ -22,160 +22,176 @@ import org.dkpro.jwpl.wikimachine.debug.ILogger; import org.dkpro.jwpl.wikimachine.domain.Files; -public class DataMachineFiles extends Files { - private final static String INPUT_PAGELINKS = "pagelinks.sql"; - private final static String INPUT_PAGESARTICLES = "pages-articles.xml"; - private final static String INPUT_CATEGORYLINKS = "categorylinks.sql"; - private final static String INPUT_PAGESMETACURRENT = "pages-meta-current.xml"; - - private final static String GENERATED_PAGE = "page.bin"; - private final static String GENERATED_REVISION = "revision.bin"; - private final static String GENERATED_TEXT = "text.bin"; - /* - * discussions.bin is currently unused. 
Discussions are put in pages.bin - */ - private final static String GENERATED_DISCUSSIONS = "discussions.bin"; - - private final static String ARCHIVE_EXTENSION = ".gz"; - - private File dataDirectory = new File("."); - private boolean compressGeneratedFiles = false; - - private File inputPagelinks = null; - private File inputPagesarticles = null; - private File inputCategorylinks = null; - private File inputPagesMetaCurrent = null; - - public DataMachineFiles(ILogger logger) { - super(logger); - outputDirectory = setOutputDirectory(dataDirectory); - } - - public DataMachineFiles(DataMachineFiles files) { - super(files); - this.dataDirectory = files.dataDirectory; - this.compressGeneratedFiles = files.compressGeneratedFiles; - this.inputPagelinks = files.inputPagelinks; - this.inputPagesarticles = files.inputPagesarticles; - this.inputCategorylinks = files.inputCategorylinks; - this.inputPagesMetaCurrent = files.inputPagesMetaCurrent; - } - - private File setOutputDirectory(File parentDirectory) { - File result = new File(parentDirectory.getAbsolutePath() - + File.separator + OUTPUT_DIRECTORY); - - return result; - } - - public void setDataDirectory(String newDataDirectory) { - File inputDataDirectory = new File(newDataDirectory); - if (inputDataDirectory.isDirectory()) { - this.dataDirectory = inputDataDirectory; - this.outputDirectory = setOutputDirectory(dataDirectory); - } else { - logger.log(dataDirectory - + " is not a directory. Continue read from " - + this.dataDirectory.getAbsolutePath()); - } - - } - - public boolean checkDatamachineSourceFiles() { - File[] filesInDataDirectory = dataDirectory.listFiles(); - if (filesInDataDirectory.length > 2) { - for (File currentFile : filesInDataDirectory) { - - //TODO improve file check. Only accept files that come in a supported compression format - String currentFileName = currentFile.getName(); - if (currentFileName.contains(INPUT_PAGESARTICLES)) { - inputPagesarticles = currentFile; - } else if (currentFileName.contains(INPUT_PAGELINKS)) { - inputPagelinks = currentFile; - } else if (currentFileName.contains(INPUT_CATEGORYLINKS)) { - inputCategorylinks = currentFile; - } else if (currentFileName.contains(INPUT_PAGESMETACURRENT)) { - inputPagesMetaCurrent = currentFile; +public class DataMachineFiles + extends Files +{ + private final static String INPUT_PAGELINKS = "pagelinks.sql"; + private final static String INPUT_PAGESARTICLES = "pages-articles.xml"; + private final static String INPUT_CATEGORYLINKS = "categorylinks.sql"; + private final static String INPUT_PAGESMETACURRENT = "pages-meta-current.xml"; + + private final static String GENERATED_PAGE = "page.bin"; + private final static String GENERATED_REVISION = "revision.bin"; + private final static String GENERATED_TEXT = "text.bin"; + /* + * discussions.bin is currently unused. 
Discussions are put in pages.bin + */ + private final static String GENERATED_DISCUSSIONS = "discussions.bin"; + + private final static String ARCHIVE_EXTENSION = ".gz"; + + private File dataDirectory = new File("."); + private boolean compressGeneratedFiles = false; + + private File inputPagelinks = null; + private File inputPagesarticles = null; + private File inputCategorylinks = null; + private File inputPagesMetaCurrent = null; + + public DataMachineFiles(ILogger logger) + { + super(logger); + outputDirectory = setOutputDirectory(dataDirectory); + } + + public DataMachineFiles(DataMachineFiles files) + { + super(files); + this.dataDirectory = files.dataDirectory; + this.compressGeneratedFiles = files.compressGeneratedFiles; + this.inputPagelinks = files.inputPagelinks; + this.inputPagesarticles = files.inputPagesarticles; + this.inputCategorylinks = files.inputCategorylinks; + this.inputPagesMetaCurrent = files.inputPagesMetaCurrent; + } + + private File setOutputDirectory(File parentDirectory) + { + File result = new File( + parentDirectory.getAbsolutePath() + File.separator + OUTPUT_DIRECTORY); + + return result; + } + + public void setDataDirectory(String newDataDirectory) + { + File inputDataDirectory = new File(newDataDirectory); + if (inputDataDirectory.isDirectory()) { + this.dataDirectory = inputDataDirectory; + this.outputDirectory = setOutputDirectory(dataDirectory); + } + else { + logger.log(dataDirectory + " is not a directory. Continue read from " + + this.dataDirectory.getAbsolutePath()); + } + + } + + public boolean checkDatamachineSourceFiles() + { + File[] filesInDataDirectory = dataDirectory.listFiles(); + if (filesInDataDirectory.length > 2) { + for (File currentFile : filesInDataDirectory) { + + // TODO improve file check. Only accept files that come in a supported compression + // format + String currentFileName = currentFile.getName(); + if (currentFileName.contains(INPUT_PAGESARTICLES)) { + inputPagesarticles = currentFile; + } + else if (currentFileName.contains(INPUT_PAGELINKS)) { + inputPagelinks = currentFile; + } + else if (currentFileName.contains(INPUT_CATEGORYLINKS)) { + inputCategorylinks = currentFile; + } + else if (currentFileName.contains(INPUT_PAGESMETACURRENT)) { + inputPagesMetaCurrent = currentFile; + } + } + } + // either inputPagesarticles or inputPagesMetaCurrent have to be placed + // in the input directory + return !((inputPagesarticles == null && inputPagesMetaCurrent == null) + || inputPagelinks == null || inputCategorylinks == null); + } + + public String getGeneratedPage() + { + return getGeneratedPath(GENERATED_PAGE); + } + + public String getGeneratedRevision() + { + return getGeneratedPath(GENERATED_REVISION); + } + + public String getGeneratedText() + { + return getGeneratedPath(GENERATED_TEXT); + } + + public String getGeneratedDiscussions() + { + return getGeneratedPath(GENERATED_DISCUSSIONS); + } + + public String getInputPageLinks() + { + return (inputPagelinks != null) ? inputPagelinks.getAbsolutePath() : null; + } + + public String getInputPagesArticles() + { + return (inputPagesarticles != null) ? inputPagesarticles.getAbsolutePath() : null; + } + + public String getInputCategoryLinks() + { + return (inputCategorylinks != null) ? inputCategorylinks.getAbsolutePath() : null; + } + + public String getInputPagesMetaCurrent() + { + return (inputPagesMetaCurrent != null) ? 
inputPagesMetaCurrent.getAbsolutePath() : null; + } + + private String getGeneratedPath(String fileName) + { + String path = dataDirectory.getAbsolutePath() + File.separator + fileName; + if (compressGeneratedFiles) { + path = path.concat(ARCHIVE_EXTENSION); } - } - } - // either inputPagesarticles or inputPagesMetaCurrent have to be placed - // in the input directory - return !((inputPagesarticles == null && inputPagesMetaCurrent == null) - || inputPagelinks == null || inputCategorylinks == null); - } - - public String getGeneratedPage() { - return getGeneratedPath(GENERATED_PAGE); - } - - public String getGeneratedRevision() { - return getGeneratedPath(GENERATED_REVISION); - } - - public String getGeneratedText() { - return getGeneratedPath(GENERATED_TEXT); - } - - public String getGeneratedDiscussions() { - return getGeneratedPath(GENERATED_DISCUSSIONS); - } - - public String getInputPageLinks() { - return (inputPagelinks != null) ? inputPagelinks.getAbsolutePath() - : null; - } - - public String getInputPagesArticles() { - return (inputPagesarticles != null) ? inputPagesarticles - .getAbsolutePath() : null; - } - - public String getInputCategoryLinks() { - return (inputCategorylinks != null) ? inputCategorylinks - .getAbsolutePath() : null; - } - - public String getInputPagesMetaCurrent() { - return (inputPagesMetaCurrent != null) ? inputPagesMetaCurrent - .getAbsolutePath() : null; - } - - - private String getGeneratedPath(String fileName) { - String path = dataDirectory.getAbsolutePath() + File.separator - + fileName; - if (compressGeneratedFiles) { - path = path.concat(ARCHIVE_EXTENSION); - } - return path; - } - - /** - * @see DataMachineFiles#setCompressGeneratedFiles(boolean) - */ - public boolean isCompressGeneratedFiles() { - return compressGeneratedFiles; - } - - /** - * Set the input parameter to {@code true} it you want to GZip the temporary - * files and save a disk space. <b>Attention:</b> {@code DataInputStream} - * can have problems reading from a compressed file. This can be a reason - * for strange side effects like heap overflow or some other exceptions. <br> - * For UKP-Developers: you can save much more disk space if you'll parse the - * page-articles XML Dump every time you need it: during processPage(), - * processRevision() and processText(). See TimeMachine solution especially - * the package org.dkpro.jwpl.timemachine.dump.xml - * - * @param compressGeneratedFiles - */ - public void setCompressGeneratedFiles(boolean compressGeneratedFiles) { - this.compressGeneratedFiles = compressGeneratedFiles; - } - - @Override - public boolean checkAll() { - return checkOutputDirectory() && checkDatamachineSourceFiles(); - } + return path; + } + + /** + * @see DataMachineFiles#setCompressGeneratedFiles(boolean) + */ + public boolean isCompressGeneratedFiles() + { + return compressGeneratedFiles; + } + + /** + * Set the input parameter to {@code true} it you want to GZip the temporary files and save a + * disk space. <b>Attention:</b> {@code DataInputStream} can have problems reading from a + * compressed file. This can be a reason for strange side effects like heap overflow or some + * other exceptions. <br> + * For UKP-Developers: you can save much more disk space if you'll parse the page-articles XML + * Dump every time you need it: during processPage(), processRevision() and processText(). 
See + * TimeMachine solution especially the package org.dkpro.jwpl.timemachine.dump.xml + * + * @param compressGeneratedFiles + */ + public void setCompressGeneratedFiles(boolean compressGeneratedFiles) + { + this.compressGeneratedFiles = compressGeneratedFiles; + } + + @Override + public boolean checkAll() + { + return checkOutputDirectory() && checkDatamachineSourceFiles(); + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineGenerator.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineGenerator.java index e6f1c4b4..8b4d6304 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineGenerator.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/DataMachineGenerator.java @@ -35,131 +35,141 @@ /** * Transforms a database from mediawiki format to JWPL format.<br> - * The transformation produces .txt files for the different tables in the JWPL - * database. + * The transformation produces .txt files for the different tables in the JWPL database. */ -public class DataMachineGenerator extends AbstractSnapshotGenerator { +public class DataMachineGenerator + extends AbstractSnapshotGenerator +{ - DataMachineFiles files = null; - IDumpVersion version = null; + DataMachineFiles files = null; + IDumpVersion version = null; - public DataMachineGenerator(IEnvironmentFactory environmentFactory) { - super(environmentFactory); - } - - @Override - public void setFiles(Files files) { - this.files = (DataMachineFiles) files; - } - - @Override - public void start() throws Exception { - version = environmentFactory.getDumpVersion(); - MetaData metaData = MetaData.initWithConfig(configuration); - version.initialize(null); - version.setMetaData(metaData); - version.setFiles(files); - processInputDump(); - } - - private void processInputDump() throws IOException { + public DataMachineGenerator(IEnvironmentFactory environmentFactory) + { + super(environmentFactory); + } - logger.log("parse input dumps..."); - new XML2Binary(decompressor.getInputStream(getPagesArticlesFile()), - files); + @Override + public void setFiles(Files files) + { + this.files = (DataMachineFiles) files; + } + @Override + public void start() throws Exception + { + version = environmentFactory.getDumpVersion(); + MetaData metaData = MetaData.initWithConfig(configuration); + version.initialize(null); + version.setMetaData(metaData); + version.setFiles(files); + processInputDump(); + } - dumpVersionProcessor.setDumpVersions(new IDumpVersion[]{version}); + private void processInputDump() throws IOException + { - logger.log("processing table page..."); - dumpVersionProcessor.processPage(createPageParser()); + logger.log("parse input dumps..."); + new XML2Binary(decompressor.getInputStream(getPagesArticlesFile()), files); - logger.log("processing table categorylinks..."); - dumpVersionProcessor.processCategorylinks(createCategorylinksParser()); + dumpVersionProcessor.setDumpVersions(new IDumpVersion[] { version }); - logger.log("processing table pagelinks..."); - dumpVersionProcessor.processPagelinks(createPagelinksParser()); + logger.log("processing table page..."); + dumpVersionProcessor.processPage(createPageParser()); - logger.log("processing table revision..."); - dumpVersionProcessor.processRevision(createRevisionParser()); + logger.log("processing table categorylinks..."); + dumpVersionProcessor.processCategorylinks(createCategorylinksParser()); - logger.log("processing table text..."); - 
dumpVersionProcessor.processText(createTextParser()); + logger.log("processing table pagelinks..."); + dumpVersionProcessor.processPagelinks(createPagelinksParser()); - logger.log("writing metadata..."); - dumpVersionProcessor.writeMetaData(); + logger.log("processing table revision..."); + dumpVersionProcessor.processRevision(createRevisionParser()); - logger.log("finished"); - } + logger.log("processing table text..."); + dumpVersionProcessor.processText(createTextParser()); - /** - * Parse either "pages-articles.xml" or "pages-meta-current.xml". If both - * files exist in the input directory "pages-meta-current.xml" will be - * favored. - * - * @return the input articles dump - */ - private String getPagesArticlesFile() { - String pagesArticlesFile = null; - String parseMessage = null; + logger.log("writing metadata..."); + dumpVersionProcessor.writeMetaData(); - //Use of minimal dump only with articles - if (files.getInputPagesArticles() != null) { - pagesArticlesFile = files.getInputPagesArticles(); - parseMessage = "Discussions are unavailable"; + logger.log("finished"); } - //Use of dump with discussions - if (files.getInputPagesMetaCurrent() != null) { - pagesArticlesFile = files.getInputPagesMetaCurrent(); - parseMessage = "Discussions are available"; + /** + * Parse either "pages-articles.xml" or "pages-meta-current.xml". If both files exist in the + * input directory "pages-meta-current.xml" will be favored. + * + * @return the input articles dump + */ + private String getPagesArticlesFile() + { + String pagesArticlesFile = null; + String parseMessage = null; + + // Use of minimal dump only with articles + if (files.getInputPagesArticles() != null) { + pagesArticlesFile = files.getInputPagesArticles(); + parseMessage = "Discussions are unavailable"; + } + + // Use of dump with discussions + if (files.getInputPagesMetaCurrent() != null) { + pagesArticlesFile = files.getInputPagesMetaCurrent(); + parseMessage = "Discussions are available"; + } + + logger.log(parseMessage); + return pagesArticlesFile; } - logger.log(parseMessage); - return pagesArticlesFile; - } + private PageParser createPageParser() throws IOException + { + String pageFile = files.getGeneratedPage(); - private PageParser createPageParser() throws IOException { - String pageFile = files.getGeneratedPage(); + DumpTableInputStream pageTableInputStream = environmentFactory.getDumpTableInputStream(); + pageTableInputStream.initialize(decompressor.getInputStream(pageFile), DumpTableEnum.PAGE); - DumpTableInputStream pageTableInputStream = environmentFactory.getDumpTableInputStream(); - pageTableInputStream.initialize(decompressor.getInputStream(pageFile), DumpTableEnum.PAGE); - - PageParser pageParser = environmentFactory.getPageParser(); - pageParser.setInputStream(pageTableInputStream); - return pageParser; - } + PageParser pageParser = environmentFactory.getPageParser(); + pageParser.setInputStream(pageTableInputStream); + return pageParser; + } - private CategorylinksParser createCategorylinksParser() throws IOException { - String categorylinksFile = files.getInputCategoryLinks(); - return new CategorylinksParser(decompressor.getInputStream(categorylinksFile)); - } + private CategorylinksParser createCategorylinksParser() throws IOException + { + String categorylinksFile = files.getInputCategoryLinks(); + return new CategorylinksParser(decompressor.getInputStream(categorylinksFile)); + } - private PagelinksParser createPagelinksParser() throws IOException { - String pagelinksFile = files.getInputPageLinks(); - 
return new PagelinksParser(decompressor.getInputStream(pagelinksFile)); - } + private PagelinksParser createPagelinksParser() throws IOException + { + String pagelinksFile = files.getInputPageLinks(); + return new PagelinksParser(decompressor.getInputStream(pagelinksFile)); + } - private RevisionParser createRevisionParser() throws IOException { - String revisionFile = files.getGeneratedRevision(); + private RevisionParser createRevisionParser() throws IOException + { + String revisionFile = files.getGeneratedRevision(); - DumpTableInputStream revisionTableInputStream = environmentFactory.getDumpTableInputStream(); - revisionTableInputStream.initialize(decompressor.getInputStream(revisionFile), DumpTableEnum.REVISION); + DumpTableInputStream revisionTableInputStream = environmentFactory + .getDumpTableInputStream(); + revisionTableInputStream.initialize(decompressor.getInputStream(revisionFile), + DumpTableEnum.REVISION); - RevisionParser revisionParser = environmentFactory.getRevisionParser(); - revisionParser.setInputStream(revisionTableInputStream); - return revisionParser; - } + RevisionParser revisionParser = environmentFactory.getRevisionParser(); + revisionParser.setInputStream(revisionTableInputStream); + return revisionParser; + } - private TextParser createTextParser() throws IOException { - String textFile = files.getGeneratedText(); + private TextParser createTextParser() throws IOException + { + String textFile = files.getGeneratedText(); - DumpTableInputStream textTableInputStream = environmentFactory.getDumpTableInputStream(); - textTableInputStream.initialize(decompressor.getInputStream(textFile), DumpTableEnum.TEXT); + DumpTableInputStream textTableInputStream = environmentFactory.getDumpTableInputStream(); + textTableInputStream.initialize(decompressor.getInputStream(textFile), DumpTableEnum.TEXT); - TextParser textParser = environmentFactory.getTextParser(); - textParser.setInputStream(textTableInputStream); - return textParser; - } + TextParser textParser = environmentFactory.getTextParser(); + textParser.setInputStream(textTableInputStream); + return textParser; + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/JWPLDataMachine.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/JWPLDataMachine.java index c2ee8b82..3aa18d4c 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/JWPLDataMachine.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/domain/JWPLDataMachine.java @@ -26,66 +26,67 @@ /** * Starts the transformation from Mediawiki dump format to JWPL dump format. 
*/ -public class JWPLDataMachine { +public class JWPLDataMachine +{ - private static final int LANG_ARG = 0; - private static final int MAINCATEGORY_ARG = 1; - private static final int DISAMBIGUATION_ARG = 2; - private static final int DATADIR_ARG = 3; + private static final int LANG_ARG = 0; + private static final int MAINCATEGORY_ARG = 1; + private static final int DISAMBIGUATION_ARG = 2; + private static final int DATADIR_ARG = 3; - private static final String USAGE = "Please use\n" - + "\tjava -jar JWPLDataMachine.jar <LANGUAGE> <TOP_CATEGORY_NAME> <DISAMBIGUATION_CATEGORY_NAME> <SOURCE_DIRECTORY>\n\n" - + "The source directory must contain files\n" - + "\tpagelinks.sql\n" - + "\tpages-articles.xml\n" - + "\tcategorylinks.sql\n" - + "GZip or BZip2 compressed archives of above-named files are also allowed.\n" - + "Please set up a decompressor.xml for a usage of other external archive utilities (see documentation for more help).\n"; + private static final String USAGE = "Please use\n" + + "\tjava -jar JWPLDataMachine.jar <LANGUAGE> <TOP_CATEGORY_NAME> <DISAMBIGUATION_CATEGORY_NAME> <SOURCE_DIRECTORY>\n\n" + + "The source directory must contain files\n" + "\tpagelinks.sql\n" + + "\tpages-articles.xml\n" + "\tcategorylinks.sql\n" + + "GZip or BZip2 compressed archives of above-named files are also allowed.\n" + + "Please set up a decompressor.xml for a usage of other external archive utilities (see documentation for more help).\n"; - private static final long startTime = System.currentTimeMillis(); + private static final long startTime = System.currentTimeMillis(); - private static final IEnvironmentFactory environmentFactory = SpringFactory.getInstance(); + private static final IEnvironmentFactory environmentFactory = SpringFactory.getInstance(); - private static final ILogger logger = environmentFactory.getLogger(); + private static final ILogger logger = environmentFactory.getLogger(); - public static void main(String[] args) { - if (args.length > 3) { - Configuration config = getConfigFromArgs(args); - DataMachineFiles files = new DataMachineFiles(logger); - files.setDataDirectory(args[DATADIR_ARG]); - if (files.checkAll()) { - try { + public static void main(String[] args) + { + if (args.length > 3) { + Configuration config = getConfigFromArgs(args); + DataMachineFiles files = new DataMachineFiles(logger); + files.setDataDirectory(args[DATADIR_ARG]); + if (files.checkAll()) { + try { - ISnapshotGenerator generator = environmentFactory - .getSnapshotGenerator(); - generator.setConfiguration(config); - generator.setFiles(files); - generator.start(); + ISnapshotGenerator generator = environmentFactory.getSnapshotGenerator(); + generator.setConfiguration(config); + generator.setFiles(files); + generator.start(); - logger.log("End of the application. Working time = " - + (System.currentTimeMillis() - - startTime) + " ms"); - } catch (Exception e) { - logger.log(e); + logger.log("End of the application. 
Working time = " + + (System.currentTimeMillis() - startTime) + " ms"); + } + catch (Exception e) { + logger.log(e); + } + } + else { + logger.log("Not all necessary source files could be found in " + args[DATADIR_ARG]); + } + + } + else { + System.out.println(USAGE); } - } else { - logger.log("Not all necessary source files could be found in " - + args[DATADIR_ARG]); - } - } else { - System.out.println(USAGE); } - } + private static Configuration getConfigFromArgs(String[] args) + { + Configuration config = new Configuration(logger); + config.setLanguage(args[LANG_ARG]); + config.setMainCategory(args[MAINCATEGORY_ARG]); + config.setDisambiguationCategory(args[DISAMBIGUATION_ARG]); - private static Configuration getConfigFromArgs(String[] args) { - Configuration config = new Configuration(logger); - config.setLanguage(args[LANG_ARG]); - config.setMainCategory(args[MAINCATEGORY_ARG]); - config.setDisambiguationCategory(args[DISAMBIGUATION_ARG]); - - return config; - } + return config; + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKGeneric.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKGeneric.java index 62470f55..02e8ade4 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKGeneric.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKGeneric.java @@ -36,216 +36,236 @@ import org.dkpro.jwpl.wikimachine.util.TxtFileWriter; public class SingleDumpVersionJDKGeneric<KeyType, HashAlgorithm extends IStringHashCode> - extends AbstractDumpVersion { - - private static final String SQL_NULL = "NULL"; - //TODO This constant is used to flag page titles of discussion pages. 
- // Is also defined in wikipedia.api:WikiConstants.DISCUSSION_PREFIX - // It just doesn't make sense to add a dependency just for the constant - private static final String DISCUSSION_PREFIX = "Discussion:"; - - private Map<Integer, String> pPageIdNameMap; - private Set<Integer> cPageIdNameMap; - private Map<KeyType, Integer> pNamePageIdMap; - private Map<KeyType, Integer> cNamePageIdMap; - private Map<Integer, String> rPageIdNameMap; - private Set<Integer> disambiguations; - private Map<Integer, Integer> textIdPageIdMap; - - IStringHashCode hashAlgorithm; - - @SuppressWarnings("unchecked") - public SingleDumpVersionJDKGeneric(Class<HashAlgorithm> hashAlgorithmClass) - throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException { - - hashAlgorithm = hashAlgorithmClass.getDeclaredConstructor().newInstance(); - @SuppressWarnings("unused") - KeyType hashAlgorithmResult = (KeyType) hashAlgorithm.hashCode("test"); - } - - @Override - public void freeAfterCategoryLinksParsing() { - cPageIdNameMap.clear(); - cNamePageIdMap.clear(); - } - - @Override - public void freeAfterPageLinksParsing() { - // nothing to free - - } - - @Override - public void freeAfterPageParsing() { - metaData.setNrOfCategories(cPageIdNameMap.size()); - metaData.setNrOfPages(pPageIdNameMap.keySet().size() + rPageIdNameMap.keySet().size()); - System.out.println("nrOfCategories: " + metaData.getNrOfCategories()); - System.out.println("nrOfPage: " + metaData.getNrOfPages()); - System.out.println("nrOfRedirects before testing the validity of the destination:" + rPageIdNameMap.size()); - } - - @Override - public void freeAfterRevisonParsing() { - // nothing to free - } - - @Override - public void freeAfterTextParsing() { - pPageIdNameMap.clear(); - cPageIdNameMap.clear(); - pNamePageIdMap.clear(); - cNamePageIdMap.clear(); - rPageIdNameMap.clear(); - disambiguations.clear(); - textIdPageIdMap.clear(); - } - - @Override - public void initialize(Timestamp timestamp) { - pPageIdNameMap = new HashMap<>(1_000_000); - cPageIdNameMap = new HashSet<>(1_000_000); - pNamePageIdMap = new HashMap<>(1_000_000); - cNamePageIdMap = new HashMap<>(1_000_000); - rPageIdNameMap = new HashMap<>(1_000_000); - disambiguations = new HashSet<>(1_000_000); - textIdPageIdMap = new HashMap<>(1_000_000); - } - - @SuppressWarnings("unchecked") - @Override - public void processCategoryLinksRow(CategorylinksParser clParser) - throws IOException { - String cl_to = clParser.getClTo(); - - if (cl_to != null) { - KeyType clToHash = (KeyType) hashAlgorithm.hashCode(cl_to); - - Integer cl_toValue = cNamePageIdMap.get(clToHash); - - if (cl_toValue != null) { - int cl_from = clParser.getClFrom(); - - if (pPageIdNameMap.containsKey(cl_from)) { - categoryPages.addRow(cl_toValue, cl_from); - pageCategories.addRow(cl_from, cl_toValue); - - if (cl_to.equals(metaData.getDisambiguationCategory())) { - disambiguations.add(cl_from); - metaData.addDisamb(); - } - } else if (cPageIdNameMap.contains(cl_from)) { - categoryOutlinks.addRow(cl_toValue, cl_from); - categoryInlinks.addRow(cl_from, cl_toValue); - } + extends AbstractDumpVersion +{ + + private static final String SQL_NULL = "NULL"; + // TODO This constant is used to flag page titles of discussion pages. 
+ // Is also defined in wikipedia.api:WikiConstants.DISCUSSION_PREFIX + // It just doesn't make sense to add a dependency just for the constant + private static final String DISCUSSION_PREFIX = "Discussion:"; + + private Map<Integer, String> pPageIdNameMap; + private Set<Integer> cPageIdNameMap; + private Map<KeyType, Integer> pNamePageIdMap; + private Map<KeyType, Integer> cNamePageIdMap; + private Map<Integer, String> rPageIdNameMap; + private Set<Integer> disambiguations; + private Map<Integer, Integer> textIdPageIdMap; + + IStringHashCode hashAlgorithm; + + @SuppressWarnings("unchecked") + public SingleDumpVersionJDKGeneric(Class<HashAlgorithm> hashAlgorithmClass) + throws InstantiationException, IllegalAccessException, NoSuchMethodException, + InvocationTargetException + { + + hashAlgorithm = hashAlgorithmClass.getDeclaredConstructor().newInstance(); + @SuppressWarnings("unused") + KeyType hashAlgorithmResult = (KeyType) hashAlgorithm.hashCode("test"); + } + + @Override + public void freeAfterCategoryLinksParsing() + { + cPageIdNameMap.clear(); + cNamePageIdMap.clear(); + } + + @Override + public void freeAfterPageLinksParsing() + { + // nothing to free - } - } else { - throw new IOException("Parsin error." + CategorylinksParser.class.getName() + - " returned null value in " + this.getClass().getName()); } - } - - @SuppressWarnings("unchecked") - @Override - public void processPageLinksRow(PagelinksParser plParser) - throws IOException { - int pl_from = plParser.getPlFrom(); - String pl_to = plParser.getPlTo(); - if (pl_to != null) { - KeyType plToHash = (KeyType) hashAlgorithm.hashCode(pl_to); - Integer pl_toValue = pNamePageIdMap.get(plToHash); - // skip redirects if skipPage is enabled - if ((!skipPage || pPageIdNameMap.containsKey(pl_from)) && pl_toValue != null) { - pageOutlinks.addRow(pl_from, pl_toValue); - pageInlinks.addRow(pl_toValue, pl_from); - } + + @Override + public void freeAfterPageParsing() + { + metaData.setNrOfCategories(cPageIdNameMap.size()); + metaData.setNrOfPages(pPageIdNameMap.keySet().size() + rPageIdNameMap.keySet().size()); + System.out.println("nrOfCategories: " + metaData.getNrOfCategories()); + System.out.println("nrOfPage: " + metaData.getNrOfPages()); + System.out.println("nrOfRedirects before testing the validity of the destination:" + + rPageIdNameMap.size()); + } + + @Override + public void freeAfterRevisonParsing() + { + // nothing to free + } + + @Override + public void freeAfterTextParsing() + { + pPageIdNameMap.clear(); + cPageIdNameMap.clear(); + pNamePageIdMap.clear(); + cNamePageIdMap.clear(); + rPageIdNameMap.clear(); + disambiguations.clear(); + textIdPageIdMap.clear(); } - } - - @SuppressWarnings("unchecked") - @Override - public void processPageRow(PageParser pageParser) throws IOException { - int page_namespace = pageParser.getPageNamespace(); - int page_id = pageParser.getPageId(); - String page_title = pageParser.getPageTitle(); - if (page_title != null) { - switch (page_namespace) { - case NS_CATEGORY: { - // skip redirect categories if skipCategory is enabled - if (!(skipCategory && pageParser.getPageIsRedirect())) { - cPageIdNameMap.add(page_id); - cNamePageIdMap.put((KeyType) hashAlgorithm.hashCode(page_title), page_id); - txtFW.addRow(page_id, page_id, page_title); - } - break; + + @Override + public void initialize(Timestamp timestamp) + { + pPageIdNameMap = new HashMap<>(1_000_000); + cPageIdNameMap = new HashSet<>(1_000_000); + pNamePageIdMap = new HashMap<>(1_000_000); + cNamePageIdMap = new HashMap<>(1_000_000); + 
rPageIdNameMap = new HashMap<>(1_000_000); + disambiguations = new HashSet<>(1_000_000); + textIdPageIdMap = new HashMap<>(1_000_000); + } + + @SuppressWarnings("unchecked") + @Override + public void processCategoryLinksRow(CategorylinksParser clParser) throws IOException + { + String cl_to = clParser.getClTo(); + + if (cl_to != null) { + KeyType clToHash = (KeyType) hashAlgorithm.hashCode(cl_to); + + Integer cl_toValue = cNamePageIdMap.get(clToHash); + + if (cl_toValue != null) { + int cl_from = clParser.getClFrom(); + + if (pPageIdNameMap.containsKey(cl_from)) { + categoryPages.addRow(cl_toValue, cl_from); + pageCategories.addRow(cl_from, cl_toValue); + + if (cl_to.equals(metaData.getDisambiguationCategory())) { + disambiguations.add(cl_from); + metaData.addDisamb(); + } + } + else if (cPageIdNameMap.contains(cl_from)) { + categoryOutlinks.addRow(cl_toValue, cl_from); + categoryInlinks.addRow(cl_from, cl_toValue); + } + + } + } + else { + throw new IOException("Parsin error." + CategorylinksParser.class.getName() + + " returned null value in " + this.getClass().getName()); } + } - case NS_TALK: { - page_title = DISCUSSION_PREFIX + page_title; - //the NS_MAIN block will also be executed - //for NS_TALK pages ... + @SuppressWarnings("unchecked") + @Override + public void processPageLinksRow(PagelinksParser plParser) throws IOException + { + int pl_from = plParser.getPlFrom(); + String pl_to = plParser.getPlTo(); + if (pl_to != null) { + KeyType plToHash = (KeyType) hashAlgorithm.hashCode(pl_to); + Integer pl_toValue = pNamePageIdMap.get(plToHash); + // skip redirects if skipPage is enabled + if ((!skipPage || pPageIdNameMap.containsKey(pl_from)) && pl_toValue != null) { + pageOutlinks.addRow(pl_from, pl_toValue); + pageInlinks.addRow(pl_toValue, pl_from); + } } + } - case NS_MAIN: { - if (pageParser.getPageIsRedirect()) { - rPageIdNameMap.put(page_id, page_title); - } else { - pPageIdNameMap.put(page_id, page_title); - pNamePageIdMap.put((KeyType) hashAlgorithm.hashCode(page_title), page_id); - } - break; + @SuppressWarnings("unchecked") + @Override + public void processPageRow(PageParser pageParser) throws IOException + { + int page_namespace = pageParser.getPageNamespace(); + int page_id = pageParser.getPageId(); + String page_title = pageParser.getPageTitle(); + if (page_title != null) { + switch (page_namespace) { + case NS_CATEGORY: { + // skip redirect categories if skipCategory is enabled + if (!(skipCategory && pageParser.getPageIsRedirect())) { + cPageIdNameMap.add(page_id); + cNamePageIdMap.put((KeyType) hashAlgorithm.hashCode(page_title), page_id); + txtFW.addRow(page_id, page_id, page_title); + } + break; + } + + case NS_TALK: { + page_title = DISCUSSION_PREFIX + page_title; + // the NS_MAIN block will also be executed + // for NS_TALK pages ... 
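+                // (there is intentionally no break here, so execution falls through)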
+ } + + case NS_MAIN: { + if (pageParser.getPageIsRedirect()) { + rPageIdNameMap.put(page_id, page_title); + } + else { + pPageIdNameMap.put(page_id, page_title); + pNamePageIdMap.put((KeyType) hashAlgorithm.hashCode(page_title), page_id); + } + break; + } + } } - } } - } - - @Override - public void processRevisionRow(RevisionParser revisionParser) { - textIdPageIdMap.put(revisionParser.getRevTextId(), revisionParser.getRevPage()); - } - - @SuppressWarnings("unchecked") - @Override - public void processTextRow(TextParser textParser) throws IOException { - int text_id = textParser.getOldId(); - if (textIdPageIdMap.containsKey(text_id)) { - - int page_id = textIdPageIdMap.get(text_id); - String page_idValueP = pPageIdNameMap.get(page_id); - if (page_idValueP != null) {// pages - page.addRow(page_id, page_id, page_idValueP, textParser.getOldText(), formatBoolean(disambiguations - .contains(page_id))); - pageMapLine.addRow(page_id, page_idValueP, page_id, SQL_NULL, SQL_NULL); - - } else { - String page_idValueR = rPageIdNameMap.get(page_id); - if (page_idValueR != null) {// Redirects - String destination = Redirects.getRedirectDestination(textParser.getOldText()); - if (destination != null) { - KeyType destinationHash = (KeyType) hashAlgorithm.hashCode(destination); - Integer destinationValue = pNamePageIdMap.get(destinationHash); - if (destinationValue != null) { - - pageRedirects.addRow(destinationValue, page_idValueR); - pageMapLine.addRow(page_id, page_idValueR, destinationValue, SQL_NULL, SQL_NULL); - metaData.addRedirect(); + + @Override + public void processRevisionRow(RevisionParser revisionParser) + { + textIdPageIdMap.put(revisionParser.getRevTextId(), revisionParser.getRevPage()); + } + + @SuppressWarnings("unchecked") + @Override + public void processTextRow(TextParser textParser) throws IOException + { + int text_id = textParser.getOldId(); + if (textIdPageIdMap.containsKey(text_id)) { + + int page_id = textIdPageIdMap.get(text_id); + String page_idValueP = pPageIdNameMap.get(page_id); + if (page_idValueP != null) {// pages + page.addRow(page_id, page_id, page_idValueP, textParser.getOldText(), + formatBoolean(disambiguations.contains(page_id))); + pageMapLine.addRow(page_id, page_idValueP, page_id, SQL_NULL, SQL_NULL); + + } + else { + String page_idValueR = rPageIdNameMap.get(page_id); + if (page_idValueR != null) {// Redirects + String destination = Redirects.getRedirectDestination(textParser.getOldText()); + if (destination != null) { + KeyType destinationHash = (KeyType) hashAlgorithm.hashCode(destination); + Integer destinationValue = pNamePageIdMap.get(destinationHash); + if (destinationValue != null) { + + pageRedirects.addRow(destinationValue, page_idValueR); + pageMapLine.addRow(page_id, page_idValueR, destinationValue, SQL_NULL, + SQL_NULL); + metaData.addRedirect(); + } + } + } } - } } - } + } - } - - @Override - public void writeMetaData() throws IOException { - TxtFileWriter outputFile = new TxtFileWriter(versionFiles.getOutputMetadata()); - // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories - outputFile.addRow(metaData.getId(), metaData.getLanguage(), metaData.getDisambiguationCategory(), metaData - .getMainCategory(), metaData.getNrOfPages(), metaData - .getNrOfRedirects(), metaData.getNrOfDisambiguations(), metaData.getNrOfCategories()); - outputFile.flush(); - outputFile.close(); - } + @Override + public void writeMetaData() throws IOException + { + TxtFileWriter outputFile = new 
TxtFileWriter(versionFiles.getOutputMetadata()); + // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories + outputFile.addRow(metaData.getId(), metaData.getLanguage(), + metaData.getDisambiguationCategory(), metaData.getMainCategory(), + metaData.getNrOfPages(), metaData.getNrOfRedirects(), + metaData.getNrOfDisambiguations(), metaData.getNrOfCategories()); + outputFile.flush(); + outputFile.close(); + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKIntKeyFactory.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKIntKeyFactory.java index 7a88504b..80a10872 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKIntKeyFactory.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKIntKeyFactory.java @@ -21,16 +21,21 @@ import org.dkpro.jwpl.wikimachine.dump.version.IDumpVersionFactory; import org.dkpro.jwpl.wikimachine.hashing.StringHashCodeJDK; -public class SingleDumpVersionJDKIntKeyFactory implements IDumpVersionFactory { +public class SingleDumpVersionJDKIntKeyFactory + implements IDumpVersionFactory +{ - @Override - public IDumpVersion getDumpVersion() { - IDumpVersion dumpVersion; - try { - dumpVersion = new SingleDumpVersionJDKGeneric<Integer, StringHashCodeJDK>(StringHashCodeJDK.class); - } catch (Exception e) { - dumpVersion = null; + @Override + public IDumpVersion getDumpVersion() + { + IDumpVersion dumpVersion; + try { + dumpVersion = new SingleDumpVersionJDKGeneric<Integer, StringHashCodeJDK>( + StringHashCodeJDK.class); + } + catch (Exception e) { + dumpVersion = null; + } + return dumpVersion; } - return dumpVersion; - } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKLongKeyFactory.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKLongKeyFactory.java index 542aeba5..be5596b1 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKLongKeyFactory.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKLongKeyFactory.java @@ -21,17 +21,22 @@ import org.dkpro.jwpl.wikimachine.dump.version.IDumpVersionFactory; import org.dkpro.jwpl.wikimachine.hashing.StringHashCodeJBoss; -public class SingleDumpVersionJDKLongKeyFactory implements IDumpVersionFactory { +public class SingleDumpVersionJDKLongKeyFactory + implements IDumpVersionFactory +{ - @Override - public IDumpVersion getDumpVersion() { - IDumpVersion dumpVersion; - try { - dumpVersion = new SingleDumpVersionJDKGeneric<Long, StringHashCodeJBoss>(StringHashCodeJBoss.class); - } catch (Exception e) { - dumpVersion = null; - } - return dumpVersion; - } + @Override + public IDumpVersion getDumpVersion() + { + IDumpVersion dumpVersion; + try { + dumpVersion = new SingleDumpVersionJDKGeneric<Long, StringHashCodeJBoss>( + StringHashCodeJBoss.class); + } + catch (Exception e) { + dumpVersion = null; + } + return dumpVersion; + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKStringKeyFactory.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKStringKeyFactory.java index 0c414f7d..6187def6 100644 --- 
a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKStringKeyFactory.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionJDKStringKeyFactory.java @@ -21,17 +21,22 @@ import org.dkpro.jwpl.wikimachine.dump.version.IDumpVersionFactory; import org.dkpro.jwpl.wikimachine.hashing.StringHashCodeDisabled; -public class SingleDumpVersionJDKStringKeyFactory implements IDumpVersionFactory { +public class SingleDumpVersionJDKStringKeyFactory + implements IDumpVersionFactory +{ - @Override - public IDumpVersion getDumpVersion() { - IDumpVersion dumpVersion; - try { - dumpVersion = new SingleDumpVersionJDKGeneric<String, StringHashCodeDisabled>(StringHashCodeDisabled.class); - } catch (Exception e) { - dumpVersion = null; - } - return dumpVersion; - } + @Override + public IDumpVersion getDumpVersion() + { + IDumpVersion dumpVersion; + try { + dumpVersion = new SingleDumpVersionJDKGeneric<String, StringHashCodeDisabled>( + StringHashCodeDisabled.class); + } + catch (Exception e) { + dumpVersion = null; + } + return dumpVersion; + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionOriginal.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionOriginal.java index 51c297bf..0105f328 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionOriginal.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/version/SingleDumpVersionOriginal.java @@ -47,350 +47,382 @@ * <p> * Adopted to IDumpVersion by Galkin */ -public class SingleDumpVersionOriginal implements IDumpVersion { - - // metadata - private String language; - private String mainCategory; - private String disambiguationsCategory; - - // statistics - private int nrOfDisambiguations = 0; - private int nrOfPages = 0; - private int nrOfCategories = 0; - private int nrOfRedirects = 0; - - private Map<Integer, String> pPageIdNameMap;// maps page id's of pages to - // their names - private Map<Integer, String> cPageIdNameMap;// maps page id's of categories - // to their names - private Map<String, Integer> pNamePageIdMap;// maps names of pages to their - // page id's. - private Map<String, Integer> cNamePageIdMap;// maps names of categories to - // their page id's. - private Map<Integer, String> rPageIdNameMap;// maps page id's of redirects - // to their names. - private IntSet disambiguations; // caches the page id's of - // disambiguation pages. - private Int2IntOpenHashMap textIdPageIdMap;// maps text id's to the page - - // id's. - - // galkin: moved from local variables to fields - private TxtFileWriter txtFW; - private TxtFileWriter pageCategories; - private TxtFileWriter categoryPages; - private TxtFileWriter categoryInlinks; - private TxtFileWriter categoryOutlinks; - private TxtFileWriter pageInlinks; - private TxtFileWriter pageOutlinks; - private TxtFileWriter page; - private TxtFileWriter pageMapLine; - private TxtFileWriter pageRedirects; - private String outputDir; - - // galkin: added - - private ILogger logger; - private boolean skipPage = true; - private boolean skipCategory = true; - - /** - * Returns the String value of the bit 1 if the given boolean is true<br> - * and an empty String otherwise. This the way bit values are written<br> - * in .txt dump files. - * - * @param b - * @return - */ - private String formatBoolean(boolean b) { - return b ? 
new String(new byte[]{1}) : ""; - } - - @Override - public void exportAfterCategoryLinksParsing() throws IOException { - pageCategories.export(); - categoryPages.export(); - categoryInlinks.export(); - categoryOutlinks.export(); - } - - @Override - public void exportAfterPageLinksParsing() throws IOException { - pageInlinks.export(); - pageOutlinks.export(); - } - - @Override - public void exportAfterPageParsing() throws IOException { - txtFW.export(); - - nrOfCategories = cPageIdNameMap.keySet().size(); - nrOfPages = pPageIdNameMap.keySet().size() - + rPageIdNameMap.keySet().size(); - } - - @Override - public void exportAfterRevisionParsing() throws IOException { - } - - @Override - public void exportAfterTextParsing() throws IOException { - page.export(); - pageRedirects.export(); - pageMapLine.export(); - } - - @Override - public void flushByTextParsing() throws IOException { - page.flush(); - pageRedirects.flush(); - pageMapLine.flush(); - } - - @Override - public void freeAfterCategoryLinksParsing() { - - } - - @Override - public void freeAfterPageLinksParsing() { - - } - - @Override - public void freeAfterPageParsing() { - - } - - @Override - public void freeAfterRevisonParsing() { - } - - @Override - public void freeAfterTextParsing() { - page.export(); - pageRedirects.export(); - pageMapLine.export(); - } - - @Override - public void initCategoryLinksParsing() throws IOException { - pageCategories = new TxtFileWriter(outputDir + File.separator + "page_categories.txt"); - categoryPages = new TxtFileWriter(outputDir + File.separator + "category_pages.txt"); - categoryInlinks = new TxtFileWriter(outputDir + File.separator + "category_inlinks.txt"); - categoryOutlinks = new TxtFileWriter(outputDir + File.separator + "category_outlinks.txt"); - } - - @Override - public void initPageLinksParsing() throws IOException { - pageInlinks = new TxtFileWriter(outputDir + File.separator + "page_inlinks.txt"); - pageOutlinks = new TxtFileWriter(outputDir + File.separator + "page_outlinks.txt"); - } - - @Override - public void initPageParsing() throws IOException { - txtFW = new TxtFileWriter(outputDir + File.separator + "Category.txt"); - - } - - @Override - public void initRevisionParsion() { - - } - - @Override - public void initTextParsing() throws IOException { - page = new TxtFileWriter(outputDir + File.separator + "Page.txt"); - pageMapLine = new TxtFileWriter(outputDir + File.separator + "PageMapLine.txt"); - pageRedirects = new TxtFileWriter(outputDir + File.separator + "page_redirects.txt"); - - } - - @Override - public void initialize(Timestamp timestamp) { - this.pPageIdNameMap = new HashMap<>(); - this.cPageIdNameMap = new HashMap<>(); - this.pNamePageIdMap = new HashMap<>(); - this.cNamePageIdMap = new HashMap<>(); - this.rPageIdNameMap = new HashMap<>(); - this.disambiguations = new IntArraySet(); - this.textIdPageIdMap = new Int2IntOpenHashMap(); - - } - - @Override - public void processCategoryLinksRow(CategorylinksParser clParser) throws IOException { - - int cl_from; - String cl_to; - - cl_from = clParser.getClFrom(); - cl_to = clParser.getClTo(); - if (!cNamePageIdMap.containsKey(cl_to)) { - // discard links with non-registered targets - return; +public class SingleDumpVersionOriginal + implements IDumpVersion +{ + + // metadata + private String language; + private String mainCategory; + private String disambiguationsCategory; + + // statistics + private int nrOfDisambiguations = 0; + private int nrOfPages = 0; + private int nrOfCategories = 0; + private int nrOfRedirects = 0; + 
+ private Map<Integer, String> pPageIdNameMap;// maps page id's of pages to + // their names + private Map<Integer, String> cPageIdNameMap;// maps page id's of categories + // to their names + private Map<String, Integer> pNamePageIdMap;// maps names of pages to their + // page id's. + private Map<String, Integer> cNamePageIdMap;// maps names of categories to + // their page id's. + private Map<Integer, String> rPageIdNameMap;// maps page id's of redirects + // to their names. + private IntSet disambiguations; // caches the page id's of + // disambiguation pages. + private Int2IntOpenHashMap textIdPageIdMap;// maps text id's to the page + + // id's. + + // galkin: moved from local variables to fields + private TxtFileWriter txtFW; + private TxtFileWriter pageCategories; + private TxtFileWriter categoryPages; + private TxtFileWriter categoryInlinks; + private TxtFileWriter categoryOutlinks; + private TxtFileWriter pageInlinks; + private TxtFileWriter pageOutlinks; + private TxtFileWriter page; + private TxtFileWriter pageMapLine; + private TxtFileWriter pageRedirects; + private String outputDir; + + // galkin: added + + private ILogger logger; + private boolean skipPage = true; + private boolean skipCategory = true; + + /** + * Returns the String value of the bit 1 if the given boolean is true<br> + * and an empty String otherwise. This the way bit values are written<br> + * in .txt dump files. + * + * @param b + * @return + */ + private String formatBoolean(boolean b) + { + return b ? new String(new byte[] { 1 }) : ""; } - // if the link source is a page then write the link in - // category_pages and - // page_categories - if (pPageIdNameMap.containsKey(cl_from)) { - categoryPages.addRow(cNamePageIdMap.get(cl_to), cl_from); - pageCategories.addRow(cl_from, cNamePageIdMap.get(cl_to)); - if (cl_to.equals(disambiguationsCategory)) { - disambiguations.add(cl_from); - nrOfDisambiguations++; - } - } else { - // if the link source is a category than write the link in - // category_inlinks and category_outlinks - if (cPageIdNameMap.containsKey(cl_from)) { - categoryOutlinks.addRow(cNamePageIdMap.get(cl_to), cl_from); - categoryInlinks.addRow(cl_from, cNamePageIdMap.get(cl_to)); - } + + @Override + public void exportAfterCategoryLinksParsing() throws IOException + { + pageCategories.export(); + categoryPages.export(); + categoryInlinks.export(); + categoryOutlinks.export(); + } + + @Override + public void exportAfterPageLinksParsing() throws IOException + { + pageInlinks.export(); + pageOutlinks.export(); + } + + @Override + public void exportAfterPageParsing() throws IOException + { + txtFW.export(); + + nrOfCategories = cPageIdNameMap.keySet().size(); + nrOfPages = pPageIdNameMap.keySet().size() + rPageIdNameMap.keySet().size(); + } + + @Override + public void exportAfterRevisionParsing() throws IOException + { + } + + @Override + public void exportAfterTextParsing() throws IOException + { + page.export(); + pageRedirects.export(); + pageMapLine.export(); + } + + @Override + public void flushByTextParsing() throws IOException + { + page.flush(); + pageRedirects.flush(); + pageMapLine.flush(); + } + + @Override + public void freeAfterCategoryLinksParsing() + { + + } + + @Override + public void freeAfterPageLinksParsing() + { + + } + + @Override + public void freeAfterPageParsing() + { + + } + + @Override + public void freeAfterRevisonParsing() + { + } + + @Override + public void freeAfterTextParsing() + { + page.export(); + pageRedirects.export(); + pageMapLine.export(); + } + + @Override + public 
void initCategoryLinksParsing() throws IOException + { + pageCategories = new TxtFileWriter(outputDir + File.separator + "page_categories.txt"); + categoryPages = new TxtFileWriter(outputDir + File.separator + "category_pages.txt"); + categoryInlinks = new TxtFileWriter(outputDir + File.separator + "category_inlinks.txt"); + categoryOutlinks = new TxtFileWriter(outputDir + File.separator + "category_outlinks.txt"); } - } + @Override + public void initPageLinksParsing() throws IOException + { + pageInlinks = new TxtFileWriter(outputDir + File.separator + "page_inlinks.txt"); + pageOutlinks = new TxtFileWriter(outputDir + File.separator + "page_outlinks.txt"); + } - @Override - public void processPageLinksRow(PagelinksParser plParser) throws IOException { + @Override + public void initPageParsing() throws IOException + { + txtFW = new TxtFileWriter(outputDir + File.separator + "Category.txt"); - int pl_from; - String pl_to; + } - pl_from = plParser.getPlFrom(); - pl_to = plParser.getPlTo(); - // skip redirects or page with other namespace than 0 + @Override + public void initRevisionParsion() + { - if (skipPage && !pPageIdNameMap.containsKey(pl_from) || !pNamePageIdMap.containsKey(pl_to)) { - return; } - pageOutlinks.addRow(pl_from, pNamePageIdMap.get(pl_to)); - pageInlinks.addRow(pNamePageIdMap.get(pl_to), pl_from); - } - - @Override - public void processPageRow(PageParser pageParser) throws IOException { - - int page_id; - int page_namespace; - String page_title; - - page_namespace = pageParser.getPageNamespace(); - // handle categories - if (page_namespace == 14) { - if (skipCategory) { - if (pageParser.getPageIsRedirect()) - // skip categories that are redirects - return; - } - // retrieve page id and page title - page_id = pageParser.getPageId(); - page_title = pageParser.getPageTitle(); - if (page_title.equals(disambiguationsCategory)) { - logger.log("Disambiguations Category found: " + page_title); - } - if (page_title.equals(mainCategory)) { - logger.log("Main Category found: " + page_title); - } - // cache the retrieved values - cPageIdNameMap.put(page_id, page_title); - cNamePageIdMap.put(page_title, page_id); - // write a new row in the table Category. 
- // Note that we also consider the page_id as id - txtFW.addRow(page_id, page_id, page_title); - return; + @Override + public void initTextParsing() throws IOException + { + page = new TxtFileWriter(outputDir + File.separator + "Page.txt"); + pageMapLine = new TxtFileWriter(outputDir + File.separator + "PageMapLine.txt"); + pageRedirects = new TxtFileWriter(outputDir + File.separator + "page_redirects.txt"); + } - // handle pages - if (page_namespace == 0) { - // retrieve page id and title - page_id = pageParser.getPageId(); - page_title = pageParser.getPageTitle(); - // distinguish redirects - if (pageParser.getPageIsRedirect()) { - rPageIdNameMap.put(page_id, page_title); - } else { - pPageIdNameMap.put(page_id, page_title); - pNamePageIdMap.put(page_title, page_id); - } + + @Override + public void initialize(Timestamp timestamp) + { + this.pPageIdNameMap = new HashMap<>(); + this.cPageIdNameMap = new HashMap<>(); + this.pNamePageIdMap = new HashMap<>(); + this.cNamePageIdMap = new HashMap<>(); + this.rPageIdNameMap = new HashMap<>(); + this.disambiguations = new IntArraySet(); + this.textIdPageIdMap = new Int2IntOpenHashMap(); + + } + + @Override + public void processCategoryLinksRow(CategorylinksParser clParser) throws IOException + { + + int cl_from; + String cl_to; + + cl_from = clParser.getClFrom(); + cl_to = clParser.getClTo(); + if (!cNamePageIdMap.containsKey(cl_to)) { + // discard links with non-registered targets + return; + } + // if the link source is a page then write the link in + // category_pages and + // page_categories + if (pPageIdNameMap.containsKey(cl_from)) { + categoryPages.addRow(cNamePageIdMap.get(cl_to), cl_from); + pageCategories.addRow(cl_from, cNamePageIdMap.get(cl_to)); + if (cl_to.equals(disambiguationsCategory)) { + disambiguations.add(cl_from); + nrOfDisambiguations++; + } + } + else { + // if the link source is a category than write the link in + // category_inlinks and category_outlinks + if (cPageIdNameMap.containsKey(cl_from)) { + categoryOutlinks.addRow(cNamePageIdMap.get(cl_to), cl_from); + categoryInlinks.addRow(cl_from, cNamePageIdMap.get(cl_to)); + } + } + } - } - - @Override - public void processRevisionRow(RevisionParser revisionParser) { - textIdPageIdMap.put(revisionParser.getRevTextId(), revisionParser - .getRevPage()); - } - - @Override - public void processTextRow(TextParser textParser) throws IOException { - - String destination; - int text_id; - int page_id; - - text_id = textParser.getOldId(); - if (!textIdPageIdMap.containsKey(text_id)) - return; - page_id = textIdPageIdMap.get(text_id); - if (pPageIdNameMap.containsKey(page_id)) {// pages - page.addRow(page_id, page_id, pPageIdNameMap.get(page_id), - textParser.getOldText(), formatBoolean(disambiguations.contains(page_id))); - pageMapLine.addRow(page_id, pPageIdNameMap.get(page_id), page_id, "NULL", "NULL"); - return; + @Override + public void processPageLinksRow(PagelinksParser plParser) throws IOException + { + + int pl_from; + String pl_to; + + pl_from = plParser.getPlFrom(); + pl_to = plParser.getPlTo(); + // skip redirects or page with other namespace than 0 + + if (skipPage && !pPageIdNameMap.containsKey(pl_from) + || !pNamePageIdMap.containsKey(pl_to)) { + return; + } + + pageOutlinks.addRow(pl_from, pNamePageIdMap.get(pl_to)); + pageInlinks.addRow(pNamePageIdMap.get(pl_to), pl_from); } - if (rPageIdNameMap.containsKey(page_id)) {// Redirects - destination = Redirects.getRedirectDestination(textParser.getOldText()); - if (!pNamePageIdMap.containsKey(destination)) - return; - 
pageRedirects.addRow(pNamePageIdMap.get(destination), rPageIdNameMap.get(page_id)); - pageMapLine.addRow(page_id, rPageIdNameMap.get(page_id), - pNamePageIdMap.get(destination), "NULL", "NULL"); - nrOfRedirects++; + + @Override + public void processPageRow(PageParser pageParser) throws IOException + { + + int page_id; + int page_namespace; + String page_title; + + page_namespace = pageParser.getPageNamespace(); + // handle categories + if (page_namespace == 14) { + if (skipCategory) { + if (pageParser.getPageIsRedirect()) + // skip categories that are redirects + return; + } + // retrieve page id and page title + page_id = pageParser.getPageId(); + page_title = pageParser.getPageTitle(); + if (page_title.equals(disambiguationsCategory)) { + logger.log("Disambiguations Category found: " + page_title); + } + if (page_title.equals(mainCategory)) { + logger.log("Main Category found: " + page_title); + } + // cache the retrieved values + cPageIdNameMap.put(page_id, page_title); + cNamePageIdMap.put(page_title, page_id); + // write a new row in the table Category. + // Note that we also consider the page_id as id + txtFW.addRow(page_id, page_id, page_title); + return; + } + // handle pages + if (page_namespace == 0) { + // retrieve page id and title + page_id = pageParser.getPageId(); + page_title = pageParser.getPageTitle(); + // distinguish redirects + if (pageParser.getPageIsRedirect()) { + rPageIdNameMap.put(page_id, page_title); + } + else { + pPageIdNameMap.put(page_id, page_title); + pNamePageIdMap.put(page_title, page_id); + } + } + } - } - - @Override - public void setFiles(Files versionFiles) { - // galkin: only output directory will be used, other file names will be - // taken from original source code - outputDir = versionFiles.getOutputDirectory().getAbsolutePath(); - } - - @Override - public void setLogger(ILogger logger) { - this.logger = logger; - } - - @Override - public void setMetaData(MetaData commonMetaData) { - this.language = commonMetaData.getLanguage(); - this.mainCategory = commonMetaData.getMainCategory(); - this.disambiguationsCategory = commonMetaData - .getDisambiguationCategory(); - } - - @Override - public void writeMetaData() throws IOException { - try(TxtFileWriter metaData = new TxtFileWriter(outputDir + File.separator + "MetaData.txt")) { - // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories - metaData.addRow("null", language, disambiguationsCategory, - mainCategory, nrOfPages, nrOfRedirects, nrOfDisambiguations, nrOfCategories); - metaData.export(); + @Override + public void processRevisionRow(RevisionParser revisionParser) + { + textIdPageIdMap.put(revisionParser.getRevTextId(), revisionParser.getRevPage()); } - } - @Override - public void setCategoryRedirectsSkip(boolean skipCategory) { - this.skipCategory = skipCategory; - } + @Override + public void processTextRow(TextParser textParser) throws IOException + { + + String destination; + int text_id; + int page_id; + + text_id = textParser.getOldId(); + if (!textIdPageIdMap.containsKey(text_id)) + return; + page_id = textIdPageIdMap.get(text_id); + if (pPageIdNameMap.containsKey(page_id)) {// pages + page.addRow(page_id, page_id, pPageIdNameMap.get(page_id), textParser.getOldText(), + formatBoolean(disambiguations.contains(page_id))); + pageMapLine.addRow(page_id, pPageIdNameMap.get(page_id), page_id, "NULL", "NULL"); + return; + } + if (rPageIdNameMap.containsKey(page_id)) {// Redirects + destination = 
Redirects.getRedirectDestination(textParser.getOldText()); + if (!pNamePageIdMap.containsKey(destination)) + return; + pageRedirects.addRow(pNamePageIdMap.get(destination), rPageIdNameMap.get(page_id)); + pageMapLine.addRow(page_id, rPageIdNameMap.get(page_id), + pNamePageIdMap.get(destination), "NULL", "NULL"); + nrOfRedirects++; + } - @Override - public void setPageRedirectsSkip(boolean skipPage) { - this.skipPage = skipPage; - } + } + + @Override + public void setFiles(Files versionFiles) + { + // galkin: only output directory will be used, other file names will be + // taken from original source code + outputDir = versionFiles.getOutputDirectory().getAbsolutePath(); + } + + @Override + public void setLogger(ILogger logger) + { + this.logger = logger; + } + + @Override + public void setMetaData(MetaData commonMetaData) + { + this.language = commonMetaData.getLanguage(); + this.mainCategory = commonMetaData.getMainCategory(); + this.disambiguationsCategory = commonMetaData.getDisambiguationCategory(); + } + + @Override + public void writeMetaData() throws IOException + { + try (TxtFileWriter metaData = new TxtFileWriter( + outputDir + File.separator + "MetaData.txt")) { + // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories + metaData.addRow("null", language, disambiguationsCategory, mainCategory, nrOfPages, + nrOfRedirects, nrOfDisambiguations, nrOfCategories); + metaData.export(); + } + } + + @Override + public void setCategoryRedirectsSkip(boolean skipCategory) + { + this.skipCategory = skipCategory; + } + + @Override + public void setPageRedirectsSkip(boolean skipPage) + { + this.skipPage = skipPage; + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/BinaryDumpTableInputStream.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/BinaryDumpTableInputStream.java index 9d356e80..1f3be07b 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/BinaryDumpTableInputStream.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/BinaryDumpTableInputStream.java @@ -23,19 +23,23 @@ import org.dkpro.jwpl.wikimachine.dump.xml.DumpTableEnum; import org.dkpro.jwpl.wikimachine.dump.xml.DumpTableInputStream; -public class BinaryDumpTableInputStream extends DumpTableInputStream { +public class BinaryDumpTableInputStream + extends DumpTableInputStream +{ - protected InputStream inputStream = null; + protected InputStream inputStream = null; - @Override - public void initialize(InputStream inputStream, DumpTableEnum table) throws IOException { - // just read from the stream without any data manipulations - this.inputStream = inputStream; - } + @Override + public void initialize(InputStream inputStream, DumpTableEnum table) throws IOException + { + // just read from the stream without any data manipulations + this.inputStream = inputStream; + } - @Override - public int read() throws IOException { - return inputStream.read(); - } + @Override + public int read() throws IOException + { + return inputStream.read(); + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/DataMachineRevisionParser.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/DataMachineRevisionParser.java index 8132b13e..78f8cd6a 100755 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/DataMachineRevisionParser.java +++ 
b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/DataMachineRevisionParser.java @@ -22,17 +22,21 @@ import org.dkpro.jwpl.wikimachine.dump.xml.RevisionParser; -public class DataMachineRevisionParser extends RevisionParser { +public class DataMachineRevisionParser + extends RevisionParser +{ - @Override - public boolean next() throws IOException { - boolean hasNext = true; - try { - revPage = stream.readInt(); - revTextId = stream.readInt(); - } catch (EOFException e) { - hasNext = false; + @Override + public boolean next() throws IOException + { + boolean hasNext = true; + try { + revPage = stream.readInt(); + revTextId = stream.readInt(); + } + catch (EOFException e) { + hasNext = false; + } + return hasNext; } - return hasNext; - } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleBinaryDumpWriter.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleBinaryDumpWriter.java index 50453b86..35e3141c 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleBinaryDumpWriter.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleBinaryDumpWriter.java @@ -35,113 +35,136 @@ import org.dkpro.jwpl.wikimachine.util.Redirects; import org.dkpro.jwpl.wikimachine.util.UTFDataOutputStream; -public class SimpleBinaryDumpWriter implements DumpWriter { - - private UTFDataOutputStream pageFile; - private UTFDataOutputStream revisionFile; - private UTFDataOutputStream textFile; - private final DataMachineFiles files; - - private Page currentPage; - private Revision lastRevision; - - public SimpleBinaryDumpWriter(DataMachineFiles files) throws IOException { - this.files = files; - if (this.files.isCompressGeneratedFiles()) { - createCompressed(); - } else { - createUncompressed(); +public class SimpleBinaryDumpWriter + implements DumpWriter +{ + + private UTFDataOutputStream pageFile; + private UTFDataOutputStream revisionFile; + private UTFDataOutputStream textFile; + private final DataMachineFiles files; + + private Page currentPage; + private Revision lastRevision; + + public SimpleBinaryDumpWriter(DataMachineFiles files) throws IOException + { + this.files = files; + if (this.files.isCompressGeneratedFiles()) { + createCompressed(); + } + else { + createUncompressed(); + } } - } - - protected void createUncompressed() throws IOException { - pageFile = openUTFDataOutputStream(files.getGeneratedPage(), false); - revisionFile = openUTFDataOutputStream(files.getGeneratedRevision(), false); - textFile = openUTFDataOutputStream(files.getGeneratedText(), false); - } - - protected void createCompressed() throws IOException { - pageFile = openUTFDataOutputStream(files.getGeneratedPage(), true); - revisionFile = openUTFDataOutputStream(files.getGeneratedRevision(), true); - textFile = openUTFDataOutputStream(files.getGeneratedText(), true); - } - - private UTFDataOutputStream openUTFDataOutputStream(final String filePath, final boolean compressed) throws IOException { - UTFDataOutputStream utfDataOutputStream; - if (compressed) { - utfDataOutputStream = new UTFDataOutputStream(new GZIPOutputStream(openFileStreamAndRegisterDeletion(filePath))); - } else { - utfDataOutputStream = new UTFDataOutputStream(openFileStreamAndRegisterDeletion(filePath)); + + protected void createUncompressed() throws IOException + { + pageFile = openUTFDataOutputStream(files.getGeneratedPage(), false); + revisionFile = 
openUTFDataOutputStream(files.getGeneratedRevision(), false); + textFile = openUTFDataOutputStream(files.getGeneratedText(), false); + } + + protected void createCompressed() throws IOException + { + pageFile = openUTFDataOutputStream(files.getGeneratedPage(), true); + revisionFile = openUTFDataOutputStream(files.getGeneratedRevision(), true); + textFile = openUTFDataOutputStream(files.getGeneratedText(), true); + } + + private UTFDataOutputStream openUTFDataOutputStream(final String filePath, + final boolean compressed) + throws IOException + { + UTFDataOutputStream utfDataOutputStream; + if (compressed) { + utfDataOutputStream = new UTFDataOutputStream( + new GZIPOutputStream(openFileStreamAndRegisterDeletion(filePath))); + } + else { + utfDataOutputStream = new UTFDataOutputStream( + openFileStreamAndRegisterDeletion(filePath)); + } + return utfDataOutputStream; + } + + private BufferedOutputStream openFileStreamAndRegisterDeletion(final String filePath) + throws IOException + { + Path binaryOutputFilePath = Paths.get(filePath); + // JavaDoc says: + // "truncate and overwrite an existing file, or create the file if it doesn't initially + // exist" + OutputStream fileOutputStream = Files.newOutputStream(binaryOutputFilePath); + + // Register a delete hook on JVM shutdown for this path + DeleteFilesAtShutdown.register(binaryOutputFilePath); + + // Create a buffered version for this + return new BufferedOutputStream(fileOutputStream); + } + + @Override + public void close() throws IOException + { + pageFile.close(); + revisionFile.close(); + textFile.close(); + } + + @Override + public void writeEndPage() throws IOException + { + if (lastRevision != null) { + updatePage(currentPage, lastRevision); + } + currentPage = null; + lastRevision = null; } - return utfDataOutputStream; - } - - private BufferedOutputStream openFileStreamAndRegisterDeletion(final String filePath) throws IOException { - Path binaryOutputFilePath = Paths.get(filePath); - // JavaDoc says: - // "truncate and overwrite an existing file, or create the file if it doesn't initially exist" - OutputStream fileOutputStream = Files.newOutputStream(binaryOutputFilePath); - - // Register a delete hook on JVM shutdown for this path - DeleteFilesAtShutdown.register(binaryOutputFilePath); - - // Create a buffered version for this - return new BufferedOutputStream(fileOutputStream); - } - - @Override - public void close() throws IOException { - pageFile.close(); - revisionFile.close(); - textFile.close(); - } - - @Override - public void writeEndPage() throws IOException { - if (lastRevision != null) { - updatePage(currentPage, lastRevision); + + @Override + public void writeEndWiki() throws IOException + { + pageFile.flush(); + revisionFile.flush(); + textFile.flush(); + } + + @Override + public void writeRevision(Revision revision) throws IOException + { + lastRevision = revision; + + revisionFile.writeInt(currentPage.Id); + revisionFile.writeInt(revision.Id); + + textFile.writeInt(revision.Id); + textFile.writeUTFAsArray(SQLEscape.escape(revision.Text)); + } + + @Override + public void writeSiteinfo(Siteinfo info) throws IOException + { + } + + @Override + public void writeStartPage(Page page) throws IOException + { + currentPage = page; + lastRevision = null; + } + + @Override + public void writeStartWiki() throws IOException + { + } + + private void updatePage(Page page, Revision revision) throws IOException + { + pageFile.writeInt(page.Id); + pageFile.writeInt(page.Title.Namespace); + 
pageFile.writeUTFAsArray(SQLEscape.escape(SQLEscape.titleFormat(page.Title.Text))); + // pageFile.writeBoolean(revision.isRedirect()); + pageFile.writeBoolean(Redirects.isRedirect(revision.Text)); } - currentPage = null; - lastRevision = null; - } - - @Override - public void writeEndWiki() throws IOException { - pageFile.flush(); - revisionFile.flush(); - textFile.flush(); - } - - @Override - public void writeRevision(Revision revision) throws IOException { - lastRevision = revision; - - revisionFile.writeInt(currentPage.Id); - revisionFile.writeInt(revision.Id); - - textFile.writeInt(revision.Id); - textFile.writeUTFAsArray(SQLEscape.escape(revision.Text)); - } - - @Override - public void writeSiteinfo(Siteinfo info) throws IOException { - } - - @Override - public void writeStartPage(Page page) throws IOException { - currentPage = page; - lastRevision = null; - } - - @Override - public void writeStartWiki() throws IOException { - } - - private void updatePage(Page page, Revision revision) throws IOException { - pageFile.writeInt(page.Id); - pageFile.writeInt(page.Title.Namespace); - pageFile.writeUTFAsArray(SQLEscape.escape(SQLEscape.titleFormat(page.Title.Text))); - // pageFile.writeBoolean(revision.isRedirect()); - pageFile.writeBoolean(Redirects.isRedirect(revision.Text)); - } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleXmlDumpReader.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleXmlDumpReader.java index 15eba9a7..c4f42383 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleXmlDumpReader.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/SimpleXmlDumpReader.java @@ -30,37 +30,42 @@ import org.dkpro.jwpl.wikimachine.dump.xml.AbstractXmlDumpReader; /** - * This class is a specified variant of XmlDumpReader. Please see its source for more - * information about a functionality and a license.<br> + * This class is a specified variant of XmlDumpReader. 
Please see its source for more information + * about a functionality and a license.<br> */ -public class SimpleXmlDumpReader extends AbstractXmlDumpReader { +public class SimpleXmlDumpReader + extends AbstractXmlDumpReader +{ - public SimpleXmlDumpReader(InputStream inputStream, DumpWriter writer) { - super(inputStream, writer); + public SimpleXmlDumpReader(InputStream inputStream, DumpWriter writer) + { + super(inputStream, writer); - } + } - @Override - protected void setupStartElements() { - startElements.put(REVISION, REVISION); - startElements.put(CONTRIBUTOR, CONTRIBUTOR); - startElements.put(PAGE, PAGE); - startElements.put(SITEINFO, SITEINFO); - startElements.put(NAMESPACES, NAMESPACES); - startElements.put(NAMESPACE, NAMESPACE); - } + @Override + protected void setupStartElements() + { + startElements.put(REVISION, REVISION); + startElements.put(CONTRIBUTOR, CONTRIBUTOR); + startElements.put(PAGE, PAGE); + startElements.put(SITEINFO, SITEINFO); + startElements.put(NAMESPACES, NAMESPACES); + startElements.put(NAMESPACE, NAMESPACE); + } - @Override - protected void setupEndElements() { - endElements.put(REVISION, REVISION); - endElements.put(TEXT, TEXT); - endElements.put(CONTRIBUTOR, CONTRIBUTOR); - endElements.put(ID, ID); - endElements.put(PAGE, PAGE); - endElements.put(TITLE, TITLE); - endElements.put(SITEINFO, SITEINFO); - endElements.put(NAMESPACES, NAMESPACES); - endElements.put(NAMESPACE, NAMESPACE); + @Override + protected void setupEndElements() + { + endElements.put(REVISION, REVISION); + endElements.put(TEXT, TEXT); + endElements.put(CONTRIBUTOR, CONTRIBUTOR); + endElements.put(ID, ID); + endElements.put(PAGE, PAGE); + endElements.put(TITLE, TITLE); + endElements.put(SITEINFO, SITEINFO); + endElements.put(NAMESPACES, NAMESPACES); + endElements.put(NAMESPACE, NAMESPACE); - } + } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/XML2Binary.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/XML2Binary.java index f683d314..c55fb9b7 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/XML2Binary.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/dump/xml/XML2Binary.java @@ -25,28 +25,33 @@ import org.dkpro.jwpl.mwdumper.importer.XmlDumpReader; /** - * Use org.mediawiki.importer engine to parse the XML-Dump (only useful fields) - * and store it to binary file. Compression of the output files is possible. + * Use org.mediawiki.importer engine to parse the XML-Dump (only useful fields) and store it to + * binary file. Compression of the output files is possible. 
*/ -public class XML2Binary { - - /* - * Enable the main and category pages as well as discussions - */ - private static final String ENABLED_NAMESPACES = "NS_MAIN,NS_TALK,NS_CATEGORY"; +public class XML2Binary +{ - private static final boolean USE_MODIFIED_PARSER = true; + /* + * Enable the main and category pages as well as discussions + */ + private static final String ENABLED_NAMESPACES = "NS_MAIN,NS_TALK,NS_CATEGORY"; - public XML2Binary(InputStream iStream, DataMachineFiles files) throws IOException { - if (USE_MODIFIED_PARSER) { - // modified parser, skips faulty tags - new SimpleXmlDumpReader(iStream, new NamespaceFilter( - new SimpleBinaryDumpWriter(files), ENABLED_NAMESPACES)).readDump(); - } else { - // original MWDumper parser, very sensible to not closed tags - new XmlDumpReader(iStream, new NamespaceFilter( - new SimpleBinaryDumpWriter(files), ENABLED_NAMESPACES)).readDump(); + private static final boolean USE_MODIFIED_PARSER = true; + + public XML2Binary(InputStream iStream, DataMachineFiles files) throws IOException + { + if (USE_MODIFIED_PARSER) { + // modified parser, skips faulty tags + new SimpleXmlDumpReader(iStream, + new NamespaceFilter(new SimpleBinaryDumpWriter(files), ENABLED_NAMESPACES)) + .readDump(); + } + else { + // original MWDumper parser, very sensible to not closed tags + new XmlDumpReader(iStream, + new NamespaceFilter(new SimpleBinaryDumpWriter(files), ENABLED_NAMESPACES)) + .readDump(); + } } - } } diff --git a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/file/DeleteFilesAtShutdown.java b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/file/DeleteFilesAtShutdown.java index 043c3ba9..efe2aeca 100644 --- a/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/file/DeleteFilesAtShutdown.java +++ b/dkpro-jwpl-datamachine/src/main/java/org/dkpro/jwpl/datamachine/file/DeleteFilesAtShutdown.java @@ -27,47 +27,54 @@ import java.util.Set; /** - * A file deletion "watch dog" that can be to remove files via its {@link Path} references. It will clean out files - * upon JVM shutdown: guaranteed! + * A file deletion "watch dog" that can be to remove files via its {@link Path} references. It will + * clean out files upon JVM shutdown: guaranteed! 
* <p> * Inspired by and adapted from the answer here: * <a href="https://stackoverflow.com/a/42389029">https://stackoverflow.com/a/42389029</a> */ -public final class DeleteFilesAtShutdown { - private static Set<Path> paths = new LinkedHashSet<>(); +public final class DeleteFilesAtShutdown +{ + private static Set<Path> paths = new LinkedHashSet<>(); - static { - // registers the call of 'shutdownHook' at JVM shutdown - Runtime.getRuntime().addShutdownHook(new Thread(DeleteFilesAtShutdown::cleanupRegisteredFiles)); - } - - private static void cleanupRegisteredFiles() { - Set<Path> local; - synchronized (DeleteFilesAtShutdown.class) { - local = paths; - paths = null; + static { + // registers the call of 'shutdownHook' at JVM shutdown + Runtime.getRuntime() + .addShutdownHook(new Thread(DeleteFilesAtShutdown::cleanupRegisteredFiles)); } - List<Path> toBeDeleted = new ArrayList<>(local); - Collections.reverse(toBeDeleted); - for (Path p : toBeDeleted) { - try { - Files.delete(p); - } catch (IOException | RuntimeException e) { - // do nothing - best-effort - } + private static void cleanupRegisteredFiles() + { + Set<Path> local; + synchronized (DeleteFilesAtShutdown.class) { + local = paths; + paths = null; + } + + List<Path> toBeDeleted = new ArrayList<>(local); + Collections.reverse(toBeDeleted); + for (Path p : toBeDeleted) { + try { + Files.delete(p); + } + catch (IOException | RuntimeException e) { + // do nothing - best-effort + } + } } - } - /** - * Registers a {@link Path} to be removed at JVM shutdown. - * - * @param filePath A valid path pointing to a file. - */ - public static synchronized void register(Path filePath) { - if (paths == null) { - throw new IllegalStateException("Shutdown hook is already in progress. Adding paths is not allowed now!"); + /** + * Registers a {@link Path} to be removed at JVM shutdown. + * + * @param filePath + * A valid path pointing to a file. + */ + public static synchronized void register(Path filePath) + { + if (paths == null) { + throw new IllegalStateException( + "Shutdown hook is already in progress. 
Adding paths is not allowed now!"); + } + paths.add(filePath); } - paths.add(filePath); - } } \ No newline at end of file From 89beca7b30e846c620469f00cef16befa9b0f0a8 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho <richard.eckart@gmail.com> Date: Tue, 31 Oct 2023 14:26:19 +0100 Subject: [PATCH 07/14] #164 - Introduce checkstyle - Auto-format dkpro-jwpl-mwdumper --- .../dkpro/jwpl/mwdumper/dumper/Dumper.java | 394 +++++---- .../jwpl/mwdumper/dumper/ProgressFilter.java | 106 +-- .../org/dkpro/jwpl/mwdumper/dumper/Tools.java | 92 +- .../importer/AfterTimeStampFilter.java | 20 +- .../importer/BeforeTimeStampFilter.java | 20 +- .../dkpro/jwpl/mwdumper/importer/Buffer.java | 59 +- .../jwpl/mwdumper/importer/Contributor.java | 25 +- .../jwpl/mwdumper/importer/DumpWriter.java | 19 +- .../mwdumper/importer/ExactListFilter.java | 18 +- .../jwpl/mwdumper/importer/LatestFilter.java | 70 +- .../jwpl/mwdumper/importer/ListFilter.java | 53 +- .../jwpl/mwdumper/importer/MultiWriter.java | 97 ++- .../mwdumper/importer/NamespaceFilter.java | 76 +- .../jwpl/mwdumper/importer/NamespaceSet.java | 73 +- .../jwpl/mwdumper/importer/NotalkFilter.java | 18 +- .../dkpro/jwpl/mwdumper/importer/Page.java | 22 +- .../jwpl/mwdumper/importer/PageFilter.java | 79 +- .../jwpl/mwdumper/importer/Revision.java | 35 +- .../mwdumper/importer/RevisionListFilter.java | 105 ++- .../jwpl/mwdumper/importer/Siteinfo.java | 13 +- .../jwpl/mwdumper/importer/SphinxWriter.java | 110 +-- .../jwpl/mwdumper/importer/SqlFileStream.java | 38 +- .../jwpl/mwdumper/importer/SqlLiteral.java | 19 +- .../mwdumper/importer/SqlServerStream.java | 59 +- .../jwpl/mwdumper/importer/SqlStream.java | 9 +- .../jwpl/mwdumper/importer/SqlWriter.java | 625 +++++++------- .../jwpl/mwdumper/importer/SqlWriter14.java | 126 +-- .../jwpl/mwdumper/importer/SqlWriter15.java | 202 ++--- .../mwdumper/importer/TimeStampFilter.java | 82 +- .../dkpro/jwpl/mwdumper/importer/Title.java | 138 +-- .../mwdumper/importer/TitleMatchFilter.java | 22 +- .../jwpl/mwdumper/importer/XmlDumpReader.java | 804 ++++++++++-------- .../jwpl/mwdumper/importer/XmlDumpWriter.java | 233 ++--- .../jwpl/mwdumper/importer/XmlWriter.java | 381 +++++---- .../jwpl/mwdumper/importer/TitleTest.java | 363 ++++---- 35 files changed, 2489 insertions(+), 2116 deletions(-) diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Dumper.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Dumper.java index 3fd7b418..c932b064 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Dumper.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Dumper.java @@ -88,216 +88,246 @@ import org.dkpro.jwpl.mwdumper.importer.XmlDumpReader; import org.dkpro.jwpl.mwdumper.importer.XmlDumpWriter; -class Dumper { - public static void main(String[] args) throws IOException, ParseException { - InputStream input = null; - OutputWrapper output = null; - DumpWriter sink = null; - MultiWriter writers = new MultiWriter(); - int progressInterval = 1000; +class Dumper +{ + public static void main(String[] args) throws IOException, ParseException + { + InputStream input = null; + OutputWrapper output = null; + DumpWriter sink = null; + MultiWriter writers = new MultiWriter(); + int progressInterval = 1000; - for (int i = 0; i < args.length; i++) { - String arg = args[i]; - String[] bits = splitArg(arg); - if (bits != null) { - String opt = bits[0], val = bits[1], param = bits[2]; - if (opt.equals("output")) { - if (output != null) { - // 
Finish constructing the previous output... - if (sink == null) - sink = new XmlDumpWriter(output.getFileStream()); - writers.add(sink); - sink = null; - } - output = openOutputFile(val, param); - } else if (opt.equals("format")) { - if (output == null) + for (int i = 0; i < args.length; i++) { + String arg = args[i]; + String[] bits = splitArg(arg); + if (bits != null) { + String opt = bits[0], val = bits[1], param = bits[2]; + if (opt.equals("output")) { + if (output != null) { + // Finish constructing the previous output... + if (sink == null) + sink = new XmlDumpWriter(output.getFileStream()); + writers.add(sink); + sink = null; + } + output = openOutputFile(val, param); + } + else if (opt.equals("format")) { + if (output == null) + output = new OutputWrapper(Tools.openStandardOutput()); + if (sink != null) + throw new IllegalArgumentException("Only one format per output allowed."); + sink = openOutputSink(output, val, param); + } + else if (opt.equals("filter")) { + if (sink == null) { + if (output == null) + output = new OutputWrapper(Tools.openStandardOutput()); + sink = new XmlDumpWriter(output.getFileStream()); + } + sink = addFilter(sink, val, param); + } + else if (opt.equals("progress")) { + progressInterval = Integer.parseInt(val); + } + else if (opt.equals("quiet")) { + progressInterval = 0; + } + else { + throw new IllegalArgumentException("Unrecognized option " + opt); + } + } + else if (arg.equals("-")) { + if (input != null) + throw new IllegalArgumentException("Input already set; can't set to stdin"); + input = Tools.openStandardInput(); + } + else { + if (input != null) + throw new IllegalArgumentException("Input already set; can't set to " + arg); + input = Tools.openInputFile(arg); + } + } + + if (input == null) + input = Tools.openStandardInput(); + if (output == null) output = new OutputWrapper(Tools.openStandardOutput()); - if (sink != null) - throw new IllegalArgumentException("Only one format per output allowed."); - sink = openOutputSink(output, val, param); - } else if (opt.equals("filter")) { - if (sink == null) { - if (output == null) - output = new OutputWrapper(Tools.openStandardOutput()); + // Finish stacking the last output sink + if (sink == null) sink = new XmlDumpWriter(output.getFileStream()); - } - sink = addFilter(sink, val, param); - } else if (opt.equals("progress")) { - progressInterval = Integer.parseInt(val); - } else if (opt.equals("quiet")) { - progressInterval = 0; - } else { - throw new IllegalArgumentException("Unrecognized option " + opt); - } - } else if (arg.equals("-")) { - if (input != null) - throw new IllegalArgumentException("Input already set; can't set to stdin"); - input = Tools.openStandardInput(); - } else { - if (input != null) - throw new IllegalArgumentException("Input already set; can't set to " + arg); - input = Tools.openInputFile(arg); - } - } + writers.add(sink); - if (input == null) - input = Tools.openStandardInput(); - if (output == null) - output = new OutputWrapper(Tools.openStandardOutput()); - // Finish stacking the last output sink - if (sink == null) - sink = new XmlDumpWriter(output.getFileStream()); - writers.add(sink); + DumpWriter outputSink = (progressInterval > 0) + ? new ProgressFilter(writers, progressInterval) + : writers; - DumpWriter outputSink = (progressInterval > 0) ? 
new ProgressFilter(writers, progressInterval) : writers; + XmlDumpReader reader = new XmlDumpReader(input, outputSink); + reader.readDump(); + } - XmlDumpReader reader = new XmlDumpReader(input, outputSink); - reader.readDump(); - } + /** + * @param arg + * string in format "--option=value:parameter" + * @return array of option, value, and parameter, or null if no match + */ + static String[] splitArg(String arg) + { + if (!arg.startsWith("--")) + return null; - /** - * @param arg string in format "--option=value:parameter" - * @return array of option, value, and parameter, or null if no match - */ - static String[] splitArg(String arg) { - if (!arg.startsWith("--")) - return null; + String opt; + String val = ""; + String param = ""; - String opt; - String val = ""; - String param = ""; + String[] bits = arg.substring(2).split("=", 2); + opt = bits[0]; - String[] bits = arg.substring(2).split("=", 2); - opt = bits[0]; + if (bits.length > 1) { + String[] bits2 = bits[1].split(":", 2); + val = bits2[0]; + if (bits2.length > 1) + param = bits2[1]; + } - if (bits.length > 1) { - String[] bits2 = bits[1].split(":", 2); - val = bits2[0]; - if (bits2.length > 1) - param = bits2[1]; + return new String[] { opt, val, param }; } - return new String[]{opt, val, param}; - } + // ---------------- - // ---------------- + static class OutputWrapper + { + private OutputStream fileStream = null; + private Connection sqlConnection = null; - static class OutputWrapper { - private OutputStream fileStream = null; - private Connection sqlConnection = null; + OutputWrapper(OutputStream aFileStream) + { + fileStream = aFileStream; + } - OutputWrapper(OutputStream aFileStream) { - fileStream = aFileStream; - } + OutputWrapper(Connection anSqlConnection) + { + sqlConnection = anSqlConnection; + } - OutputWrapper(Connection anSqlConnection) { - sqlConnection = anSqlConnection; - } + OutputStream getFileStream() + { + if (fileStream != null) + return fileStream; + if (sqlConnection != null) + throw new IllegalArgumentException("Expected file stream, got SQL connection?"); + throw new IllegalArgumentException( + "Have neither file nor SQL connection. Very confused!"); + } - OutputStream getFileStream() { - if (fileStream != null) - return fileStream; - if (sqlConnection != null) - throw new IllegalArgumentException("Expected file stream, got SQL connection?"); - throw new IllegalArgumentException("Have neither file nor SQL connection. Very confused!"); + SqlStream getSqlStream() throws IOException + { + if (fileStream != null) + return new SqlFileStream(fileStream); + if (sqlConnection != null) + return new SqlServerStream(sqlConnection); + throw new IllegalArgumentException( + "Have neither file nor SQL connection. Very confused!"); + } } - SqlStream getSqlStream() throws IOException { - if (fileStream != null) - return new SqlFileStream(fileStream); - if (sqlConnection != null) - return new SqlServerStream(sqlConnection); - throw new IllegalArgumentException("Have neither file nor SQL connection. 
Very confused!"); + static OutputWrapper openOutputFile(String dest, String param) throws IOException + { + if (dest.equals("stdout")) + return new OutputWrapper(Tools.openStandardOutput()); + else if (dest.equals("file")) + return new OutputWrapper(Tools.createOutputFile(param)); + else if (dest.equals("gzip")) + return new OutputWrapper(new GZIPOutputStream(Tools.createOutputFile(param))); + else if (dest.equals("bzip2")) + return new OutputWrapper(Tools.createBZip2File(param)); + else if (dest.equals("mysql")) + return connectMySql(param); + else if (dest.equals("postgresql")) + return connectPostgres(param); + else + throw new IllegalArgumentException("Destination sink not implemented: " + dest); } - } - - static OutputWrapper openOutputFile(String dest, String param) throws IOException { - if (dest.equals("stdout")) - return new OutputWrapper(Tools.openStandardOutput()); - else if (dest.equals("file")) - return new OutputWrapper(Tools.createOutputFile(param)); - else if (dest.equals("gzip")) - return new OutputWrapper(new GZIPOutputStream(Tools.createOutputFile(param))); - else if (dest.equals("bzip2")) - return new OutputWrapper(Tools.createBZip2File(param)); - else if (dest.equals("mysql")) - return connectMySql(param); - else if (dest.equals("postgresql")) - return connectPostgres(param); - else - throw new IllegalArgumentException("Destination sink not implemented: " + dest); - } - private static OutputWrapper connectMySql(String param) throws IOException { - try { - Class.forName("com.mysql.jdbc.Driver"); - Connection conn = DriverManager.getConnection("jdbc:mysql:" + param); - return new OutputWrapper(conn); - } catch (Exception e) { - throw (IOException) new IOException(e.getMessage()).initCause(e); + private static OutputWrapper connectMySql(String param) throws IOException + { + try { + Class.forName("com.mysql.jdbc.Driver"); + Connection conn = DriverManager.getConnection("jdbc:mysql:" + param); + return new OutputWrapper(conn); + } + catch (Exception e) { + throw (IOException) new IOException(e.getMessage()).initCause(e); + } } - } - private static OutputWrapper connectPostgres(String param) throws IOException { - try { - Class.forName("org.postgresql.Driver"); - Connection conn = DriverManager.getConnection("jdbc:postgresql:" + param); - return new OutputWrapper(conn); - } catch (Exception e) { - throw new IOException(e.toString()); + private static OutputWrapper connectPostgres(String param) throws IOException + { + try { + Class.forName("org.postgresql.Driver"); + Connection conn = DriverManager.getConnection("jdbc:postgresql:" + param); + return new OutputWrapper(conn); + } + catch (Exception e) { + throw new IOException(e.toString()); + } } - } - static DumpWriter openOutputSink(OutputWrapper output, String format, String param) throws IOException { - if (format.equals("xml")) - return new XmlDumpWriter(output.getFileStream()); - else if (format.equals("sphinx")) - return new SphinxWriter(output.getFileStream()); - else if (format.equals("mysql") || format.equals("pgsql") || format.equals("sql")) { - SqlStream sqlStream = output.getSqlStream(); - SqlWriter ret; + static DumpWriter openOutputSink(OutputWrapper output, String format, String param) + throws IOException + { + if (format.equals("xml")) + return new XmlDumpWriter(output.getFileStream()); + else if (format.equals("sphinx")) + return new SphinxWriter(output.getFileStream()); + else if (format.equals("mysql") || format.equals("pgsql") || format.equals("sql")) { + SqlStream sqlStream = output.getSqlStream(); + 
SqlWriter ret; - SqlWriter.Traits tr; - if (format.equals("pgsql")) - tr = new SqlWriter.PostgresTraits(); - else - tr = new SqlWriter.MySQLTraits(); + SqlWriter.Traits tr; + if (format.equals("pgsql")) + tr = new SqlWriter.PostgresTraits(); + else + tr = new SqlWriter.MySQLTraits(); - if (param.equals("1.4")) - ret = new SqlWriter14(tr, sqlStream); - else if (param.equals("1.5")) - ret = new SqlWriter15(tr, sqlStream); - else - throw new IllegalArgumentException("SQL version not known: " + param); + if (param.equals("1.4")) + ret = new SqlWriter14(tr, sqlStream); + else if (param.equals("1.5")) + ret = new SqlWriter15(tr, sqlStream); + else + throw new IllegalArgumentException("SQL version not known: " + param); - return ret; - } else - throw new IllegalArgumentException("Output format not known: " + format); - } + return ret; + } + else + throw new IllegalArgumentException("Output format not known: " + format); + } - // ---------------- + // ---------------- - static DumpWriter addFilter(DumpWriter sink, String filter, String param) throws IOException, ParseException { - if (filter.equals("latest")) - return new LatestFilter(sink); - else if (filter.equals("namespace")) - return new NamespaceFilter(sink, param); - else if (filter.equals("notalk")) - return new NotalkFilter(sink); - else if (filter.equals("titlematch")) - return new TitleMatchFilter(sink, param); - else if (filter.equals("list")) - return new ListFilter(sink, param); - else if (filter.equals("exactlist")) - return new ExactListFilter(sink, param); - else if (filter.equals("revlist")) - return new RevisionListFilter(sink, param); - else if (filter.equals("before")) - return new BeforeTimeStampFilter(sink, param); - else if (filter.equals("after")) - return new AfterTimeStampFilter(sink, param); - else - throw new IllegalArgumentException("Filter unknown: " + filter); - } + static DumpWriter addFilter(DumpWriter sink, String filter, String param) + throws IOException, ParseException + { + if (filter.equals("latest")) + return new LatestFilter(sink); + else if (filter.equals("namespace")) + return new NamespaceFilter(sink, param); + else if (filter.equals("notalk")) + return new NotalkFilter(sink); + else if (filter.equals("titlematch")) + return new TitleMatchFilter(sink, param); + else if (filter.equals("list")) + return new ListFilter(sink, param); + else if (filter.equals("exactlist")) + return new ExactListFilter(sink, param); + else if (filter.equals("revlist")) + return new RevisionListFilter(sink, param); + else if (filter.equals("before")) + return new BeforeTimeStampFilter(sink, param); + else if (filter.equals("after")) + return new AfterTimeStampFilter(sink, param); + else + throw new IllegalArgumentException("Filter unknown: " + filter); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/ProgressFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/ProgressFilter.java index 04ac00bb..bf159e5c 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/ProgressFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/ProgressFilter.java @@ -33,62 +33,68 @@ import org.dkpro.jwpl.mwdumper.importer.PageFilter; import org.dkpro.jwpl.mwdumper.importer.Revision; -public class ProgressFilter extends PageFilter { - int pages = 0; - int revisions = 0; - final int interval; - final MessageFormat format = new MessageFormat("{0} pages ({1}/sec), {2} revs ({3}/sec)"); - final long start = System.currentTimeMillis(); +public 
class ProgressFilter + extends PageFilter +{ + int pages = 0; + int revisions = 0; + final int interval; + final MessageFormat format = new MessageFormat("{0} pages ({1}/sec), {2} revs ({3}/sec)"); + final long start = System.currentTimeMillis(); - public ProgressFilter(DumpWriter sink, int interval) { - super(sink); - this.interval = interval; - if (interval <= 0) - throw new IllegalArgumentException("Reporting interval must be positive."); - } + public ProgressFilter(DumpWriter sink, int interval) + { + super(sink); + this.interval = interval; + if (interval <= 0) + throw new IllegalArgumentException("Reporting interval must be positive."); + } - public void writeStartPage(Page page) throws IOException { - super.writeStartPage(page); - pages++; - } + public void writeStartPage(Page page) throws IOException + { + super.writeStartPage(page); + pages++; + } - public void writeRevision(Revision rev) throws IOException { - super.writeRevision(rev); - revisions++; - reportProgress(); - } + public void writeRevision(Revision rev) throws IOException + { + super.writeRevision(rev); + revisions++; + reportProgress(); + } - /** - * If we didn't just show a progress report on the last revision, - * show the final results. - * - * @throws IOException - */ - public void writeEndWiki() throws IOException { - super.writeEndWiki(); - if (revisions % interval != 0) - showProgress(); - } + /** + * If we didn't just show a progress report on the last revision, show the final results. + * + * @throws IOException + */ + public void writeEndWiki() throws IOException + { + super.writeEndWiki(); + if (revisions % interval != 0) + showProgress(); + } - private void reportProgress() { - if (revisions % interval == 0) - showProgress(); - } + private void reportProgress() + { + if (revisions % interval == 0) + showProgress(); + } - private void showProgress() { - long delta = System.currentTimeMillis() - start; - sendOutput(format.format(new Object[]{ - pages, rate(delta, pages), - revisions, rate(delta, revisions)})); - } + private void showProgress() + { + long delta = System.currentTimeMillis() - start; + sendOutput(format.format( + new Object[] { pages, rate(delta, pages), revisions, rate(delta, revisions) })); + } - protected void sendOutput(String text) { - System.err.println(text); - } + protected void sendOutput(String text) + { + System.err.println(text); + } - private static Object rate(long delta, int count) { - return (delta > 0.001) - ? (Double) (1000.0 * (double) count / (double) delta) - : (Object) "-"; - } + private static Object rate(long delta, int count) + { + return (delta > 0.001) ? 
(Double) (1000.0 * (double) count / (double) delta) : (Object) "-"; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Tools.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Tools.java index a36eed96..19efc7f6 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Tools.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/dumper/Tools.java @@ -30,56 +30,64 @@ import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorOutputStream; -public class Tools { - static final int IN_BUF_SZ = 1024 * 1024; - private static final int OUT_BUF_SZ = 1024 * 1024; +public class Tools +{ + static final int IN_BUF_SZ = 1024 * 1024; + private static final int OUT_BUF_SZ = 1024 * 1024; - public static InputStream openInputFile(String arg) throws IOException { - if (arg.equals("-")) { - return openStandardInput(); + public static InputStream openInputFile(String arg) throws IOException + { + if (arg.equals("-")) { + return openStandardInput(); + } + InputStream infile = new BufferedInputStream(new FileInputStream(arg), IN_BUF_SZ); + if (arg.endsWith(".gz")) { + return new GZIPInputStream(infile); + } + else if (arg.endsWith(".bz2")) { + return openBZip2Stream(infile); + } + else { + return infile; + } } - InputStream infile = new BufferedInputStream(new FileInputStream(arg), IN_BUF_SZ); - if (arg.endsWith(".gz")) { - return new GZIPInputStream(infile); - } else if (arg.endsWith(".bz2")) { - return openBZip2Stream(infile); - } else { - return infile; - } - } - - static InputStream openStandardInput() throws IOException { - return new BufferedInputStream(System.in, IN_BUF_SZ); - } - static InputStream openBZip2Stream(InputStream infile) throws IOException { - int first = infile.read(); - int second = infile.read(); - if (first != 'B' || second != 'Z') { - throw new IOException("Didn't find BZ file signature in .bz2 file"); + static InputStream openStandardInput() throws IOException + { + return new BufferedInputStream(System.in, IN_BUF_SZ); } - return new BZip2CompressorInputStream(infile); - } - static OutputStream openStandardOutput() { - return new BufferedOutputStream(System.out, OUT_BUF_SZ); - } + static InputStream openBZip2Stream(InputStream infile) throws IOException + { + int first = infile.read(); + int second = infile.read(); + if (first != 'B' || second != 'Z') { + throw new IOException("Didn't find BZ file signature in .bz2 file"); + } + return new BZip2CompressorInputStream(infile); + } - static OutputStream createBZip2File(String param) throws IOException { - OutputStream outfile = createOutputFile(param); - // bzip2 expects a two-byte 'BZ' signature header - outfile.write('B'); - outfile.write('Z'); - return new BZip2CompressorOutputStream(outfile); - } + static OutputStream openStandardOutput() + { + return new BufferedOutputStream(System.out, OUT_BUF_SZ); + } - static OutputStream createOutputFile(String param) throws IOException { - File file = new File(param); - file.createNewFile(); - return new BufferedOutputStream(new FileOutputStream(file), OUT_BUF_SZ); - } + static OutputStream createBZip2File(String param) throws IOException + { + OutputStream outfile = createOutputFile(param); + // bzip2 expects a two-byte 'BZ' signature header + outfile.write('B'); + outfile.write('Z'); + return new BZip2CompressorOutputStream(outfile); + } + static OutputStream createOutputFile(String param) throws IOException + { + 
File file = new File(param); + file.createNewFile(); + return new BufferedOutputStream(new FileOutputStream(file), OUT_BUF_SZ); + } - // ---------------- + // ---------------- } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/AfterTimeStampFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/AfterTimeStampFilter.java index 258221fe..f038c07c 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/AfterTimeStampFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/AfterTimeStampFilter.java @@ -27,15 +27,19 @@ import java.io.IOException; import java.text.ParseException; -public class AfterTimeStampFilter extends TimeStampFilter { +public class AfterTimeStampFilter + extends TimeStampFilter +{ - public AfterTimeStampFilter(DumpWriter sink, String timeStamp) throws ParseException { - super(sink, timeStamp); - } + public AfterTimeStampFilter(DumpWriter sink, String timeStamp) throws ParseException + { + super(sink, timeStamp); + } - public void writeRevision(Revision revision) throws IOException { - if (revision.Timestamp.after(super.filterTimeStamp)) { - super.writeRevision(revision); + public void writeRevision(Revision revision) throws IOException + { + if (revision.Timestamp.after(super.filterTimeStamp)) { + super.writeRevision(revision); + } } - } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/BeforeTimeStampFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/BeforeTimeStampFilter.java index 65af0bbe..eccc788c 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/BeforeTimeStampFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/BeforeTimeStampFilter.java @@ -27,15 +27,19 @@ import java.io.IOException; import java.text.ParseException; -public class BeforeTimeStampFilter extends TimeStampFilter { +public class BeforeTimeStampFilter + extends TimeStampFilter +{ - public BeforeTimeStampFilter(DumpWriter sink, String timeStamp) throws ParseException { - super(sink, timeStamp); - } + public BeforeTimeStampFilter(DumpWriter sink, String timeStamp) throws ParseException + { + super(sink, timeStamp); + } - public void writeRevision(Revision revision) throws IOException { - if (revision.Timestamp.before(super.filterTimeStamp)) { - super.writeRevision(revision); + public void writeRevision(Revision revision) throws IOException + { + if (revision.Timestamp.before(super.filterTimeStamp)) { + super.writeRevision(revision); + } } - } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Buffer.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Buffer.java index e9cb0396..0622b3d7 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Buffer.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Buffer.java @@ -27,39 +27,44 @@ import java.util.IdentityHashMap; -public final class Buffer { +public final class Buffer +{ - private Buffer() { - } + private Buffer() + { + } - private static final IdentityHashMap<Thread, char[]> BUFFERS = new IdentityHashMap<>(); + private static final IdentityHashMap<Thread, char[]> BUFFERS = new IdentityHashMap<>(); - private static Thread lastThread; - private static char[] lastBuffer; + private static Thread lastThread; + private static char[] lastBuffer; - public static synchronized char[] get(int capacity) { - final Thread thread = 
Thread.currentThread(); - char[] buffer; + public static synchronized char[] get(int capacity) + { + final Thread thread = Thread.currentThread(); + char[] buffer; - if (lastThread == thread) { - buffer = lastBuffer; - } else { - lastThread = thread; - buffer = lastBuffer = BUFFERS.get(thread); - } + if (lastThread == thread) { + buffer = lastBuffer; + } + else { + lastThread = thread; + buffer = lastBuffer = BUFFERS.get(thread); + } - if (buffer == null) { - buffer = lastBuffer = new char[capacity]; - BUFFERS.put(thread, buffer); - } else if (buffer.length < capacity) { - int newsize = buffer.length * 2; - if (newsize < capacity) - newsize = capacity; + if (buffer == null) { + buffer = lastBuffer = new char[capacity]; + BUFFERS.put(thread, buffer); + } + else if (buffer.length < capacity) { + int newsize = buffer.length * 2; + if (newsize < capacity) + newsize = capacity; - buffer = lastBuffer = new char[newsize]; - BUFFERS.put(thread, buffer); - } + buffer = lastBuffer = new char[newsize]; + BUFFERS.put(thread, buffer); + } - return buffer; - } + return buffer; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Contributor.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Contributor.java index 3706b3cd..13aac13b 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Contributor.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Contributor.java @@ -25,18 +25,21 @@ package org.dkpro.jwpl.mwdumper.importer; -public class Contributor { - public String Username; - public int Id; - public boolean isIP = false; +public class Contributor +{ + public String Username; + public int Id; + public boolean isIP = false; - public Contributor() { - this(null, 0); - } + public Contributor() + { + this(null, 0); + } - public Contributor(String username, int id) { - Username = username; - Id = id; - } + public Contributor(String username, int id) + { + Username = username; + Id = id; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/DumpWriter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/DumpWriter.java index 0185d888..04e4f872 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/DumpWriter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/DumpWriter.java @@ -27,19 +27,20 @@ import java.io.IOException; -public interface DumpWriter { - void close() throws IOException; +public interface DumpWriter +{ + void close() throws IOException; - void writeStartWiki() throws IOException; + void writeStartWiki() throws IOException; - void writeEndWiki() throws IOException; + void writeEndWiki() throws IOException; - void writeSiteinfo(Siteinfo info) throws IOException; + void writeSiteinfo(Siteinfo info) throws IOException; - void writeStartPage(Page page) throws IOException; + void writeStartPage(Page page) throws IOException; - void writeEndPage() throws IOException; + void writeEndPage() throws IOException; - void writeRevision(Revision revision) throws IOException; - //void WriteUpload(Upload upload) throws IOException; // for the future + void writeRevision(Revision revision) throws IOException; + // void WriteUpload(Upload upload) throws IOException; // for the future } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ExactListFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ExactListFilter.java index fa6088d7..35e62dc7 
100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ExactListFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ExactListFilter.java @@ -27,12 +27,16 @@ import java.io.IOException; -public class ExactListFilter extends ListFilter { - public ExactListFilter(DumpWriter sink, String sourceFileName) throws IOException { - super(sink, sourceFileName); - } +public class ExactListFilter + extends ListFilter +{ + public ExactListFilter(DumpWriter sink, String sourceFileName) throws IOException + { + super(sink, sourceFileName); + } - protected boolean pass(Page page) { - return list.containsKey(page.Title.toString()); - } + protected boolean pass(Page page) + { + return list.containsKey(page.Title.toString()); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/LatestFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/LatestFilter.java index c94946e6..0fae7385 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/LatestFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/LatestFilter.java @@ -27,43 +27,53 @@ import java.io.IOException; -public class LatestFilter implements DumpWriter { - final DumpWriter sink; - Revision lastRevision; +public class LatestFilter + implements DumpWriter +{ + final DumpWriter sink; + Revision lastRevision; - public LatestFilter(DumpWriter sink) { - this.sink = sink; - } + public LatestFilter(DumpWriter sink) + { + this.sink = sink; + } - public void close() throws IOException { - sink.close(); - } + public void close() throws IOException + { + sink.close(); + } - public void writeStartWiki() throws IOException { - sink.writeStartWiki(); - } + public void writeStartWiki() throws IOException + { + sink.writeStartWiki(); + } - public void writeEndWiki() throws IOException { - sink.writeEndWiki(); - } + public void writeEndWiki() throws IOException + { + sink.writeEndWiki(); + } - public void writeSiteinfo(Siteinfo info) throws IOException { - sink.writeSiteinfo(info); - } + public void writeSiteinfo(Siteinfo info) throws IOException + { + sink.writeSiteinfo(info); + } - public void writeStartPage(Page page) throws IOException { - sink.writeStartPage(page); - } + public void writeStartPage(Page page) throws IOException + { + sink.writeStartPage(page); + } - public void writeEndPage() throws IOException { - if (lastRevision != null) { - sink.writeRevision(lastRevision); - lastRevision = null; + public void writeEndPage() throws IOException + { + if (lastRevision != null) { + sink.writeRevision(lastRevision); + lastRevision = null; + } + sink.writeEndPage(); } - sink.writeEndPage(); - } - public void writeRevision(Revision revision) { - lastRevision = revision; - } + public void writeRevision(Revision revision) + { + lastRevision = revision; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ListFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ListFilter.java index 8701ea0a..77ecfb89 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ListFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/ListFilter.java @@ -34,32 +34,37 @@ import java.util.HashMap; import java.util.Map; -public class ListFilter extends PageFilter { - protected final Map<String, String> list; +public class ListFilter + extends PageFilter +{ + protected final Map<String, String> list; - public 
ListFilter(DumpWriter sink, String sourceFileName) throws IOException { - super(sink); - list = new HashMap<>(); - BufferedReader input = new BufferedReader(new InputStreamReader(new BufferedInputStream( - new FileInputStream(sourceFileName)), StandardCharsets.UTF_8)); - String line = input.readLine(); - while (line != null) { - if (!line.startsWith("#")) { - String title = line.trim(); - title = title.replace("_", " "); - if (title.startsWith(":")) - title = line.substring(1); + public ListFilter(DumpWriter sink, String sourceFileName) throws IOException + { + super(sink); + list = new HashMap<>(); + BufferedReader input = new BufferedReader( + new InputStreamReader(new BufferedInputStream(new FileInputStream(sourceFileName)), + StandardCharsets.UTF_8)); + String line = input.readLine(); + while (line != null) { + if (!line.startsWith("#")) { + String title = line.trim(); + title = title.replace("_", " "); + if (title.startsWith(":")) + title = line.substring(1); - if (title.length() > 0) - list.put(title, title); - } - line = input.readLine(); + if (title.length() > 0) + list.put(title, title); + } + line = input.readLine(); + } + input.close(); } - input.close(); - } - protected boolean pass(Page page) { - return list.containsKey(page.Title.subjectPage().toString()) - || list.containsKey(page.Title.talkPage().toString()); - } + protected boolean pass(Page page) + { + return list.containsKey(page.Title.subjectPage().toString()) + || list.containsKey(page.Title.talkPage().toString()); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/MultiWriter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/MultiWriter.java index cd93c524..dfe89b80 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/MultiWriter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/MultiWriter.java @@ -29,63 +29,74 @@ import java.util.ArrayList; import java.util.List; -public class MultiWriter implements DumpWriter { - private final List<DumpWriter> sinks; +public class MultiWriter + implements DumpWriter +{ + private final List<DumpWriter> sinks; - public MultiWriter() { - sinks = new ArrayList<>(); - } + public MultiWriter() + { + sinks = new ArrayList<>(); + } - public void close() throws IOException { - for (int i = 0; i < sinks.size(); i++) { - DumpWriter sink = sinks.get(i); - sink.close(); + public void close() throws IOException + { + for (int i = 0; i < sinks.size(); i++) { + DumpWriter sink = sinks.get(i); + sink.close(); + } } - } - public void writeStartWiki() throws IOException { - for (int i = 0; i < sinks.size(); i++) { - DumpWriter sink = sinks.get(i); - sink.writeStartWiki(); + public void writeStartWiki() throws IOException + { + for (int i = 0; i < sinks.size(); i++) { + DumpWriter sink = sinks.get(i); + sink.writeStartWiki(); + } } - } - public void writeEndWiki() throws IOException { - for (int i = 0; i < sinks.size(); i++) { - DumpWriter sink = sinks.get(i); - sink.writeEndWiki(); + public void writeEndWiki() throws IOException + { + for (int i = 0; i < sinks.size(); i++) { + DumpWriter sink = sinks.get(i); + sink.writeEndWiki(); + } } - } - public void writeSiteinfo(Siteinfo info) throws IOException { - for (int i = 0; i < sinks.size(); i++) { - DumpWriter sink = sinks.get(i); - sink.writeSiteinfo(info); + public void writeSiteinfo(Siteinfo info) throws IOException + { + for (int i = 0; i < sinks.size(); i++) { + DumpWriter sink = sinks.get(i); + sink.writeSiteinfo(info); + } } - } - 
public void writeStartPage(Page page) throws IOException { - for (int i = 0; i < sinks.size(); i++) { - DumpWriter sink = sinks.get(i); - sink.writeStartPage(page); + public void writeStartPage(Page page) throws IOException + { + for (int i = 0; i < sinks.size(); i++) { + DumpWriter sink = sinks.get(i); + sink.writeStartPage(page); + } } - } - public void writeEndPage() throws IOException { - for (int i = 0; i < sinks.size(); i++) { - DumpWriter sink = sinks.get(i); - sink.writeEndPage(); + public void writeEndPage() throws IOException + { + for (int i = 0; i < sinks.size(); i++) { + DumpWriter sink = sinks.get(i); + sink.writeEndPage(); + } } - } - public void writeRevision(Revision revision) throws IOException { - for (int i = 0; i < sinks.size(); i++) { - DumpWriter sink = sinks.get(i); - sink.writeRevision(revision); + public void writeRevision(Revision revision) throws IOException + { + for (int i = 0; i < sinks.size(); i++) { + DumpWriter sink = sinks.get(i); + sink.writeRevision(revision); + } } - } - public void add(DumpWriter sink) { - sinks.add(sink); - } + public void add(DumpWriter sink) + { + sinks.add(sink); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceFilter.java index 9bb0a053..965f8866 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceFilter.java @@ -28,53 +28,45 @@ import java.util.HashMap; import java.util.Map; -public class NamespaceFilter extends PageFilter { - final boolean invert; - final Map<Integer, String> matches; +public class NamespaceFilter + extends PageFilter +{ + final boolean invert; + final Map<Integer, String> matches; - public NamespaceFilter(DumpWriter sink, String configString) { - super(sink); + public NamespaceFilter(DumpWriter sink, String configString) + { + super(sink); - invert = configString.startsWith("!"); - if (invert) - configString = configString.substring(1); - matches = new HashMap<>(); + invert = configString.startsWith("!"); + if (invert) + configString = configString.substring(1); + matches = new HashMap<>(); - String[] namespaceKeys = { - "NS_MAIN", - "NS_TALK", - "NS_USER", - "NS_USER_TALK", - "NS_PROJECT", - "NS_PROJECT_TALK", - "NS_IMAGE", - "NS_IMAGE_TALK", - "NS_MEDIAWIKI", - "NS_MEDIAWIKI_TALK", - "NS_TEMPLATE", - "NS_TEMPLATE_TALK", - "NS_HELP", - "NS_HELP_TALK", - "NS_CATEGORY", - "NS_CATEGORY_TALK"}; + String[] namespaceKeys = { "NS_MAIN", "NS_TALK", "NS_USER", "NS_USER_TALK", "NS_PROJECT", + "NS_PROJECT_TALK", "NS_IMAGE", "NS_IMAGE_TALK", "NS_MEDIAWIKI", "NS_MEDIAWIKI_TALK", + "NS_TEMPLATE", "NS_TEMPLATE_TALK", "NS_HELP", "NS_HELP_TALK", "NS_CATEGORY", + "NS_CATEGORY_TALK" }; - String[] itemList = configString.trim().split(","); - for (int i = 0; i < itemList.length; i++) { - String keyString = itemList[i]; - String trimmed = keyString.trim(); - try { - int key = Integer.parseInt(trimmed); - matches.put(key, trimmed); - } catch (NumberFormatException e) { - for (int key = 0; key < namespaceKeys.length; key++) { - if (trimmed.equalsIgnoreCase(namespaceKeys[key])) - matches.put(key, trimmed); + String[] itemList = configString.trim().split(","); + for (int i = 0; i < itemList.length; i++) { + String keyString = itemList[i]; + String trimmed = keyString.trim(); + try { + int key = Integer.parseInt(trimmed); + matches.put(key, trimmed); + } + catch 
(NumberFormatException e) { + for (int key = 0; key < namespaceKeys.length; key++) { + if (trimmed.equalsIgnoreCase(namespaceKeys[key])) + matches.put(key, trimmed); + } + } } - } } - } - protected boolean pass(Page page) { - return invert ^ matches.containsKey(page.Title.Namespace); - } + protected boolean pass(Page page) + { + return invert ^ matches.containsKey(page.Title.Namespace); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceSet.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceSet.java index ef3f1c95..9ec40fde 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceSet.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NamespaceSet.java @@ -30,44 +30,53 @@ import java.util.LinkedHashMap; import java.util.Map; -public class NamespaceSet { - private final Map<String, Integer> byname; - private final Map<Integer, String> bynumber; +public class NamespaceSet +{ + private final Map<String, Integer> byname; + private final Map<Integer, String> bynumber; - public NamespaceSet() { - byname = new HashMap<>(); - bynumber = new LinkedHashMap<>(); - } + public NamespaceSet() + { + byname = new HashMap<>(); + bynumber = new LinkedHashMap<>(); + } - public void add(Integer index, String prefix) { - byname.put(prefix, index); - bynumber.put(index, prefix); - } + public void add(Integer index, String prefix) + { + byname.put(prefix, index); + bynumber.put(index, prefix); + } - public boolean hasPrefix(String prefix) { - return byname.containsKey(prefix); - } + public boolean hasPrefix(String prefix) + { + return byname.containsKey(prefix); + } - public boolean hasIndex(Integer index) { - return bynumber.containsKey(index); - } + public boolean hasIndex(Integer index) + { + return bynumber.containsKey(index); + } - public String getPrefix(Integer index) { - return bynumber.get(index); - } + public String getPrefix(Integer index) + { + return bynumber.get(index); + } - public Integer getIndex(String prefix) { - return byname.get(prefix); - } + public Integer getIndex(String prefix) + { + return byname.get(prefix); + } - public String getColonPrefix(Integer index) { - String prefix = getPrefix(index); - if (index != 0) - return prefix.concat(":"); - return prefix; - } + public String getColonPrefix(Integer index) + { + String prefix = getPrefix(index); + if (index != 0) + return prefix.concat(":"); + return prefix; + } - public Iterator<Map.Entry<Integer, String>> orderedEntries() { - return bynumber.entrySet().iterator(); - } + public Iterator<Map.Entry<Integer, String>> orderedEntries() + { + return bynumber.entrySet().iterator(); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NotalkFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NotalkFilter.java index f8b8bcf3..c637c715 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NotalkFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/NotalkFilter.java @@ -25,12 +25,16 @@ package org.dkpro.jwpl.mwdumper.importer; -public class NotalkFilter extends PageFilter { - public NotalkFilter(DumpWriter sink) { - super(sink); - } +public class NotalkFilter + extends PageFilter +{ + public NotalkFilter(DumpWriter sink) + { + super(sink); + } - protected boolean pass(Page page) { - return !page.Title.isTalk(); - } + protected boolean pass(Page page) + { + return !page.Title.isTalk(); + } } diff 
--git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Page.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Page.java index 91cdb13d..da12d4c4 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Page.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Page.java @@ -27,15 +27,17 @@ import java.util.Hashtable; -public class Page { - public Title Title; - public int Id; - public final Hashtable<String, Object> DiscussionThreadingInfo; - public String Restrictions; +public class Page +{ + public Title Title; + public int Id; + public final Hashtable<String, Object> DiscussionThreadingInfo; + public String Restrictions; - public Page() { - // <restrictions> is optional... - Restrictions = ""; - DiscussionThreadingInfo = new Hashtable<>(); - } + public Page() + { + // <restrictions> is optional... + Restrictions = ""; + DiscussionThreadingInfo = new Hashtable<>(); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/PageFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/PageFilter.java index 26b28d5d..274eb54e 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/PageFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/PageFilter.java @@ -27,47 +27,58 @@ import java.io.IOException; -public abstract class PageFilter implements DumpWriter { - final DumpWriter sink; - boolean showThisPage; +public abstract class PageFilter + implements DumpWriter +{ + final DumpWriter sink; + boolean showThisPage; - public PageFilter(DumpWriter sink) { - this.sink = sink; - } + public PageFilter(DumpWriter sink) + { + this.sink = sink; + } - public void close() throws IOException { - sink.close(); - } + public void close() throws IOException + { + sink.close(); + } - public void writeStartWiki() throws IOException { - sink.writeStartWiki(); - } + public void writeStartWiki() throws IOException + { + sink.writeStartWiki(); + } - public void writeEndWiki() throws IOException { - sink.writeEndWiki(); - } + public void writeEndWiki() throws IOException + { + sink.writeEndWiki(); + } - public void writeSiteinfo(Siteinfo info) throws IOException { - sink.writeSiteinfo(info); - } + public void writeSiteinfo(Siteinfo info) throws IOException + { + sink.writeSiteinfo(info); + } - public void writeStartPage(Page page) throws IOException { - showThisPage = pass(page); - if (showThisPage) - sink.writeStartPage(page); - } + public void writeStartPage(Page page) throws IOException + { + showThisPage = pass(page); + if (showThisPage) + sink.writeStartPage(page); + } - public void writeEndPage() throws IOException { - if (showThisPage) - sink.writeEndPage(); - } + public void writeEndPage() throws IOException + { + if (showThisPage) + sink.writeEndPage(); + } - public void writeRevision(Revision revision) throws IOException { - if (showThisPage) - sink.writeRevision(revision); - } + public void writeRevision(Revision revision) throws IOException + { + if (showThisPage) + sink.writeRevision(revision); + } - protected boolean pass(Page page) { - return true; - } + protected boolean pass(Page page) + { + return true; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Revision.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Revision.java index 611f9b6e..da029ad8 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Revision.java +++ 
b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Revision.java @@ -27,22 +27,25 @@ import java.util.Calendar; -public class Revision { - public int Id; - public Calendar Timestamp; - public Contributor Contributor; - public String Comment; - public String Text; - public boolean Minor; +public class Revision +{ + public int Id; + public Calendar Timestamp; + public Contributor Contributor; + public String Comment; + public String Text; + public boolean Minor; - public boolean isRedirect() { - // FIXME - return Text.startsWith("#REDIRECT ") || Text.startsWith("#redirect "); - } + public boolean isRedirect() + { + // FIXME + return Text.startsWith("#REDIRECT ") || Text.startsWith("#redirect "); + } - public Revision() { - Comment = ""; - Text = ""; - Minor = false; - } + public Revision() + { + Comment = ""; + Text = ""; + Minor = false; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/RevisionListFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/RevisionListFilter.java index a0c6bd08..5dc805bd 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/RevisionListFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/RevisionListFilter.java @@ -34,62 +34,73 @@ import java.util.Set; import java.util.TreeSet; -public class RevisionListFilter implements DumpWriter { - final DumpWriter sink; - protected final Set<String> revIds; - protected Page currentPage; - protected boolean pageWritten; +public class RevisionListFilter + implements DumpWriter +{ + final DumpWriter sink; + protected final Set<String> revIds; + protected Page currentPage; + protected boolean pageWritten; - public RevisionListFilter(DumpWriter sink, String sourceFileName) throws IOException { - this.sink = sink; - revIds = new TreeSet<>(); - BufferedReader input = new BufferedReader(new InputStreamReader(new BufferedInputStream( - new FileInputStream(sourceFileName)), StandardCharsets.UTF_8)); - String line = input.readLine(); - while (line != null) { - line = line.trim(); - if (line.length() > 0 && !line.startsWith("#")) { - revIds.add(line); - } - line = input.readLine(); + public RevisionListFilter(DumpWriter sink, String sourceFileName) throws IOException + { + this.sink = sink; + revIds = new TreeSet<>(); + BufferedReader input = new BufferedReader( + new InputStreamReader(new BufferedInputStream(new FileInputStream(sourceFileName)), + StandardCharsets.UTF_8)); + String line = input.readLine(); + while (line != null) { + line = line.trim(); + if (line.length() > 0 && !line.startsWith("#")) { + revIds.add(line); + } + line = input.readLine(); + } + input.close(); } - input.close(); - } - public void close() throws IOException { - sink.close(); - } + public void close() throws IOException + { + sink.close(); + } - public void writeStartWiki() throws IOException { - sink.writeStartWiki(); - } + public void writeStartWiki() throws IOException + { + sink.writeStartWiki(); + } - public void writeEndWiki() throws IOException { - sink.writeEndWiki(); - } + public void writeEndWiki() throws IOException + { + sink.writeEndWiki(); + } - public void writeSiteinfo(Siteinfo info) throws IOException { - sink.writeSiteinfo(info); - } + public void writeSiteinfo(Siteinfo info) throws IOException + { + sink.writeSiteinfo(info); + } - public void writeStartPage(Page page) throws IOException { - currentPage = page; - pageWritten = false; - } + public void writeStartPage(Page page) throws IOException + { + 
currentPage = page; + pageWritten = false; + } - public void writeEndPage() throws IOException { - if (pageWritten) { - sink.writeEndPage(); + public void writeEndPage() throws IOException + { + if (pageWritten) { + sink.writeEndPage(); + } } - } - public void writeRevision(Revision revision) throws IOException { - if (revIds.contains(Integer.valueOf(revision.Id).toString())) { - if (!pageWritten) { - sink.writeStartPage(currentPage); - pageWritten = true; - } - sink.writeRevision(revision); + public void writeRevision(Revision revision) throws IOException + { + if (revIds.contains(Integer.valueOf(revision.Id).toString())) { + if (!pageWritten) { + sink.writeStartPage(currentPage); + pageWritten = true; + } + sink.writeRevision(revision); + } } - } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Siteinfo.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Siteinfo.java index 846dcddd..36db12b6 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Siteinfo.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Siteinfo.java @@ -25,10 +25,11 @@ package org.dkpro.jwpl.mwdumper.importer; -public class Siteinfo { - public String Sitename; - public String Base; - public String Generator; - public String Case; - public NamespaceSet Namespaces; +public class Siteinfo +{ + public String Sitename; + public String Base; + public String Generator; + public String Case; + public NamespaceSet Namespaces; } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SphinxWriter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SphinxWriter.java index 8d29b875..5fb69b34 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SphinxWriter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SphinxWriter.java @@ -32,63 +32,73 @@ /** * Generates XML stream suitable for the Sphinx search engine's xmlpipe input. */ -public class SphinxWriter implements DumpWriter { - protected final OutputStream stream; - protected final XmlWriter writer; - protected Page _page; - protected Revision _rev; +public class SphinxWriter + implements DumpWriter +{ + protected final OutputStream stream; + protected final XmlWriter writer; + protected Page _page; + protected Revision _rev; - public SphinxWriter(OutputStream output) { - stream = output; - writer = new XmlWriter(stream); - } + public SphinxWriter(OutputStream output) + { + stream = output; + writer = new XmlWriter(stream); + } - public void close() throws IOException { - writer.close(); - } + public void close() throws IOException + { + writer.close(); + } - public void writeStartWiki() throws IOException { - writer.openXml(); - // No containing element to open - } + public void writeStartWiki() throws IOException + { + writer.openXml(); + // No containing element to open + } - public void writeEndWiki() throws IOException { - // No containing element to close - writer.closeXml(); - } + public void writeEndWiki() throws IOException + { + // No containing element to close + writer.closeXml(); + } - public void writeSiteinfo(Siteinfo info) throws IOException { - // Nothing! - } + public void writeSiteinfo(Siteinfo info) throws IOException + { + // Nothing! + } - public void writeStartPage(Page page) throws IOException { - _page = page; - } + public void writeStartPage(Page page) throws IOException + { + _page = page; + } - /** - * FIXME What's the "group" number here do? 
- * FIXME preprocess the text to strip some formatting? - */ - public void writeEndPage() throws IOException { - writer.openElement("document"); - writer.textElement("id", Integer.toString(_page.Id)); - writer.textElement("group", "0"); - writer.textElement("timestamp", formatTimestamp(_rev.Timestamp)); - writer.textElement("title", _page.Title.toString()); - writer.textElement("body", _rev.Text); - writer.closeElement(); - _rev = null; - _page = null; - } + /** + * FIXME What's the "group" number here do? FIXME preprocess the text to strip some formatting? + */ + public void writeEndPage() throws IOException + { + writer.openElement("document"); + writer.textElement("id", Integer.toString(_page.Id)); + writer.textElement("group", "0"); + writer.textElement("timestamp", formatTimestamp(_rev.Timestamp)); + writer.textElement("title", _page.Title.toString()); + writer.textElement("body", _rev.Text); + writer.closeElement(); + _rev = null; + _page = null; + } - public void writeRevision(Revision rev) throws IOException { - _rev = rev; - } + public void writeRevision(Revision rev) throws IOException + { + _rev = rev; + } - /** - * FIXME double-check that it wants Unix timestamp - */ - static String formatTimestamp(Calendar ts) { - return Long.toString(ts.getTimeInMillis() / 1000L); - } + /** + * FIXME double-check that it wants Unix timestamp + */ + static String formatTimestamp(Calendar ts) + { + return Long.toString(ts.getTimeInMillis() / 1000L); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlFileStream.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlFileStream.java index 7037f7be..521544f6 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlFileStream.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlFileStream.java @@ -22,24 +22,30 @@ import java.io.PrintStream; import java.nio.charset.StandardCharsets; -public class SqlFileStream implements SqlStream { - protected final PrintStream stream; +public class SqlFileStream + implements SqlStream +{ + protected final PrintStream stream; - public SqlFileStream(OutputStream output) throws IOException { - this.stream = new PrintStream(output, false, StandardCharsets.UTF_8); - } + public SqlFileStream(OutputStream output) throws IOException + { + this.stream = new PrintStream(output, false, StandardCharsets.UTF_8); + } - public void writeComment(CharSequence sql) { - stream.println(sql.toString()); - } + public void writeComment(CharSequence sql) + { + stream.println(sql.toString()); + } - public void writeStatement(CharSequence sql) { - stream.print(sql.toString()); - stream.println(';'); - } + public void writeStatement(CharSequence sql) + { + stream.print(sql.toString()); + stream.println(';'); + } - public void close() { - stream.flush(); - stream.close(); - } + public void close() + { + stream.flush(); + stream.close(); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlLiteral.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlLiteral.java index 5c15b0e5..3e0dc464 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlLiteral.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlLiteral.java @@ -28,14 +28,17 @@ /** * Quickie wrapper class for including literal SQL expressions. 
*/ -public class SqlLiteral { - final String contents; +public class SqlLiteral +{ + final String contents; - public SqlLiteral(String contents) { - this.contents = contents; - } + public SqlLiteral(String contents) + { + this.contents = contents; + } - public String toString() { - return contents; - } + public String toString() + { + return contents; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlServerStream.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlServerStream.java index c33e5815..cfd1fce7 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlServerStream.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlServerStream.java @@ -23,36 +23,45 @@ import java.sql.SQLWarning; import java.sql.Statement; -public class SqlServerStream implements SqlStream { - private final Connection connection; +public class SqlServerStream + implements SqlStream +{ + private final Connection connection; - public SqlServerStream(Connection conn) { - connection = conn; // TODO - } + public SqlServerStream(Connection conn) + { + connection = conn; // TODO + } - public void writeComment(CharSequence sql) { - // do nothing - } + public void writeComment(CharSequence sql) + { + // do nothing + } - public void writeStatement(CharSequence sql) throws IOException { - Statement statement; - try { - statement = connection.createStatement(); - statement.setEscapeProcessing(false); - statement.execute(sql.toString()); - } catch (SQLException e) { - throw new IOException(e.toString()); + public void writeStatement(CharSequence sql) throws IOException + { + Statement statement; + try { + statement = connection.createStatement(); + statement.setEscapeProcessing(false); + statement.execute(sql.toString()); + } + catch (SQLException e) { + throw new IOException(e.toString()); + } } - } - public void close() throws IOException { - try { - connection.close(); - } catch (SQLWarning e) { - e.printStackTrace(); - } catch (SQLException e) { - throw new IOException(e.toString()); + public void close() throws IOException + { + try { + connection.close(); + } + catch (SQLWarning e) { + e.printStackTrace(); + } + catch (SQLException e) { + throw new IOException(e.toString()); + } } - } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlStream.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlStream.java index 5c5c4a1e..2fd7b39a 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlStream.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlStream.java @@ -19,10 +19,11 @@ import java.io.IOException; -public interface SqlStream { - void writeComment(CharSequence sql) throws IOException; +public interface SqlStream +{ + void writeComment(CharSequence sql) throws IOException; - void writeStatement(CharSequence sql) throws IOException; + void writeStatement(CharSequence sql) throws IOException; - void close() throws IOException; + void close() throws IOException; } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter.java index 609f2f16..af7d08ba 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter.java @@ -34,345 +34,386 @@ import java.util.Map; 
import java.util.TimeZone; +public abstract class SqlWriter + implements DumpWriter +{ + public static abstract class Traits + { + public abstract SqlLiteral getCurrentTime(); -public abstract class SqlWriter implements DumpWriter { - public static abstract class Traits { - public abstract SqlLiteral getCurrentTime(); + public abstract SqlLiteral getRandom(); - public abstract SqlLiteral getRandom(); + public abstract String getTextTable(); - public abstract String getTextTable(); + public abstract boolean supportsMultiRowInsert(); - public abstract boolean supportsMultiRowInsert(); + public abstract MessageFormat getTimestampFormatter(); - public abstract MessageFormat getTimestampFormatter(); + public String getWikiPrologue() + { + return null; + } - public String getWikiPrologue() { - return null; + public String getWikiEpilogue() + { + return null; + } } - public String getWikiEpilogue() { - return null; + public static class MySQLTraits + extends Traits + { + // UTC_TIMESTAMP() is new in MySQL 4.1 or 5.0, so using this + // godawful hack found in documentation comments: + public SqlLiteral getCurrentTime() + { + return new SqlLiteral("DATE_ADD('1970-01-01', INTERVAL UNIX_TIMESTAMP() SECOND)+0"); + } + + public SqlLiteral getRandom() + { + return new SqlLiteral("RAND()"); + } + + public boolean supportsMultiRowInsert() + { + return true; + } + + public String getTextTable() + { + return "text"; + } + + private static final MessageFormat timestampFormatter = new MessageFormat( + "{0,number,0000}{1,number,00}{2,number,00}{3,number,00}{4,number,00}{5,number,00}"); + + public MessageFormat getTimestampFormatter() + { + return timestampFormatter; + } } - } - public static class MySQLTraits extends Traits { - // UTC_TIMESTAMP() is new in MySQL 4.1 or 5.0, so using this - // godawful hack found in documentation comments: - public SqlLiteral getCurrentTime() { - return new SqlLiteral("DATE_ADD('1970-01-01', INTERVAL UNIX_TIMESTAMP() SECOND)+0"); + public static class PostgresTraits + extends Traits + { + public SqlLiteral getCurrentTime() + { + return new SqlLiteral("current_timestamp AT TIME ZONE 'UTC'"); + } + + public SqlLiteral getRandom() + { + return new SqlLiteral("RANDOM()"); + } + + public boolean supportsMultiRowInsert() + { + return false; + } + + public String getTextTable() + { + return "pagecontent"; + } + + private static final MessageFormat timestampFormatter = new MessageFormat( + "{0,number,0000}-{1,number,00}-{2,number,00} {3,number,00}:{4,number,00}:{5,number,00}"); + + public MessageFormat getTimestampFormatter() + { + return timestampFormatter; + } + + public String getWikiPrologue() + { + return "ALTER TABLE revision DISABLE TRIGGER ALL;" + + "ALTER TABLE page DISABLE TRIGGER ALL;"; + } + + public String getWikiEpilogue() + { + return "ALTER TABLE revision ENABLE TRIGGER ALL;" + + "ALTER TABLE page ENABLE TRIGGER ALL;"; + } } - public SqlLiteral getRandom() { - return new SqlLiteral("RAND()"); + private final SqlStream stream; + private String tablePrefix = ""; + + protected static final Integer ONE = 1; + protected static final Integer ZERO = 0; + protected final Traits traits; + + public SqlWriter(Traits tr, SqlStream output) + { + stream = output; + traits = tr; } - public boolean supportsMultiRowInsert() { - return true; + public SqlWriter(Traits tr, SqlStream output, String prefix) + { + stream = output; + tablePrefix = prefix; + traits = tr; } - public String getTextTable() { - return "text"; + public void close() throws IOException + { + stream.close(); } - 
private static final MessageFormat timestampFormatter = new MessageFormat( - "{0,number,0000}{1,number,00}{2,number,00}{3,number,00}{4,number,00}{5,number,00}"); + public void writeStartWiki() throws IOException + { + stream.writeComment("-- MediaWiki XML dump converted to SQL by mwdumper"); + stream.writeStatement("BEGIN"); - public MessageFormat getTimestampFormatter() { - return timestampFormatter; + String prologue = traits.getWikiPrologue(); + if (prologue != null) + stream.writeStatement(prologue); } - } - public static class PostgresTraits extends Traits { - public SqlLiteral getCurrentTime() { - return new SqlLiteral("current_timestamp AT TIME ZONE 'UTC'"); - } + public void writeEndWiki() throws IOException + { + flushInsertBuffers(); - public SqlLiteral getRandom() { - return new SqlLiteral("RANDOM()"); + String epilogue = traits.getWikiEpilogue(); + if (epilogue != null) + stream.writeStatement(epilogue); + stream.writeStatement("COMMIT"); + stream.writeComment("-- DONE"); } - public boolean supportsMultiRowInsert() { - return false; + public void writeSiteinfo(Siteinfo info) throws IOException + { + stream.writeComment(""); + stream.writeComment("-- Site: " + commentSafe(info.Sitename)); + stream.writeComment("-- URL: " + commentSafe(info.Base)); + stream.writeComment("-- Generator: " + commentSafe(info.Generator)); + stream.writeComment("-- Case: " + commentSafe(info.Case)); + stream.writeComment("--"); + stream.writeComment("-- Namespaces:"); + for (Iterator<Map.Entry<Integer, String>> i = info.Namespaces.orderedEntries(); i + .hasNext();) { + Map.Entry<Integer, String> e = i.next(); + stream.writeComment("-- " + e.getKey() + ": " + e.getValue()); + } + stream.writeComment(""); } - public String getTextTable() { - return "pagecontent"; - } + public abstract void writeStartPage(Page page) throws IOException; - private static final MessageFormat timestampFormatter = new MessageFormat( - "{0,number,0000}-{1,number,00}-{2,number,00} {3,number,00}:{4,number,00}:{5,number,00}"); + public abstract void writeEndPage() throws IOException; - public MessageFormat getTimestampFormatter() { - return timestampFormatter; + public abstract void writeRevision(Revision revision) throws IOException; + + protected String commentSafe(String text) + { + return text; } - public String getWikiPrologue() { - return - "ALTER TABLE revision DISABLE TRIGGER ALL;" + - "ALTER TABLE page DISABLE TRIGGER ALL;"; + private final Map<CharSequence, StringBuffer> insertBuffers = new HashMap<>(); + private static final int blockSize = 1024 * 512; // default 512k inserts + + protected void bufferInsertRow(String table, Object[][] row) throws IOException + { + StringBuffer sql = insertBuffers.get(table); + if (sql != null) { + if (traits.supportsMultiRowInsert() && (sql.length() < blockSize)) { + sql.append(','); + appendInsertValues(sql, row); + return; + } + else { + flushInsertBuffer(table); + } + } + sql = new StringBuffer(blockSize); + synchronized (sql) { // only for StringBuffer + appendInsertStatement(sql, table, row); + insertBuffers.put(table, sql); + } } - public String getWikiEpilogue() { - return - "ALTER TABLE revision ENABLE TRIGGER ALL;" + - "ALTER TABLE page ENABLE TRIGGER ALL;"; + protected void flushInsertBuffer(String table) throws IOException + { + stream.writeStatement(insertBuffers.get(table)); + insertBuffers.remove(table); } - } - - private final SqlStream stream; - private String tablePrefix = ""; - - protected static final Integer ONE = 1; - protected static final Integer ZERO = 0; - 
protected final Traits traits; - - public SqlWriter(Traits tr, SqlStream output) { - stream = output; - traits = tr; - } - - public SqlWriter(Traits tr, SqlStream output, String prefix) { - stream = output; - tablePrefix = prefix; - traits = tr; - } - - public void close() throws IOException { - stream.close(); - } - - public void writeStartWiki() throws IOException { - stream.writeComment("-- MediaWiki XML dump converted to SQL by mwdumper"); - stream.writeStatement("BEGIN"); - - String prologue = traits.getWikiPrologue(); - if (prologue != null) - stream.writeStatement(prologue); - } - - public void writeEndWiki() throws IOException { - flushInsertBuffers(); - - String epilogue = traits.getWikiEpilogue(); - if (epilogue != null) - stream.writeStatement(epilogue); - stream.writeStatement("COMMIT"); - stream.writeComment("-- DONE"); - } - - public void writeSiteinfo(Siteinfo info) throws IOException { - stream.writeComment(""); - stream.writeComment("-- Site: " + commentSafe(info.Sitename)); - stream.writeComment("-- URL: " + commentSafe(info.Base)); - stream.writeComment("-- Generator: " + commentSafe(info.Generator)); - stream.writeComment("-- Case: " + commentSafe(info.Case)); - stream.writeComment("--"); - stream.writeComment("-- Namespaces:"); - for (Iterator<Map.Entry<Integer, String>> i = info.Namespaces.orderedEntries(); i.hasNext(); ) { - Map.Entry<Integer, String> e = i.next(); - stream.writeComment("-- " + e.getKey() + ": " + e.getValue()); + + protected void flushInsertBuffers() throws IOException + { + for (StringBuffer stringBuffer : insertBuffers.values()) { + stream.writeStatement(stringBuffer); + } + insertBuffers.clear(); } - stream.writeComment(""); - } - public abstract void writeStartPage(Page page) throws IOException; + protected void insertRow(String table, Object[][] row) throws IOException + { + StringBuffer sql = new StringBuffer(65536); + appendInsertStatement(sql, table, row); + stream.writeStatement(sql); + } - public abstract void writeEndPage() throws IOException; + private void appendInsertStatement(StringBuffer sql, String table, Object[][] row) + { + sql.append("INSERT INTO "); + sql.append(tablePrefix); + sql.append(table); + sql.append(" ("); + + for (int i = 0; i < row.length; i++) { + String field = (String) row[i][0]; + if (i > 0) + sql.append(','); + sql.append(field); + } + sql.append(") VALUES "); + appendInsertValues(sql, row); + } - public abstract void writeRevision(Revision revision) throws IOException; + private static void appendInsertValues(StringBuffer sql, Object[][] row) + { + sql.append('('); + for (int i = 0; i < row.length; i++) { + Object val = row[i][1]; + if (i > 0) + sql.append(','); + sql.append(sqlSafe(val)); + } + sql.append(')'); + } - protected String commentSafe(String text) { - return text; - } + protected void updateRow(String table, Object[][] row, String keyField, Object keyValue) + throws IOException + { + StringBuffer sql = new StringBuffer(65536); + synchronized (sql) { // only for StringBuffer + sql.append("UPDATE "); + sql.append(tablePrefix); + sql.append(table); + sql.append(" SET "); + + for (int i = 0; i < row.length; i++) { + String field = (String) row[i][0]; + Object val = row[i][1]; + if (i > 0) + sql.append(','); + sql.append(field); + sql.append('='); + sql.append(sqlSafe(val)); + } + + sql.append(" WHERE "); + sql.append(keyField); + sql.append('='); + sql.append(sqlSafe(keyValue)); + + stream.writeStatement(sql); + } + } - private final Map<CharSequence, StringBuffer> insertBuffers = new HashMap<>(); - 
private static final int blockSize = 1024 * 512; // default 512k inserts + protected static String sqlSafe(Object val) + { + if (val == null) + return "NULL"; - protected void bufferInsertRow(String table, Object[][] row) throws IOException { - StringBuffer sql = insertBuffers.get(table); - if (sql != null) { - if (traits.supportsMultiRowInsert() && (sql.length() < blockSize)) { - sql.append(','); - appendInsertValues(sql, row); - return; - } else { - flushInsertBuffer(table); - } - } - sql = new StringBuffer(blockSize); - synchronized (sql) { //only for StringBuffer - appendInsertStatement(sql, table, row); - insertBuffers.put(table, sql); + String str = val.toString(); + if (val instanceof String) { + return sqlEscape(str); + } + else if (val instanceof Integer) { + return str; + } + else if (val instanceof Double) { + return str; + } + else if (val instanceof SqlLiteral) { + return str; + } + else { + throw new IllegalArgumentException("Unknown type in SQL"); + } } - } - - protected void flushInsertBuffer(String table) throws IOException { - stream.writeStatement(insertBuffers.get(table)); - insertBuffers.remove(table); - } - protected void flushInsertBuffers() throws IOException { - for (StringBuffer stringBuffer : insertBuffers.values()) { - stream.writeStatement(stringBuffer); + protected static String sqlEscape(String str) + { + if (str.length() == 0) + return "''"; // TODO "NULL",too ? + final int len = str.length(); + StringBuffer sql = new StringBuffer(len * 2); + synchronized (sql) { // only for StringBuffer + sql.append('\''); + for (int i = 0; i < len; i++) { + char c = str.charAt(i); + switch (c) { + case '\u0000': + sql.append('\\').append('0'); + break; + case '\n': + sql.append('\\').append('n'); + break; + case '\r': + sql.append('\\').append('r'); + break; + case '\u001a': + sql.append('\\').append('Z'); + break; + case '"': + case '\'': + case '\\': + sql.append('\\'); + // fall through + default: + sql.append(c); + break; + } + } + sql.append('\''); + return sql.toString(); + } } - insertBuffers.clear(); - } - - protected void insertRow(String table, Object[][] row) throws IOException { - StringBuffer sql = new StringBuffer(65536); - appendInsertStatement(sql, table, row); - stream.writeStatement(sql); - } - - private void appendInsertStatement(StringBuffer sql, String table, Object[][] row) { - sql.append("INSERT INTO "); - sql.append(tablePrefix); - sql.append(table); - sql.append(" ("); - - for (int i = 0; i < row.length; i++) { - String field = (String) row[i][0]; - if (i > 0) - sql.append(','); - sql.append(field); + + protected static String titleFormat(String title) + { + return title.replace(' ', '_'); } - sql.append(") VALUES "); - appendInsertValues(sql, row); - } - - private static void appendInsertValues(StringBuffer sql, Object[][] row) { - sql.append('('); - for (int i = 0; i < row.length; i++) { - Object val = row[i][1]; - if (i > 0) - sql.append(','); - sql.append(sqlSafe(val)); + + protected String timestampFormat(Calendar time) + { + return traits.getTimestampFormatter() + .format(new Object[] { time.get(Calendar.YEAR), time.get(Calendar.MONTH) + 1, + time.get(Calendar.DAY_OF_MONTH), time.get(Calendar.HOUR_OF_DAY), + time.get(Calendar.MINUTE), time.get(Calendar.SECOND) }); } - sql.append(')'); - } - - protected void updateRow(String table, Object[][] row, String keyField, Object keyValue) throws IOException { - StringBuffer sql = new StringBuffer(65536); - synchronized (sql) { //only for StringBuffer - sql.append("UPDATE "); - sql.append(tablePrefix); 
- sql.append(table); - sql.append(" SET "); - - for (int i = 0; i < row.length; i++) { - String field = (String) row[i][0]; - Object val = row[i][1]; - if (i > 0) - sql.append(','); - sql.append(field); - sql.append('='); - sql.append(sqlSafe(val)); - } - - sql.append(" WHERE "); - sql.append(keyField); - sql.append('='); - sql.append(sqlSafe(keyValue)); - - stream.writeStatement(sql); + + protected String inverseTimestamp(Calendar time) + { + return traits.getTimestampFormatter() + .format(new Object[] { 9999 - time.get(Calendar.YEAR), + 99 - time.get(Calendar.MONTH) - 1, 99 - time.get(Calendar.DAY_OF_MONTH), + 99 - time.get(Calendar.HOUR_OF_DAY), 99 - time.get(Calendar.MINUTE), + 99 - time.get(Calendar.SECOND) }); } - } - - protected static String sqlSafe(Object val) { - if (val == null) - return "NULL"; - - String str = val.toString(); - if (val instanceof String) { - return sqlEscape(str); - } else if (val instanceof Integer) { - return str; - } else if (val instanceof Double) { - return str; - } else if (val instanceof SqlLiteral) { - return str; - } else { - throw new IllegalArgumentException("Unknown type in SQL"); + + private static final TimeZone utc = TimeZone.getTimeZone("UTC"); + + protected static GregorianCalendar now() + { + return new GregorianCalendar(utc); } - } - - protected static String sqlEscape(String str) { - if (str.length() == 0) - return "''"; //TODO "NULL",too ? - final int len = str.length(); - StringBuffer sql = new StringBuffer(len * 2); - synchronized (sql) { //only for StringBuffer - sql.append('\''); - for (int i = 0; i < len; i++) { - char c = str.charAt(i); - switch (c) { - case '\u0000': - sql.append('\\').append('0'); - break; - case '\n': - sql.append('\\').append('n'); - break; - case '\r': - sql.append('\\').append('r'); - break; - case '\u001a': - sql.append('\\').append('Z'); - break; - case '"': - case '\'': - case '\\': - sql.append('\\'); - // fall through - default: - sql.append(c); - break; + + final int commitInterval = 1000; // Commit a transaction every n pages + int pageCount = 0; + + protected void checkpoint() throws IOException + { + pageCount++; + if (pageCount % commitInterval == 0) { + flushInsertBuffers(); + stream.writeStatement("COMMIT"); + stream.writeStatement("BEGIN"); } - } - sql.append('\''); - return sql.toString(); - } - } - - protected static String titleFormat(String title) { - return title.replace(' ', '_'); - } - - protected String timestampFormat(Calendar time) { - return traits.getTimestampFormatter().format(new Object[]{ - time.get(Calendar.YEAR), - time.get(Calendar.MONTH) + 1, - time.get(Calendar.DAY_OF_MONTH), - time.get(Calendar.HOUR_OF_DAY), - time.get(Calendar.MINUTE), - time.get(Calendar.SECOND)}); - } - - protected String inverseTimestamp(Calendar time) { - return traits.getTimestampFormatter().format(new Object[]{ - 9999 - time.get(Calendar.YEAR), - 99 - time.get(Calendar.MONTH) - 1, - 99 - time.get(Calendar.DAY_OF_MONTH), - 99 - time.get(Calendar.HOUR_OF_DAY), - 99 - time.get(Calendar.MINUTE), - 99 - time.get(Calendar.SECOND)}); - } - - private static final TimeZone utc = TimeZone.getTimeZone("UTC"); - - protected static GregorianCalendar now() { - return new GregorianCalendar(utc); - } - - final int commitInterval = 1000; // Commit a transaction every n pages - int pageCount = 0; - - protected void checkpoint() throws IOException { - pageCount++; - if (pageCount % commitInterval == 0) { - flushInsertBuffers(); - stream.writeStatement("COMMIT"); - stream.writeStatement("BEGIN"); } - } } diff --git 
a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter14.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter14.java index c854e1ce..36bc4b72 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter14.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter14.java @@ -27,68 +27,80 @@ import java.io.IOException; -public class SqlWriter14 extends SqlWriter { - private Page currentPage; - private Revision lastRevision; +public class SqlWriter14 + extends SqlWriter +{ + private Page currentPage; + private Revision lastRevision; - public SqlWriter14(SqlWriter.Traits tr, SqlStream output) { - super(tr, output); - } + public SqlWriter14(SqlWriter.Traits tr, SqlStream output) + { + super(tr, output); + } - public SqlWriter14(SqlWriter.Traits tr, SqlStream output, String prefix) { - super(tr, output, prefix); - } + public SqlWriter14(SqlWriter.Traits tr, SqlStream output, String prefix) + { + super(tr, output, prefix); + } - public void writeStartPage(Page page) { - currentPage = page; - lastRevision = null; - } + public void writeStartPage(Page page) + { + currentPage = page; + lastRevision = null; + } - public void writeEndPage() throws IOException { - if (lastRevision != null) - writeCurRevision(currentPage, lastRevision); - currentPage = null; - lastRevision = null; - } + public void writeEndPage() throws IOException + { + if (lastRevision != null) + writeCurRevision(currentPage, lastRevision); + currentPage = null; + lastRevision = null; + } - public void writeRevision(Revision revision) throws IOException { - if (lastRevision != null) - writeOldRevision(currentPage, lastRevision); - lastRevision = revision; - } + public void writeRevision(Revision revision) throws IOException + { + if (lastRevision != null) + writeOldRevision(currentPage, lastRevision); + lastRevision = revision; + } - private void writeOldRevision(Page page, Revision revision) throws IOException { - bufferInsertRow("old", new Object[][]{ - {"old_id", revision.Id}, - {"old_namespace", page.Title.Namespace}, - {"old_title", titleFormat(page.Title.Text)}, - {"old_text", revision.Text == null ? "" : revision.Text}, - {"old_comment", revision.Comment == null ? "" : revision.Comment}, - {"old_user", revision.Contributor.Username == null ? ZERO : revision.Contributor.Id}, - {"old_user_text", revision.Contributor.Username == null ? "" : revision.Contributor.Username}, - {"old_timestamp", timestampFormat(revision.Timestamp)}, - {"old_minor_edit", revision.Minor ? ONE : ZERO}, - {"old_flags", "utf-8"}, - {"inverse_timestamp", inverseTimestamp(revision.Timestamp)}}); - } + private void writeOldRevision(Page page, Revision revision) throws IOException + { + bufferInsertRow("old", new Object[][] { { "old_id", revision.Id }, + { "old_namespace", page.Title.Namespace }, + { "old_title", titleFormat(page.Title.Text) }, + { "old_text", revision.Text == null ? "" : revision.Text }, + { "old_comment", revision.Comment == null ? "" : revision.Comment }, + { "old_user", + revision.Contributor.Username == null ? ZERO : revision.Contributor.Id }, + { "old_user_text", + revision.Contributor.Username == null ? "" + : revision.Contributor.Username }, + { "old_timestamp", timestampFormat(revision.Timestamp) }, + { "old_minor_edit", revision.Minor ? 
ONE : ZERO }, { "old_flags", "utf-8" }, + { "inverse_timestamp", inverseTimestamp(revision.Timestamp) } }); + } - private void writeCurRevision(Page page, Revision revision) throws IOException { - bufferInsertRow("cur", new Object[][]{ - {"cur_id", page.Id}, - {"cur_namespace", page.Title.Namespace}, - {"cur_title", titleFormat(page.Title.Text)}, - {"cur_text", revision.Text == null ? "" : revision.Text}, - {"cur_comment", revision.Comment == null ? "" : revision.Comment}, - {"cur_user", revision.Contributor.Username == null ? ZERO : Integer.valueOf(revision.Contributor.Id)}, - {"cur_user_text", revision.Contributor.Username == null ? "" : revision.Contributor.Username}, - {"cur_timestamp", timestampFormat(revision.Timestamp)}, - {"cur_restrictions", page.Restrictions}, - {"cur_counter", ZERO}, - {"cur_is_redirect", revision.isRedirect() ? ONE : ZERO}, - {"cur_minor_edit", revision.Minor ? ONE : ZERO}, - {"cur_random", traits.getRandom()}, - {"cur_touched", traits.getCurrentTime()}, - {"inverse_timestamp", inverseTimestamp(revision.Timestamp)}}); - checkpoint(); - } + private void writeCurRevision(Page page, Revision revision) throws IOException + { + bufferInsertRow("cur", + new Object[][] { { "cur_id", page.Id }, { "cur_namespace", page.Title.Namespace }, + { "cur_title", titleFormat(page.Title.Text) }, + { "cur_text", revision.Text == null ? "" : revision.Text }, + { "cur_comment", revision.Comment == null ? "" : revision.Comment }, + { "cur_user", + revision.Contributor.Username == null ? ZERO + : Integer.valueOf(revision.Contributor.Id) }, + { "cur_user_text", + revision.Contributor.Username == null ? "" + : revision.Contributor.Username }, + { "cur_timestamp", timestampFormat(revision.Timestamp) }, + { "cur_restrictions", page.Restrictions }, { "cur_counter", ZERO }, + { "cur_is_redirect", revision.isRedirect() ? ONE : ZERO }, + { "cur_minor_edit", revision.Minor ? 
ONE : ZERO }, + { "cur_random", traits.getRandom() }, + { "cur_touched", traits.getCurrentTime() }, + { "inverse_timestamp", inverseTimestamp(revision.Timestamp) } }); + checkpoint(); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter15.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter15.java index 9e797ba1..19d8b013 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter15.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/SqlWriter15.java @@ -29,102 +29,114 @@ import java.io.IOException; -public class SqlWriter15 extends SqlWriter { - private Page currentPage; - private Revision lastRevision; - - public SqlWriter15(SqlWriter.Traits tr, SqlStream output) { - super(tr, output); - } - - public SqlWriter15(SqlWriter.Traits tr, SqlStream output, String prefix) { - super(tr, output, prefix); - } - - public void writeEndWiki() throws IOException { - flushInsertBuffers(); - super.writeEndWiki(); - } - - public void writeStartPage(Page page) { - currentPage = page; - lastRevision = null; - } - - public void writeEndPage() throws IOException { - if (lastRevision != null) { - updatePage(currentPage, lastRevision); +public class SqlWriter15 + extends SqlWriter +{ + private Page currentPage; + private Revision lastRevision; + + public SqlWriter15(SqlWriter.Traits tr, SqlStream output) + { + super(tr, output); } - currentPage = null; - lastRevision = null; - } - - static final int DELETED_TEXT = 1; - static final int DELETED_COMMENT = 2; - static final int DELETED_USER = 4; - static final int DELETED_RESTRICTED = 8; - - public void writeRevision(Revision revision) throws IOException { - bufferInsertRow(traits.getTextTable(), new Object[][]{ - {"old_id", revision.Id}, - {"old_text", revision.Text == null ? "" : revision.Text}, - {"old_flags", "utf-8"}}); - - int rev_deleted = 0; - if (revision.Contributor.Username == null) rev_deleted |= DELETED_USER; - if (revision.Comment == null) rev_deleted |= DELETED_COMMENT; - if (revision.Text == null) rev_deleted |= DELETED_TEXT; - - bufferInsertRow("revision", new Object[][]{ - {"rev_id", revision.Id}, - {"rev_page", currentPage.Id}, - {"rev_text_id", revision.Id}, - {"rev_comment", revision.Comment == null ? "" : revision.Comment}, - {"rev_user", revision.Contributor.Username == null ? ZERO : Integer.valueOf(revision.Contributor.Id)}, - {"rev_user_text", revision.Contributor.Username == null ? "" : revision.Contributor.Username}, - {"rev_timestamp", timestampFormat(revision.Timestamp)}, - {"rev_minor_edit", revision.Minor ? ONE : ZERO}, - {"rev_deleted", rev_deleted == 0 ? ZERO : Integer.valueOf(rev_deleted)}}); - - lastRevision = revision; - } - - private static int lengthUtf8(String s) { - final int slen = s.length(); - final char[] buf = Buffer.get(slen); - s.getChars(0, slen, buf, 0); - int len = 0; - for (int i = 0; i < slen; i++) { - char c = buf[i]; - if (c < 0x80) - len++; - else if (c < 0x800) - len += 2; - else if (c < 0xD800 || c >= 0xE000) - len += 3; - else { - // Surrogate pairs are assumed to be valid. 
- len += 4; - i++; - } + + public SqlWriter15(SqlWriter.Traits tr, SqlStream output, String prefix) + { + super(tr, output, prefix); + } + + public void writeEndWiki() throws IOException + { + flushInsertBuffers(); + super.writeEndWiki(); + } + + public void writeStartPage(Page page) + { + currentPage = page; + lastRevision = null; + } + + public void writeEndPage() throws IOException + { + if (lastRevision != null) { + updatePage(currentPage, lastRevision); + } + currentPage = null; + lastRevision = null; + } + + static final int DELETED_TEXT = 1; + static final int DELETED_COMMENT = 2; + static final int DELETED_USER = 4; + static final int DELETED_RESTRICTED = 8; + + public void writeRevision(Revision revision) throws IOException + { + bufferInsertRow(traits.getTextTable(), + new Object[][] { { "old_id", revision.Id }, + { "old_text", revision.Text == null ? "" : revision.Text }, + { "old_flags", "utf-8" } }); + + int rev_deleted = 0; + if (revision.Contributor.Username == null) + rev_deleted |= DELETED_USER; + if (revision.Comment == null) + rev_deleted |= DELETED_COMMENT; + if (revision.Text == null) + rev_deleted |= DELETED_TEXT; + + bufferInsertRow("revision", + new Object[][] { { "rev_id", revision.Id }, { "rev_page", currentPage.Id }, + { "rev_text_id", revision.Id }, + { "rev_comment", revision.Comment == null ? "" : revision.Comment }, + { "rev_user", + revision.Contributor.Username == null ? ZERO + : Integer.valueOf(revision.Contributor.Id) }, + { "rev_user_text", + revision.Contributor.Username == null ? "" + : revision.Contributor.Username }, + { "rev_timestamp", timestampFormat(revision.Timestamp) }, + { "rev_minor_edit", revision.Minor ? ONE : ZERO }, { "rev_deleted", + rev_deleted == 0 ? ZERO : Integer.valueOf(rev_deleted) } }); + + lastRevision = revision; + } + + private static int lengthUtf8(String s) + { + final int slen = s.length(); + final char[] buf = Buffer.get(slen); + s.getChars(0, slen, buf, 0); + int len = 0; + for (int i = 0; i < slen; i++) { + char c = buf[i]; + if (c < 0x80) + len++; + else if (c < 0x800) + len += 2; + else if (c < 0xD800 || c >= 0xE000) + len += 3; + else { + // Surrogate pairs are assumed to be valid. + len += 4; + i++; + } + } + return len; + } + + private void updatePage(Page page, Revision revision) throws IOException + { + bufferInsertRow("page", + new Object[][] { { "page_id", page.Id }, { "page_namespace", page.Title.Namespace }, + { "page_title", titleFormat(page.Title.Text) }, + { "page_restrictions", page.Restrictions }, { "page_counter", ZERO }, + { "page_is_redirect", revision.isRedirect() ? ONE : ZERO }, + { "page_is_new", ZERO }, { "page_random", traits.getRandom() }, + { "page_touched", traits.getCurrentTime() }, { "page_latest", revision.Id }, + { "page_len", lengthUtf8(revision.Text) } }); + checkpoint(); } - return len; - } - - private void updatePage(Page page, Revision revision) throws IOException { - bufferInsertRow("page", new Object[][]{ - {"page_id", page.Id}, - {"page_namespace", page.Title.Namespace}, - {"page_title", titleFormat(page.Title.Text)}, - {"page_restrictions", page.Restrictions}, - {"page_counter", ZERO}, - {"page_is_redirect", revision.isRedirect() ? 
ONE : ZERO}, - {"page_is_new", ZERO}, - {"page_random", traits.getRandom()}, - {"page_touched", traits.getCurrentTime()}, - {"page_latest", revision.Id}, - {"page_len", lengthUtf8(revision.Text)}}); - checkpoint(); - } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TimeStampFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TimeStampFilter.java index c080734f..33a60a54 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TimeStampFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TimeStampFilter.java @@ -29,50 +29,60 @@ import java.text.SimpleDateFormat; import java.util.Calendar; -public class TimeStampFilter implements DumpWriter { - final DumpWriter sink; - protected final Calendar filterTimeStamp; - protected Page currentPage; - protected boolean pageWritten; +public class TimeStampFilter + implements DumpWriter +{ + final DumpWriter sink; + protected final Calendar filterTimeStamp; + protected Page currentPage; + protected boolean pageWritten; - public TimeStampFilter(DumpWriter sink, String timeStamp) throws ParseException { - this.sink = sink; - filterTimeStamp = Calendar.getInstance(); - filterTimeStamp.setTime(new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'").parse(timeStamp)); - } + public TimeStampFilter(DumpWriter sink, String timeStamp) throws ParseException + { + this.sink = sink; + filterTimeStamp = Calendar.getInstance(); + filterTimeStamp.setTime(new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'").parse(timeStamp)); + } - public void close() throws IOException { - sink.close(); - } + public void close() throws IOException + { + sink.close(); + } - public void writeStartWiki() throws IOException { - sink.writeStartWiki(); - } + public void writeStartWiki() throws IOException + { + sink.writeStartWiki(); + } - public void writeEndWiki() throws IOException { - sink.writeEndWiki(); - } + public void writeEndWiki() throws IOException + { + sink.writeEndWiki(); + } - public void writeSiteinfo(Siteinfo info) throws IOException { - sink.writeSiteinfo(info); - } + public void writeSiteinfo(Siteinfo info) throws IOException + { + sink.writeSiteinfo(info); + } - public void writeStartPage(Page page) throws IOException { - currentPage = page; - pageWritten = false; - } + public void writeStartPage(Page page) throws IOException + { + currentPage = page; + pageWritten = false; + } - public void writeEndPage() throws IOException { - if (pageWritten) { - sink.writeEndPage(); + public void writeEndPage() throws IOException + { + if (pageWritten) { + sink.writeEndPage(); + } } - } - public void writeRevision(Revision revision) throws IOException { - if (!pageWritten) { - sink.writeStartPage(currentPage); - pageWritten = true; + public void writeRevision(Revision revision) throws IOException + { + if (!pageWritten) { + sink.writeStartPage(currentPage); + pageWritten = true; + } + sink.writeRevision(revision); } - sink.writeRevision(revision); - } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Title.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Title.java index 9caf0c6d..ebf12718 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Title.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/Title.java @@ -25,81 +25,91 @@ package org.dkpro.jwpl.mwdumper.importer; -public class Title { - public final Integer Namespace; - public final String Text; +public class 
Title +{ + public final Integer Namespace; + public final String Text; - private final NamespaceSet namespaces; + private final NamespaceSet namespaces; - public Title(Integer namespaceKey, String text, NamespaceSet namespaces) { - this.namespaces = namespaces; - Namespace = namespaceKey; - Text = text; - } + public Title(Integer namespaceKey, String text, NamespaceSet namespaces) + { + this.namespaces = namespaces; + Namespace = namespaceKey; + Text = text; + } - public Title(String prefixedTitle, NamespaceSet namespaces) { - this.namespaces = namespaces; - int colon = prefixedTitle.indexOf(':'); - if (colon > 0) { - String prefix = prefixedTitle.substring(0, colon); - if (namespaces.hasPrefix(prefix)) { - Namespace = namespaces.getIndex(prefix); - Text = prefixedTitle.substring(colon + 1); - return; - } + public Title(String prefixedTitle, NamespaceSet namespaces) + { + this.namespaces = namespaces; + int colon = prefixedTitle.indexOf(':'); + if (colon > 0) { + String prefix = prefixedTitle.substring(0, colon); + if (namespaces.hasPrefix(prefix)) { + Namespace = namespaces.getIndex(prefix); + Text = prefixedTitle.substring(colon + 1); + return; + } + } + Namespace = 0; + Text = prefixedTitle; } - Namespace = 0; - Text = prefixedTitle; - } - public static String ValidateTitleChars(String text) { - // FIXME - return text; - } + public static String ValidateTitleChars(String text) + { + // FIXME + return text; + } - public String toString() { - String prefix = namespaces.getPrefix(Namespace); - if (Namespace == 0) - return prefix.concat(Text); - return prefix + ':' + Text; - } + public String toString() + { + String prefix = namespaces.getPrefix(Namespace); + if (Namespace == 0) + return prefix.concat(Text); + return prefix + ':' + Text; + } - public boolean isSpecial() { - return Namespace < 0; - } + public boolean isSpecial() + { + return Namespace < 0; + } - public boolean isTalk() { - return !isSpecial() && (Namespace.intValue() % 2 == 1); - } + public boolean isTalk() + { + return !isSpecial() && (Namespace.intValue() % 2 == 1); + } - public Title talkPage() { - if (isTalk()) - return this; - else if (isSpecial()) - return null; - else - return new Title(Namespace + 1, Text, namespaces); - } + public Title talkPage() + { + if (isTalk()) + return this; + else if (isSpecial()) + return null; + else + return new Title(Namespace + 1, Text, namespaces); + } - public Title subjectPage() { - if (isTalk()) - return new Title(Namespace - 1, Text, namespaces); - else - return this; - } + public Title subjectPage() + { + if (isTalk()) + return new Title(Namespace - 1, Text, namespaces); + else + return this; + } - public int hashCode() { - return Namespace.hashCode() ^ Text.hashCode(); - } + public int hashCode() + { + return Namespace.hashCode() ^ Text.hashCode(); + } - public boolean equals(Object other) { - if (other == this) - return true; - if (other instanceof Title) { - Title ot = (Title) other; - return Namespace.equals(ot.Namespace) && - Text.equals(ot.Text); + public boolean equals(Object other) + { + if (other == this) + return true; + if (other instanceof Title) { + Title ot = (Title) other; + return Namespace.equals(ot.Namespace) && Text.equals(ot.Text); + } + return false; } - return false; - } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TitleMatchFilter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TitleMatchFilter.java index 65f24b02..95b9efdf 100644 --- 
a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TitleMatchFilter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/TitleMatchFilter.java @@ -27,15 +27,19 @@ import java.util.regex.Pattern; -public class TitleMatchFilter extends PageFilter { - final Pattern regex; +public class TitleMatchFilter + extends PageFilter +{ + final Pattern regex; - public TitleMatchFilter(DumpWriter sink, String regexString) { - super(sink); - regex = Pattern.compile(regexString); - } + public TitleMatchFilter(DumpWriter sink, String regexString) + { + super(sink); + regex = Pattern.compile(regexString); + } - protected boolean pass(Page page) { - return regex.matcher(page.Title.toString()).matches(); - } + protected boolean pass(Page page) + { + return regex.matcher(page.Title.toString()).matches(); + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpReader.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpReader.java index 62dfa8bf..b7c85df3 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpReader.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpReader.java @@ -42,368 +42,444 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -public class XmlDumpReader extends DefaultHandler { - InputStream input; - DumpWriter writer; - - private char[] buffer; - private int len; - private boolean hasContent; - private boolean deleted = false; - - Siteinfo siteinfo; - Page page; - boolean pageSent; - Contributor contrib; - Revision rev; - int nskey; - - boolean abortFlag; - - /** - * Initialize a processor for a MediaWiki XML dump stream. - * Events are sent to a single DumpWriter output sink, but you - * can chain multiple output processors with a MultiWriter. - * - * @param inputStream Stream to read XML from. - * @param writer Output sink to send processed events to. - */ - public XmlDumpReader(InputStream inputStream, DumpWriter writer) { - input = inputStream; - this.writer = writer; - buffer = new char[4096]; - len = 0; - hasContent = false; - } - - /** - * Reads through the entire XML dump on the input stream, sending - * events to the DumpWriter as it goes. May throw exceptions on - * invalid input or due to problems with the output. - * - * @throws IOException - */ - public void readDump() throws IOException { - try { - SAXParserFactory factory = SAXParserFactory.newInstance(); - factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, false); - SAXParser parser = factory.newSAXParser(); - - parser.parse(input, this); - } catch (ParserConfigurationException | SAXException e) { - throw (IOException) new IOException(e.getMessage()).initCause(e); - } - writer.close(); - } - - /** - * Request that the dump processing be aborted. - * At the next element, an exception will be thrown to stop the XML parser. - * FIXME Is setting a bool thread-safe? It should be atomic... 
- */ - public void abort() { - abortFlag = true; - } - - // -------------------------- - // SAX handler interface methods: - - private static final Map<String, String> startElements = new HashMap<>(64); - private static final Map<String, String> endElements = new HashMap<>(64); - - static { - startElements.put("revision", "revision"); - startElements.put("contributor", "contributor"); - startElements.put("page", "page"); - startElements.put("mediawiki", "mediawiki"); - startElements.put("siteinfo", "siteinfo"); - startElements.put("namespaces", "namespaces"); - startElements.put("namespace", "namespace"); - - endElements.put("ThreadSubject", "ThreadSubject"); - endElements.put("ThreadParent", "ThreadParent"); - endElements.put("ThreadAncestor", "ThreadAncestor"); - endElements.put("ThreadPage", "ThreadPage"); - endElements.put("ThreadID", "ThreadID"); - endElements.put("ThreadSummaryPage", "ThreadSummaryPage"); - endElements.put("ThreadAuthor", "ThreadAuthor"); - endElements.put("ThreadEditStatus", "ThreadEditStatus"); - endElements.put("ThreadType", "ThreadType"); - endElements.put("base", "base"); - endElements.put("case", "case"); - endElements.put("comment", "comment"); - endElements.put("contributor", "contributor"); - endElements.put("generator", "generator"); - endElements.put("id", "id"); - endElements.put("ip", "ip"); - endElements.put("mediawiki", "mediawiki"); - endElements.put("minor", "minor"); - endElements.put("namespaces", "namespaces"); - endElements.put("namespace", "namespace"); - endElements.put("page", "page"); - endElements.put("restrictions", "restrictions"); - endElements.put("revision", "revision"); - endElements.put("siteinfo", "siteinfo"); - endElements.put("sitename", "sitename"); - endElements.put("text", "text"); - endElements.put("timestamp", "timestamp"); - endElements.put("title", "title"); - endElements.put("username", "username"); - } - - public void startElement(String uri, String localname, String qName, Attributes attributes) throws SAXException { - // Clear the buffer for character data; we'll initialize it - // if and when character data arrives -- at that point we - // have a length. - len = 0; - hasContent = false; - - if (abortFlag) - throw new SAXException("XmlDumpReader set abort flag."); - - // check for deleted="deleted", and set deleted flag for the current element. 
- String d = attributes.getValue("deleted"); - deleted = (d != null && d.equals("deleted")); - - try { - qName = startElements.get(qName); - if (qName == null) - return; - // frequent tags: - if (qName == "revision") openRevision(); - else if (qName == "contributor") openContributor(); - else if (qName == "page") openPage(); - // rare tags: - else if (qName == "mediawiki") openMediaWiki(); - else if (qName == "siteinfo") openSiteinfo(); - else if (qName == "namespaces") openNamespaces(); - else if (qName == "namespace") openNamespace(attributes); - } catch (IOException e) { - throw new SAXException(e); - } - } - - public void characters(char[] ch, int start, int length) { - if (buffer.length < len + length) { - int maxlen = buffer.length * 2; - if (maxlen < len + length) - maxlen = len + length; - char[] tmp = new char[maxlen]; - System.arraycopy(buffer, 0, tmp, 0, len); - buffer = tmp; - } - System.arraycopy(ch, start, buffer, len, length); - len += length; - hasContent = true; - } - - public void endElement(String uri, String localname, String qName) throws SAXException { - try { - qName = endElements.get(qName); - if (qName == null) - return; - // frequent tags: - if (qName == "id") readId(); - else if (qName == "revision") closeRevision(); - else if (qName == "timestamp") readTimestamp(); - else if (qName == "text") readText(); - else if (qName == "contributor") closeContributor(); - else if (qName == "username") readUsername(); - else if (qName == "ip") readIp(); - else if (qName == "comment") readComment(); - else if (qName == "minor") readMinor(); - else if (qName == "page") closePage(); - else if (qName == "title") readTitle(); - else if (qName == "restrictions") readRestrictions(); - // rare tags: - else if (qName.startsWith("Thread")) threadAttribute(qName); - else if (qName == "mediawiki") closeMediaWiki(); - else if (qName == "siteinfo") closeSiteinfo(); - else if (qName == "sitename") readSitename(); - else if (qName == "base") readBase(); - else if (qName == "generator") readGenerator(); - else if (qName == "case") readCase(); - else if (qName == "namespaces") closeNamespaces(); - else if (qName == "namespace") closeNamespace(); -// else throw(SAXException)new SAXException("Unrecognised "+qName+"(substring "+qName.length()+qName.substring(0,6)+")"); - } catch (IOException e) { - throw (SAXException) new SAXException(e.getMessage()).initCause(e); - } - } - - // ---------- - - void threadAttribute(String attrib) throws IOException { - if (attrib.equals("ThreadPage")) // parse title - page.DiscussionThreadingInfo.put(attrib, new Title(bufferContents(), siteinfo.Namespaces)); - else - page.DiscussionThreadingInfo.put(attrib, bufferContents()); - } - - void openMediaWiki() throws IOException { - siteinfo = null; - writer.writeStartWiki(); - } - - void closeMediaWiki() throws IOException { - writer.writeEndWiki(); - siteinfo = null; - } - - // ------------------ - - void openSiteinfo() { - siteinfo = new Siteinfo(); - } - - void closeSiteinfo() throws IOException { - writer.writeSiteinfo(siteinfo); - } - - private String bufferContentsOrNull() { - if (!hasContent) return null; - else return bufferContents(); - } - - private String bufferContents() { - return len == 0 ? 
"" : new String(buffer, 0, len); - } - - void readSitename() { - siteinfo.Sitename = bufferContents(); - } - - void readBase() { - siteinfo.Base = bufferContents(); - } - - void readGenerator() { - siteinfo.Generator = bufferContents(); - } - - void readCase() { - siteinfo.Case = bufferContents(); - } - - void openNamespaces() { - siteinfo.Namespaces = new NamespaceSet(); - } - - void openNamespace(Attributes attribs) { - nskey = Integer.parseInt(attribs.getValue("key")); - } - - void closeNamespace() { - siteinfo.Namespaces.add(nskey, bufferContents()); - } - - void closeNamespaces() { - // NOP - } - - // ----------- - - void openPage() { - page = new Page(); - pageSent = false; - } - - void closePage() throws IOException { - if (pageSent) - writer.writeEndPage(); - page = null; - } - - void readTitle() { - page.Title = new Title(bufferContents(), siteinfo.Namespaces); - } - - void readId() { - int id = Integer.parseInt(bufferContents()); - if (contrib != null) - contrib.Id = id; - else if (rev != null) - rev.Id = id; - else if (page != null) - page.Id = id; - else - throw new IllegalArgumentException("Unexpected <id> outside a <page>, <revision>, or <contributor>"); - } - - void readRestrictions() { - page.Restrictions = bufferContents(); - } - - // ------ - - void openRevision() throws IOException { - if (!pageSent) { - writer.writeStartPage(page); - pageSent = true; - } - - rev = new Revision(); - } - - void closeRevision() throws IOException { - writer.writeRevision(rev); - rev = null; - } - - void readTimestamp() { - rev.Timestamp = parseUTCTimestamp(bufferContents()); - } - - void readComment() { - rev.Comment = bufferContentsOrNull(); - if (rev.Comment == null && !deleted) rev.Comment = ""; //NOTE: null means deleted/supressed - } - - void readMinor() { - rev.Minor = true; - } - - void readText() { - rev.Text = bufferContentsOrNull(); - if (rev.Text == null && !deleted) rev.Text = ""; //NOTE: null means deleted/supressed - } - - // ----------- - void openContributor() { - //XXX: record deleted flag?! as it is, any empty <contributor> tag counts as "deleted" - contrib = new Contributor(); - } - - void closeContributor() { - //NOTE: if the contributor was supressed, nither username nor id have been set in the Contributor object - rev.Contributor = contrib; - contrib = null; - } - - - void readUsername() { - contrib.Username = bufferContentsOrNull(); - } - - void readIp() { - contrib.Username = bufferContents(); - contrib.isIP = true; - } - - private static final TimeZone utc = TimeZone.getTimeZone("UTC"); - - private static Calendar parseUTCTimestamp(String text) { - // 2003-10-26T04:50:47Z - // We're doing this manually for now, though DateFormatter might work... - String trimmed = text.trim(); - GregorianCalendar ts = new GregorianCalendar(utc); - ts.set( - Integer.parseInt(trimmed.substring(0, 0 + 4)), // year - Integer.parseInt(trimmed.substring(5, 5 + 2)) - 1, // month is 0-based! 
- Integer.parseInt(trimmed.substring(8, 8 + 2)), // day - Integer.parseInt(trimmed.substring(11, 11 + 2)), // hour - Integer.parseInt(trimmed.substring(14, 14 + 2)), // minute - Integer.parseInt(trimmed.substring(17, 17 + 2))); // second - return ts; - } +public class XmlDumpReader + extends DefaultHandler +{ + InputStream input; + DumpWriter writer; + + private char[] buffer; + private int len; + private boolean hasContent; + private boolean deleted = false; + + Siteinfo siteinfo; + Page page; + boolean pageSent; + Contributor contrib; + Revision rev; + int nskey; + + boolean abortFlag; + + /** + * Initialize a processor for a MediaWiki XML dump stream. Events are sent to a single + * DumpWriter output sink, but you can chain multiple output processors with a MultiWriter. + * + * @param inputStream + * Stream to read XML from. + * @param writer + * Output sink to send processed events to. + */ + public XmlDumpReader(InputStream inputStream, DumpWriter writer) + { + input = inputStream; + this.writer = writer; + buffer = new char[4096]; + len = 0; + hasContent = false; + } + + /** + * Reads through the entire XML dump on the input stream, sending events to the DumpWriter as it + * goes. May throw exceptions on invalid input or due to problems with the output. + * + * @throws IOException + */ + public void readDump() throws IOException + { + try { + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, false); + SAXParser parser = factory.newSAXParser(); + + parser.parse(input, this); + } + catch (ParserConfigurationException | SAXException e) { + throw (IOException) new IOException(e.getMessage()).initCause(e); + } + writer.close(); + } + + /** + * Request that the dump processing be aborted. At the next element, an exception will be thrown + * to stop the XML parser. FIXME Is setting a bool thread-safe? It should be atomic... 
+ */ + public void abort() + { + abortFlag = true; + } + + // -------------------------- + // SAX handler interface methods: + + private static final Map<String, String> startElements = new HashMap<>(64); + private static final Map<String, String> endElements = new HashMap<>(64); + + static { + startElements.put("revision", "revision"); + startElements.put("contributor", "contributor"); + startElements.put("page", "page"); + startElements.put("mediawiki", "mediawiki"); + startElements.put("siteinfo", "siteinfo"); + startElements.put("namespaces", "namespaces"); + startElements.put("namespace", "namespace"); + + endElements.put("ThreadSubject", "ThreadSubject"); + endElements.put("ThreadParent", "ThreadParent"); + endElements.put("ThreadAncestor", "ThreadAncestor"); + endElements.put("ThreadPage", "ThreadPage"); + endElements.put("ThreadID", "ThreadID"); + endElements.put("ThreadSummaryPage", "ThreadSummaryPage"); + endElements.put("ThreadAuthor", "ThreadAuthor"); + endElements.put("ThreadEditStatus", "ThreadEditStatus"); + endElements.put("ThreadType", "ThreadType"); + endElements.put("base", "base"); + endElements.put("case", "case"); + endElements.put("comment", "comment"); + endElements.put("contributor", "contributor"); + endElements.put("generator", "generator"); + endElements.put("id", "id"); + endElements.put("ip", "ip"); + endElements.put("mediawiki", "mediawiki"); + endElements.put("minor", "minor"); + endElements.put("namespaces", "namespaces"); + endElements.put("namespace", "namespace"); + endElements.put("page", "page"); + endElements.put("restrictions", "restrictions"); + endElements.put("revision", "revision"); + endElements.put("siteinfo", "siteinfo"); + endElements.put("sitename", "sitename"); + endElements.put("text", "text"); + endElements.put("timestamp", "timestamp"); + endElements.put("title", "title"); + endElements.put("username", "username"); + } + + public void startElement(String uri, String localname, String qName, Attributes attributes) + throws SAXException + { + // Clear the buffer for character data; we'll initialize it + // if and when character data arrives -- at that point we + // have a length. + len = 0; + hasContent = false; + + if (abortFlag) + throw new SAXException("XmlDumpReader set abort flag."); + + // check for deleted="deleted", and set deleted flag for the current element. 
+ String d = attributes.getValue("deleted"); + deleted = (d != null && d.equals("deleted")); + + try { + qName = startElements.get(qName); + if (qName == null) + return; + // frequent tags: + if (qName == "revision") + openRevision(); + else if (qName == "contributor") + openContributor(); + else if (qName == "page") + openPage(); + // rare tags: + else if (qName == "mediawiki") + openMediaWiki(); + else if (qName == "siteinfo") + openSiteinfo(); + else if (qName == "namespaces") + openNamespaces(); + else if (qName == "namespace") + openNamespace(attributes); + } + catch (IOException e) { + throw new SAXException(e); + } + } + + public void characters(char[] ch, int start, int length) + { + if (buffer.length < len + length) { + int maxlen = buffer.length * 2; + if (maxlen < len + length) + maxlen = len + length; + char[] tmp = new char[maxlen]; + System.arraycopy(buffer, 0, tmp, 0, len); + buffer = tmp; + } + System.arraycopy(ch, start, buffer, len, length); + len += length; + hasContent = true; + } + + public void endElement(String uri, String localname, String qName) throws SAXException + { + try { + qName = endElements.get(qName); + if (qName == null) + return; + // frequent tags: + if (qName == "id") + readId(); + else if (qName == "revision") + closeRevision(); + else if (qName == "timestamp") + readTimestamp(); + else if (qName == "text") + readText(); + else if (qName == "contributor") + closeContributor(); + else if (qName == "username") + readUsername(); + else if (qName == "ip") + readIp(); + else if (qName == "comment") + readComment(); + else if (qName == "minor") + readMinor(); + else if (qName == "page") + closePage(); + else if (qName == "title") + readTitle(); + else if (qName == "restrictions") + readRestrictions(); + // rare tags: + else if (qName.startsWith("Thread")) + threadAttribute(qName); + else if (qName == "mediawiki") + closeMediaWiki(); + else if (qName == "siteinfo") + closeSiteinfo(); + else if (qName == "sitename") + readSitename(); + else if (qName == "base") + readBase(); + else if (qName == "generator") + readGenerator(); + else if (qName == "case") + readCase(); + else if (qName == "namespaces") + closeNamespaces(); + else if (qName == "namespace") + closeNamespace(); + // else throw(SAXException)new SAXException("Unrecognised "+qName+"(substring + // "+qName.length()+qName.substring(0,6)+")"); + } + catch (IOException e) { + throw (SAXException) new SAXException(e.getMessage()).initCause(e); + } + } + + // ---------- + + void threadAttribute(String attrib) throws IOException + { + if (attrib.equals("ThreadPage")) // parse title + page.DiscussionThreadingInfo.put(attrib, + new Title(bufferContents(), siteinfo.Namespaces)); + else + page.DiscussionThreadingInfo.put(attrib, bufferContents()); + } + + void openMediaWiki() throws IOException + { + siteinfo = null; + writer.writeStartWiki(); + } + + void closeMediaWiki() throws IOException + { + writer.writeEndWiki(); + siteinfo = null; + } + + // ------------------ + + void openSiteinfo() + { + siteinfo = new Siteinfo(); + } + + void closeSiteinfo() throws IOException + { + writer.writeSiteinfo(siteinfo); + } + + private String bufferContentsOrNull() + { + if (!hasContent) + return null; + else + return bufferContents(); + } + + private String bufferContents() + { + return len == 0 ? 
"" : new String(buffer, 0, len); + } + + void readSitename() + { + siteinfo.Sitename = bufferContents(); + } + + void readBase() + { + siteinfo.Base = bufferContents(); + } + + void readGenerator() + { + siteinfo.Generator = bufferContents(); + } + + void readCase() + { + siteinfo.Case = bufferContents(); + } + + void openNamespaces() + { + siteinfo.Namespaces = new NamespaceSet(); + } + + void openNamespace(Attributes attribs) + { + nskey = Integer.parseInt(attribs.getValue("key")); + } + + void closeNamespace() + { + siteinfo.Namespaces.add(nskey, bufferContents()); + } + + void closeNamespaces() + { + // NOP + } + + // ----------- + + void openPage() + { + page = new Page(); + pageSent = false; + } + + void closePage() throws IOException + { + if (pageSent) + writer.writeEndPage(); + page = null; + } + + void readTitle() + { + page.Title = new Title(bufferContents(), siteinfo.Namespaces); + } + + void readId() + { + int id = Integer.parseInt(bufferContents()); + if (contrib != null) + contrib.Id = id; + else if (rev != null) + rev.Id = id; + else if (page != null) + page.Id = id; + else + throw new IllegalArgumentException( + "Unexpected <id> outside a <page>, <revision>, or <contributor>"); + } + + void readRestrictions() + { + page.Restrictions = bufferContents(); + } + + // ------ + + void openRevision() throws IOException + { + if (!pageSent) { + writer.writeStartPage(page); + pageSent = true; + } + + rev = new Revision(); + } + + void closeRevision() throws IOException + { + writer.writeRevision(rev); + rev = null; + } + + void readTimestamp() + { + rev.Timestamp = parseUTCTimestamp(bufferContents()); + } + + void readComment() + { + rev.Comment = bufferContentsOrNull(); + if (rev.Comment == null && !deleted) + rev.Comment = ""; // NOTE: null means deleted/supressed + } + + void readMinor() + { + rev.Minor = true; + } + + void readText() + { + rev.Text = bufferContentsOrNull(); + if (rev.Text == null && !deleted) + rev.Text = ""; // NOTE: null means deleted/supressed + } + + // ----------- + void openContributor() + { + // XXX: record deleted flag?! as it is, any empty <contributor> tag counts as "deleted" + contrib = new Contributor(); + } + + void closeContributor() + { + // NOTE: if the contributor was supressed, nither username nor id have been set in the + // Contributor object + rev.Contributor = contrib; + contrib = null; + } + + void readUsername() + { + contrib.Username = bufferContentsOrNull(); + } + + void readIp() + { + contrib.Username = bufferContents(); + contrib.isIP = true; + } + + private static final TimeZone utc = TimeZone.getTimeZone("UTC"); + + private static Calendar parseUTCTimestamp(String text) + { + // 2003-10-26T04:50:47Z + // We're doing this manually for now, though DateFormatter might work... + String trimmed = text.trim(); + GregorianCalendar ts = new GregorianCalendar(utc); + ts.set(Integer.parseInt(trimmed.substring(0, 0 + 4)), // year + Integer.parseInt(trimmed.substring(5, 5 + 2)) - 1, // month is 0-based! 
+ Integer.parseInt(trimmed.substring(8, 8 + 2)), // day + Integer.parseInt(trimmed.substring(11, 11 + 2)), // hour + Integer.parseInt(trimmed.substring(14, 14 + 2)), // minute + Integer.parseInt(trimmed.substring(17, 17 + 2))); // second + return ts; + } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpWriter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpWriter.java index af1d7e74..b005d282 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpWriter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlDumpWriter.java @@ -34,125 +34,142 @@ import java.util.Map; import java.util.TimeZone; -public class XmlDumpWriter implements DumpWriter { - protected OutputStream stream; - protected XmlWriter writer; - - protected static final String version = "0.3"; - protected static final String ns = "http://www.mediawiki.org/xml/export-" + version + "/"; - protected static final String schema = "http://www.mediawiki.org/xml/export-" + version + ".xsd"; - protected static final DateFormat dateFormat = new SimpleDateFormat("yyyy'-'MM'-'dd'T'HH':'mm':'ss'Z'"); - - static { - dateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); - } - - public XmlDumpWriter(OutputStream output) { - stream = output; - writer = new XmlWriter(stream); - } - - public void close() throws IOException { - writer.close(); - } - - public void writeStartWiki() throws IOException { - writer.openXml(); - writer.openElement("mediawiki", new String[][]{ - {"xmlns", ns}, - {"xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance"}, - {"xsi:schemaLocation", ns + " " + schema}, - {"version", version}, - {"xml:lang", "en"}}); - // TODO: store and keep the xml:lang - } - - public void writeEndWiki() throws IOException { - writer.closeElement(); - writer.closeXml(); - } - - public void writeSiteinfo(Siteinfo info) throws IOException { - XmlWriter writer = this.writer; - writer.openElement("siteinfo"); - writer.textElement("sitename", info.Sitename); - writer.textElement("base", info.Base); - writer.textElement("generator", info.Generator); - writer.textElement("case", info.Case); - - writer.openElement("namespaces"); - for (Iterator<Map.Entry<Integer, String>> i = info.Namespaces.orderedEntries(); i.hasNext(); ) { - Map.Entry<Integer, String> e = i.next(); - writer.textElement("namespace", e.getValue(), new String[][]{ - {"key", e.getKey().toString()}}); +public class XmlDumpWriter + implements DumpWriter +{ + protected OutputStream stream; + protected XmlWriter writer; + + protected static final String version = "0.3"; + protected static final String ns = "http://www.mediawiki.org/xml/export-" + version + "/"; + protected static final String schema = "http://www.mediawiki.org/xml/export-" + version + + ".xsd"; + protected static final DateFormat dateFormat = new SimpleDateFormat( + "yyyy'-'MM'-'dd'T'HH':'mm':'ss'Z'"); + + static { + dateFormat.setTimeZone(TimeZone.getTimeZone("UTC")); } - writer.closeElement(); - writer.closeElement(); - } + public XmlDumpWriter(OutputStream output) + { + stream = output; + writer = new XmlWriter(stream); + } - public void writeStartPage(Page page) throws IOException { - XmlWriter writer = this.writer; - writer.openElement("page"); - writer.textElement("title", page.Title.toString()); - if (page.Id != 0) - writer.textElement("id", Integer.toString(page.Id)); - if (page.Restrictions != null && page.Restrictions.length() != 0) - writer.textElement("restrictions", 
page.Restrictions); - } + public void close() throws IOException + { + writer.close(); + } - public void writeEndPage() throws IOException { - writer.closeElement(); - } + public void writeStartWiki() throws IOException + { + writer.openXml(); + writer.openElement("mediawiki", + new String[][] { { "xmlns", ns }, + { "xmlns:xsi", "http://www.w3.org/2001/XMLSchema-instance" }, + { "xsi:schemaLocation", ns + " " + schema }, { "version", version }, + { "xml:lang", "en" } }); + // TODO: store and keep the xml:lang + } - public void writeRevision(Revision rev) throws IOException { - XmlWriter writer = this.writer; - writer.openElement("revision"); - if (rev.Id != 0) - writer.textElement("id", Integer.toString(rev.Id)); + public void writeEndWiki() throws IOException + { + writer.closeElement(); + writer.closeXml(); + } - writer.textElement("timestamp", formatTimestamp(rev.Timestamp)); + public void writeSiteinfo(Siteinfo info) throws IOException + { + XmlWriter writer = this.writer; + writer.openElement("siteinfo"); + writer.textElement("sitename", info.Sitename); + writer.textElement("base", info.Base); + writer.textElement("generator", info.Generator); + writer.textElement("case", info.Case); + + writer.openElement("namespaces"); + for (Iterator<Map.Entry<Integer, String>> i = info.Namespaces.orderedEntries(); i + .hasNext();) { + Map.Entry<Integer, String> e = i.next(); + writer.textElement("namespace", e.getValue(), + new String[][] { { "key", e.getKey().toString() } }); + } + writer.closeElement(); + + writer.closeElement(); + } - writeContributor(rev.Contributor); + public void writeStartPage(Page page) throws IOException + { + XmlWriter writer = this.writer; + writer.openElement("page"); + writer.textElement("title", page.Title.toString()); + if (page.Id != 0) + writer.textElement("id", Integer.toString(page.Id)); + if (page.Restrictions != null && page.Restrictions.length() != 0) + writer.textElement("restrictions", page.Restrictions); + } - if (rev.Minor) { - writer.emptyElement("minor"); + public void writeEndPage() throws IOException + { + writer.closeElement(); } - if (rev.Comment == null) { - writer.emptyElement("comment", deletedAttrib); - } else if (rev.Comment.length() != 0) { - writer.textElement("comment", rev.Comment); + public void writeRevision(Revision rev) throws IOException + { + XmlWriter writer = this.writer; + writer.openElement("revision"); + if (rev.Id != 0) + writer.textElement("id", Integer.toString(rev.Id)); + + writer.textElement("timestamp", formatTimestamp(rev.Timestamp)); + + writeContributor(rev.Contributor); + + if (rev.Minor) { + writer.emptyElement("minor"); + } + + if (rev.Comment == null) { + writer.emptyElement("comment", deletedAttrib); + } + else if (rev.Comment.length() != 0) { + writer.textElement("comment", rev.Comment); + } + + writer.textElement("text", rev.Text, + rev.Text == null + ? new String[][] { { "xml:space", "preserve" }, { "deleted", "deleted" } } + : new String[][] { { "xml:space", "preserve" } }); + + writer.closeElement(); + } + + static final String[][] deletedAttrib = new String[][] { { "deleted", "deleted" } }; + + static String formatTimestamp(Calendar ts) + { + return dateFormat.format(ts.getTime()); } - writer.textElement("text", rev.Text, - rev.Text == null ? 
new String[][]{{"xml:space", "preserve"}, {"deleted", "deleted"}} - : new String[][]{{"xml:space", "preserve"}} - ); - - writer.closeElement(); - } - - static final String[][] deletedAttrib = new String[][]{{"deleted", "deleted"}}; - - static String formatTimestamp(Calendar ts) { - return dateFormat.format(ts.getTime()); - } - - void writeContributor(Contributor contrib) throws IOException { - XmlWriter writer = this.writer; - - if (contrib.Username == null) { - writer.emptyElement("contributor", deletedAttrib); - } else { - writer.openElement("contributor"); - if (contrib.isIP) { - writer.textElement("ip", contrib.Username); - } else { - writer.textElement("username", contrib.Username); - writer.textElement("id", Integer.toString(contrib.Id)); - } - writer.closeElement(); + void writeContributor(Contributor contrib) throws IOException + { + XmlWriter writer = this.writer; + + if (contrib.Username == null) { + writer.emptyElement("contributor", deletedAttrib); + } + else { + writer.openElement("contributor"); + if (contrib.isIP) { + writer.textElement("ip", contrib.Username); + } + else { + writer.textElement("username", contrib.Username); + writer.textElement("id", Integer.toString(contrib.Id)); + } + writer.closeElement(); + } } - } } diff --git a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlWriter.java b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlWriter.java index 765c6c93..06f61717 100644 --- a/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlWriter.java +++ b/dkpro-jwpl-mwdumper/src/main/java/org/dkpro/jwpl/mwdumper/importer/XmlWriter.java @@ -26,183 +26,208 @@ import java.util.List; /** - * Quickie little class for sending properly encoded, prettily - * indented XML output to a stream. There is no namespace support, - * so prefixes and xmlns attributes must be managed manually. + * Quickie little class for sending properly encoded, prettily indented XML output to a stream. + * There is no namespace support, so prefixes and xmlns attributes must be managed manually. */ -public class XmlWriter { - private final String encoding; - private final List<String> stack; - private final BufferedWriter writer; - - public XmlWriter(OutputStream stream) { - encoding = "utf-8"; - stack = new ArrayList<>(); - writer = new BufferedWriter(new OutputStreamWriter(stream, StandardCharsets.UTF_8)); - } - - /** - * @throws IOException Thrown if IO errors occurred. - */ - public void close() throws IOException { - writer.flush(); - writer.close(); - } - - /** - * Write the <?xml?> header. - * - * @throws IOException Thrown if IO errors occurred. - */ - public void openXml() throws IOException { - writeRaw("<?xml version=\"1.0\" encoding=\"" + encoding + "\" ?>\n"); - } - - /** - * In theory, we might close out open elements or such. - */ - public void closeXml() { - } - - - /** - * Write an empty element, such as <el/>, on a standalone line. - * Takes an optional dictionary of attributes. - * - * @throws IOException Thrown if IO errors occurred. - */ - public void emptyElement(String element) throws IOException { - emptyElement(element, null); - } - - public void emptyElement(String element, String[][] attributes) throws IOException { - startElement(element, attributes, "/>\n"); - deIndent(); - } - - /** - * Write an element open tag, such as <el/>, on a standalone line. - * Takes an optional dictionary of attributes. - * - * @throws IOException Thrown if IO errors occurred. 
- */ - public void openElement(String element) throws IOException { - openElement(element, null); - } - - public void openElement(String element, String[][] attributes) throws IOException { - startElement(element, attributes, ">\n"); - } - - /** - * Write an element close tag, such as <el/>, on a standalone line. - * If indent=False is passed, indentation will not be added. - * - * @throws IOException Thrown if IO errors occurred. - */ - public void closeElement() throws IOException { - closeElement(true); - } - - public void closeElement(boolean indent) throws IOException { - String[] bits = deIndent(); - String element = bits[0]; - String space = bits[1]; - if (indent) - writeRaw(space + "</" + element + ">\n"); - else - writeRaw("</" + element + ">\n"); - } - - /** - * Write an element with a text node included, such as <el/>foo<el/>, - * on a standalone line. If the text is empty, an empty element will - * be output as <el/>. Takes an optional list of tuples with attribute - * names and values. - * - * @throws IOException Thrown if IO errors occurred. - */ - public void textElement(String element, String text) throws IOException { - textElement(element, text, null); - } - - public void textElement(String element, String text, String[][] attributes) throws IOException { - if (text == null || text.length() == 0) { - emptyElement(element, attributes); - } else { - startElement(element, attributes, ">"); - writeEscaped(text); - closeElement(false); - } - } - - void startElement(String element, String[][] attributes, String terminator) throws IOException { - writeRaw(indent(element)); - writeRaw('<'); - writeRaw(element); - if (attributes != null) { - for (int i = 0; i < attributes.length; i++) { - writeRaw(' '); - writeRaw(attributes[i][0]); - writeRaw("=\""); - writeEscaped(attributes[i][1]); - writeRaw('"'); - } - } - writeRaw(terminator); - } - - /** - * Send an encoded Unicode string to the output stream. - * - * @throws IOException Thrown if IO errors occurred. - */ - void writeRaw(String data) throws IOException { - writer.write(data); - } - - void writeRaw(char c) throws IOException { - writer.write(c); - } - - void writeEscaped(String data) throws IOException { - int end = data.length(); - for (int i = 0; i < end; i++) { - char c = data.charAt(i); - switch (c) { - case '&': - writer.write("&"); - break; - case '<': - writer.write("<"); - break; - case '>': - writer.write(">"); - break; - case '"': - writer.write("""); - break; - default: - writer.write(c); - } - } - } - - private String indent(String element) { - int level = stack.size(); - stack.add(element); - return spaces(level); - } - - private String[] deIndent() { - String element = stack.remove(stack.size() - 1); - String space = spaces(stack.size()); - return new String[]{element, space}; - } - - private String spaces(int level) { - StringBuilder buffer = new StringBuilder(); - buffer.append(" ".repeat(Math.max(0, level * 2))); - return buffer.toString(); - } +public class XmlWriter +{ + private final String encoding; + private final List<String> stack; + private final BufferedWriter writer; + + public XmlWriter(OutputStream stream) + { + encoding = "utf-8"; + stack = new ArrayList<>(); + writer = new BufferedWriter(new OutputStreamWriter(stream, StandardCharsets.UTF_8)); + } + + /** + * @throws IOException + * Thrown if IO errors occurred. + */ + public void close() throws IOException + { + writer.flush(); + writer.close(); + } + + /** + * Write the <?xml?> header. 
+ * + * @throws IOException + * Thrown if IO errors occurred. + */ + public void openXml() throws IOException + { + writeRaw("<?xml version=\"1.0\" encoding=\"" + encoding + "\" ?>\n"); + } + + /** + * In theory, we might close out open elements or such. + */ + public void closeXml() + { + } + + /** + * Write an empty element, such as <el/>, on a standalone line. Takes an optional + * dictionary of attributes. + * + * @throws IOException + * Thrown if IO errors occurred. + */ + public void emptyElement(String element) throws IOException + { + emptyElement(element, null); + } + + public void emptyElement(String element, String[][] attributes) throws IOException + { + startElement(element, attributes, "/>\n"); + deIndent(); + } + + /** + * Write an element open tag, such as <el/>, on a standalone line. Takes an optional + * dictionary of attributes. + * + * @throws IOException + * Thrown if IO errors occurred. + */ + public void openElement(String element) throws IOException + { + openElement(element, null); + } + + public void openElement(String element, String[][] attributes) throws IOException + { + startElement(element, attributes, ">\n"); + } + + /** + * Write an element close tag, such as <el/>, on a standalone line. If indent=False is + * passed, indentation will not be added. + * + * @throws IOException + * Thrown if IO errors occurred. + */ + public void closeElement() throws IOException + { + closeElement(true); + } + + public void closeElement(boolean indent) throws IOException + { + String[] bits = deIndent(); + String element = bits[0]; + String space = bits[1]; + if (indent) + writeRaw(space + "</" + element + ">\n"); + else + writeRaw("</" + element + ">\n"); + } + + /** + * Write an element with a text node included, such as <el/>foo<el/>, on a + * standalone line. If the text is empty, an empty element will be output as <el/>. Takes + * an optional list of tuples with attribute names and values. + * + * @throws IOException + * Thrown if IO errors occurred. + */ + public void textElement(String element, String text) throws IOException + { + textElement(element, text, null); + } + + public void textElement(String element, String text, String[][] attributes) throws IOException + { + if (text == null || text.length() == 0) { + emptyElement(element, attributes); + } + else { + startElement(element, attributes, ">"); + writeEscaped(text); + closeElement(false); + } + } + + void startElement(String element, String[][] attributes, String terminator) throws IOException + { + writeRaw(indent(element)); + writeRaw('<'); + writeRaw(element); + if (attributes != null) { + for (int i = 0; i < attributes.length; i++) { + writeRaw(' '); + writeRaw(attributes[i][0]); + writeRaw("=\""); + writeEscaped(attributes[i][1]); + writeRaw('"'); + } + } + writeRaw(terminator); + } + + /** + * Send an encoded Unicode string to the output stream. + * + * @throws IOException + * Thrown if IO errors occurred. 
+ */ + void writeRaw(String data) throws IOException + { + writer.write(data); + } + + void writeRaw(char c) throws IOException + { + writer.write(c); + } + + void writeEscaped(String data) throws IOException + { + int end = data.length(); + for (int i = 0; i < end; i++) { + char c = data.charAt(i); + switch (c) { + case '&': + writer.write("&"); + break; + case '<': + writer.write("<"); + break; + case '>': + writer.write(">"); + break; + case '"': + writer.write("""); + break; + default: + writer.write(c); + } + } + } + + private String indent(String element) + { + int level = stack.size(); + stack.add(element); + return spaces(level); + } + + private String[] deIndent() + { + String element = stack.remove(stack.size() - 1); + String space = spaces(stack.size()); + return new String[] { element, space }; + } + + private String spaces(int level) + { + StringBuilder buffer = new StringBuilder(); + buffer.append(" ".repeat(Math.max(0, level * 2))); + return buffer.toString(); + } } diff --git a/dkpro-jwpl-mwdumper/src/test/java/org/dkpro/jwpl/mwdumper/importer/TitleTest.java b/dkpro-jwpl-mwdumper/src/test/java/org/dkpro/jwpl/mwdumper/importer/TitleTest.java index b8bc6c4f..e5626f7a 100644 --- a/dkpro-jwpl-mwdumper/src/test/java/org/dkpro/jwpl/mwdumper/importer/TitleTest.java +++ b/dkpro-jwpl-mwdumper/src/test/java/org/dkpro/jwpl/mwdumper/importer/TitleTest.java @@ -24,7 +24,6 @@ * $Id: TitleTest.java 11268 2005-10-10 06:57:30Z vibber $ */ - import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotEquals; @@ -35,179 +34,193 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -public class TitleTest { - NamespaceSet namespaces; - - @BeforeEach - protected void setUp() { - namespaces = new NamespaceSet(); - namespaces.add(-2, "Media"); - namespaces.add(-1, "Special"); - namespaces.add(0, ""); - namespaces.add(1, "Talk"); - namespaces.add(2, "User"); - namespaces.add(3, "User talk"); - namespaces.add(4, "Project"); - namespaces.add(5, "Project talk"); - namespaces.add(6, "Image"); - namespaces.add(7, "Image talk"); - namespaces.add(8, "MediaWiki"); - namespaces.add(9, "MediaWiki talk"); - namespaces.add(10, "Template"); - namespaces.add(11, "Template talk"); - namespaces.add(12, "Help"); - namespaces.add(13, "Help talk"); - namespaces.add(14, "Category"); - namespaces.add(15, "Category talk"); - } - - @AfterEach - protected void tearDown() { - namespaces = null; - } - - private class TestItem { - public final int ns; - public final String text; - public final String prefixed; - TestItem(int ns, String text, String prefixed) { - this.ns = ns; - this.text = text; - this.prefixed = prefixed; - } - @Override - public String toString() { - return "(" + ns + ",\"" + text + "\") [[" + prefixed + "]]"; - } - } - - final TestItem[] tests = { - new TestItem(0, "Page title", "Page title"), - new TestItem(1, "Page title", "Talk:Page title"), - new TestItem(-1, "Recentchanges", "Special:Recentchanges"), - new TestItem(13, "Logging in", "Help talk:Logging in"), - new TestItem(0, "2001: A Space Odyssey", "2001: A Space Odyssey"), - new TestItem(0, "2:2", "2:2") - }; - - /* - * Test method for 'org.dkpro.jwpl.mwdumper.importer.Title.Title(int, String, NamespaceSet)' - */ - @Test - public void testTitleIntStringNamespaceSet() { - for (TestItem item : tests) { - Title title = new Title(item.ns, item.text, namespaces); - assertEquals(item.prefixed, title.toString(), 
item.toString()); - } - } - - /* - * Test method for 'org.dkpro.jwpl.mwdumper.importer.Title.Title(String, NamespaceSet)' - */ - @Test - public void testTitleStringNamespaceSet() { - for (TestItem item : tests) { - Title title = new Title(item.prefixed, namespaces); - assertEquals(item.ns, title.Namespace.intValue(), item.toString()); - assertEquals(item.text, title.Text, item.toString()); - } - } - - /* - * Test method for 'org.dkpro.jwpl.mwdumper.importer.Title.toString()' - */ - @Test - public void testToString() { - for (TestItem item : tests) { - Title title = new Title(item.prefixed, namespaces); - assertEquals(item.prefixed, title.toString(), item.toString()); - } - } - - /* - * Test method for 'org.dkpro.jwpl.mwdumper.importer.Title.isSpecial()' - */ - @Test - public void testIsSpecial() { - for (TestItem item : tests) { - Title title = new Title(item.prefixed, namespaces); - if (item.ns < 0) { - assertTrue(title.isSpecial(), item.toString()); - } - else { - assertFalse(title.isSpecial(), item.toString()); - } - } - } - - /* - * Test method for 'org.dkpro.jwpl.mwdumper.importer.Title.isTalk()' - */ - @Test - public void testIsTalk() { - for (TestItem item : tests) { - Title title = new Title(item.prefixed, namespaces); - if (title.isSpecial()) { - assertFalse(title.isTalk(), item.toString()); - } - else if (item.ns % 2 == 0) { - assertFalse(title.isTalk(), item.toString()); - } - else { - assertTrue(title.isTalk(), item.toString()); - } - } - } - - /* - * Test method for 'org.dkpro.jwpl.mwdumper.importer.Title.talkPage()' - */ - @Test - public void testTalkPage() { - for (TestItem item : tests) { - Title title = new Title(item.prefixed, namespaces); - if (title.isTalk()) { - assertEquals(title, title.talkPage(), item.toString()); - } - else if (title.isSpecial()) { - assertNull(title.talkPage(), item.toString()); - } - else { - assertNotEquals(title, title.talkPage(), item.toString()); - } - } - } - - /* - * Test method for 'org.dkpro.jwpl.mwdumper.importer.Title.subjectPage()' - */ - @Test - public void testSubjectPage() { - for (TestItem item : tests) { - Title title = new Title(item.prefixed, namespaces); - if (title.isTalk()) { - assertNotEquals(title, title.subjectPage(), item.toString()); - } - else { - assertEquals(title, title.subjectPage(), item.toString()); - } - } - } - - @Test - public void testTalkSubjectPage() { - for (TestItem item : tests) { - Title title = new Title(item.prefixed, namespaces); - if (title.isTalk()) { - assertEquals( title, title.subjectPage().talkPage(), item.toString()); - } - else if (title.isSpecial()) { - assertNull(title.subjectPage().talkPage(), item.toString()); - } - else { - assertEquals(title, title.talkPage().subjectPage(), item.toString()); - } - } - } +public class TitleTest +{ + NamespaceSet namespaces; + + @BeforeEach + protected void setUp() + { + namespaces = new NamespaceSet(); + namespaces.add(-2, "Media"); + namespaces.add(-1, "Special"); + namespaces.add(0, ""); + namespaces.add(1, "Talk"); + namespaces.add(2, "User"); + namespaces.add(3, "User talk"); + namespaces.add(4, "Project"); + namespaces.add(5, "Project talk"); + namespaces.add(6, "Image"); + namespaces.add(7, "Image talk"); + namespaces.add(8, "MediaWiki"); + namespaces.add(9, "MediaWiki talk"); + namespaces.add(10, "Template"); + namespaces.add(11, "Template talk"); + namespaces.add(12, "Help"); + namespaces.add(13, "Help talk"); + namespaces.add(14, "Category"); + namespaces.add(15, "Category talk"); + } + + @AfterEach + protected void tearDown() + { + namespaces = 
null; + } + + private class TestItem + { + public final int ns; + public final String text; + public final String prefixed; + + TestItem(int ns, String text, String prefixed) + { + this.ns = ns; + this.text = text; + this.prefixed = prefixed; + } + + @Override + public String toString() + { + return "(" + ns + ",\"" + text + "\") [[" + prefixed + "]]"; + } + } + + final TestItem[] tests = { new TestItem(0, "Page title", "Page title"), + new TestItem(1, "Page title", "Talk:Page title"), + new TestItem(-1, "Recentchanges", "Special:Recentchanges"), + new TestItem(13, "Logging in", "Help talk:Logging in"), + new TestItem(0, "2001: A Space Odyssey", "2001: A Space Odyssey"), + new TestItem(0, "2:2", "2:2") }; + + /* + * Test method for 'org.dkpro.jwpl.mwdumper.importer.Title.Title(int, String, NamespaceSet)' + */ + @Test + public void testTitleIntStringNamespaceSet() + { + for (TestItem item : tests) { + Title title = new Title(item.ns, item.text, namespaces); + assertEquals(item.prefixed, title.toString(), item.toString()); + } + } + + /* + * Test method for 'org.dkpro.jwpl.mwdumper.importer.Title.Title(String, NamespaceSet)' + */ + @Test + public void testTitleStringNamespaceSet() + { + for (TestItem item : tests) { + Title title = new Title(item.prefixed, namespaces); + assertEquals(item.ns, title.Namespace.intValue(), item.toString()); + assertEquals(item.text, title.Text, item.toString()); + } + } + + /* + * Test method for 'org.dkpro.jwpl.mwdumper.importer.Title.toString()' + */ + @Test + public void testToString() + { + for (TestItem item : tests) { + Title title = new Title(item.prefixed, namespaces); + assertEquals(item.prefixed, title.toString(), item.toString()); + } + } + + /* + * Test method for 'org.dkpro.jwpl.mwdumper.importer.Title.isSpecial()' + */ + @Test + public void testIsSpecial() + { + for (TestItem item : tests) { + Title title = new Title(item.prefixed, namespaces); + if (item.ns < 0) { + assertTrue(title.isSpecial(), item.toString()); + } + else { + assertFalse(title.isSpecial(), item.toString()); + } + } + } + + /* + * Test method for 'org.dkpro.jwpl.mwdumper.importer.Title.isTalk()' + */ + @Test + public void testIsTalk() + { + for (TestItem item : tests) { + Title title = new Title(item.prefixed, namespaces); + if (title.isSpecial()) { + assertFalse(title.isTalk(), item.toString()); + } + else if (item.ns % 2 == 0) { + assertFalse(title.isTalk(), item.toString()); + } + else { + assertTrue(title.isTalk(), item.toString()); + } + } + } + + /* + * Test method for 'org.dkpro.jwpl.mwdumper.importer.Title.talkPage()' + */ + @Test + public void testTalkPage() + { + for (TestItem item : tests) { + Title title = new Title(item.prefixed, namespaces); + if (title.isTalk()) { + assertEquals(title, title.talkPage(), item.toString()); + } + else if (title.isSpecial()) { + assertNull(title.talkPage(), item.toString()); + } + else { + assertNotEquals(title, title.talkPage(), item.toString()); + } + } + } + + /* + * Test method for 'org.dkpro.jwpl.mwdumper.importer.Title.subjectPage()' + */ + @Test + public void testSubjectPage() + { + for (TestItem item : tests) { + Title title = new Title(item.prefixed, namespaces); + if (title.isTalk()) { + assertNotEquals(title, title.subjectPage(), item.toString()); + } + else { + assertEquals(title, title.subjectPage(), item.toString()); + } + } + } + + @Test + public void testTalkSubjectPage() + { + for (TestItem item : tests) { + Title title = new Title(item.prefixed, namespaces); + if (title.isTalk()) { + assertEquals(title, 
title.subjectPage().talkPage(), item.toString()); + } + else if (title.isSpecial()) { + assertNull(title.subjectPage().talkPage(), item.toString()); + } + else { + assertEquals(title, title.talkPage().subjectPage(), item.toString()); + } + } + } } From 56470b55a534b0ca34a0a2d63c384c13f3f7d1b5 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho <richard.eckart@gmail.com> Date: Tue, 31 Oct 2023 14:26:38 +0100 Subject: [PATCH 08/14] #164 - Introduce checkstyle - Auto-format dkpro-jwpl-parser --- .../java/org/dkpro/jwpl/parser/Content.java | 166 +- .../dkpro/jwpl/parser/ContentContainer.java | 452 +-- .../org/dkpro/jwpl/parser/ContentElement.java | 619 ++-- .../org/dkpro/jwpl/parser/DefinitionList.java | 170 +- .../main/java/org/dkpro/jwpl/parser/Link.java | 247 +- .../jwpl/parser/LinkAnchorExtractor.java | 160 +- .../org/dkpro/jwpl/parser/NestedList.java | 10 +- .../jwpl/parser/NestedListContainer.java | 93 +- .../dkpro/jwpl/parser/NestedListElement.java | 12 +- .../java/org/dkpro/jwpl/parser/Paragraph.java | 68 +- .../org/dkpro/jwpl/parser/ParsedPage.java | 815 ++-- .../dkpro/jwpl/parser/ParsedPageObject.java | 30 +- .../java/org/dkpro/jwpl/parser/Section.java | 278 +- .../dkpro/jwpl/parser/SectionContainer.java | 369 +- .../org/dkpro/jwpl/parser/SectionContent.java | 317 +- .../main/java/org/dkpro/jwpl/parser/Span.java | 334 +- .../java/org/dkpro/jwpl/parser/SrcSpan.java | 85 +- .../java/org/dkpro/jwpl/parser/Table.java | 333 +- .../org/dkpro/jwpl/parser/TableElement.java | 219 +- .../java/org/dkpro/jwpl/parser/Template.java | 98 +- .../dkpro/jwpl/parser/html/HtmlWriter.java | 746 ++-- .../dkpro/jwpl/parser/html/ParsedPageCSS.java | 185 +- .../mediawiki/EmptyStructureRemover.java | 242 +- .../jwpl/parser/mediawiki/FlushTemplates.java | 26 +- .../mediawiki/GermanTemplateParser.java | 214 +- .../MediaWikiContentElementParser.java | 15 +- .../parser/mediawiki/MediaWikiParser.java | 32 +- .../mediawiki/MediaWikiParserFactory.java | 1220 +++--- .../mediawiki/MediaWikiTemplateParser.java | 30 +- .../jwpl/parser/mediawiki/ModularParser.java | 3280 +++++++++-------- .../parser/mediawiki/ParserConstants.java | 11 +- .../parser/mediawiki/ResolvedTemplate.java | 164 +- .../ShowTemplateNamesAndParameters.java | 52 +- .../jwpl/parser/mediawiki/SpanManager.java | 467 +-- .../parser/mediawiki/SrcPosRangeChecker.java | 210 +- .../parser/selectiveaccess/ConfigLoader.java | 184 +- .../SelectiveAccessHandler.java | 713 ++-- .../org/dkpro/jwpl/parser/BaseJWPLTest.java | 33 +- .../org/dkpro/jwpl/parser/ParsedPageTest.java | 37 +- 39 files changed, 6722 insertions(+), 6014 deletions(-) diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Content.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Content.java index 168db2c7..0a961d8c 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Content.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Content.java @@ -25,108 +25,110 @@ * Be aware, that all retured Spans refer to the String returned by getText()<br> * this is true for any implementing class!<br> */ -public interface Content { +public interface Content +{ + + enum FormatType + { + /** + * Bold Text + */ + BOLD, + /** + * Italic Text + */ + ITALIC, + /** + * The Content between Math Tags + */ + MATH, + /** + * The Content between NoWiki Tags + */ + NOWIKI, + /** + * The begin and end position of an unknown Tag defined by < and > + */ + TAG, + } - enum FormatType { /** - * Bold Text + * Returns the Text of the Element */ - BOLD, + String getText(); + 
/** - * Italic Text + * Content.getText().length() == Content.length() */ - ITALIC, + int length(); + /** - * The Content between Math Tags + * Returns true, if there is no content in the element. */ - MATH, + boolean empty(); + /** - * The Content between NoWiki Tags + * returns the Format Spans of the Specified Type. */ - NOWIKI, + List<Span> getFormatSpans(FormatType t); + /** - * The begin and end position of an unknown Tag defined by < and > + * returns the Format Spans of the Specified Type, in the Range from start to end. */ - TAG, - } - - /** - * Returns the Text of the Element - */ - String getText(); - - /** - * Content.getText().length() == Content.length() - */ - int length(); - - /** - * Returns true, if there is no content in the element. - */ - boolean empty(); - - /** - * returns the Format Spans of the Specified Type. - */ - List<Span> getFormatSpans(FormatType t); + List<Span> getFormatSpans(FormatType t, int start, int end); - /** - * returns the Format Spans of the Specified Type, in the Range from start to end. - */ - List<Span> getFormatSpans(FormatType t, int start, int end); - - /** - * returns the Format Spans of the Specified Type, in the Range of s. - */ - List<Span> getFormatSpans(FormatType t, Span s); + /** + * returns the Format Spans of the Specified Type, in the Range of s. + */ + List<Span> getFormatSpans(FormatType t, Span s); - /** - * returns the Formats uses in this element. - */ - List<FormatType> getFormats(); + /** + * returns the Formats uses in this element. + */ + List<FormatType> getFormats(); - /** - * returns the Formats uses in this element, in the Range from start to end. - */ - List<FormatType> getFormats(int start, int end); + /** + * returns the Formats uses in this element, in the Range from start to end. + */ + List<FormatType> getFormats(int start, int end); - /** - * returns the Formats uses in this element, in the Range of s. - */ - List<FormatType> getFormats(Span s); + /** + * returns the Formats uses in this element, in the Range of s. + */ + List<FormatType> getFormats(Span s); - /** - * returns all Links of this element. - */ - List<Link> getLinks(); + /** + * returns all Links of this element. + */ + List<Link> getLinks(); - /** - * returns all Links of this element of the specified type. - */ - List<Link> getLinks(Link.type t); + /** + * returns all Links of this element of the specified type. + */ + List<Link> getLinks(Link.type t); - /** - * returns all Links of this element of the specified type, in the Range from start to end. - */ - List<Link> getLinks(Link.type t, int start, int end); + /** + * returns all Links of this element of the specified type, in the Range from start to end. + */ + List<Link> getLinks(Link.type t, int start, int end); - /** - * returns all Links of this element of the specified type, in the Range of s - */ - List<Link> getLinks(Link.type t, Span s); + /** + * returns all Links of this element of the specified type, in the Range of s + */ + List<Link> getLinks(Link.type t, Span s); - /** - * returns all Templates. - */ - List<Template> getTemplates(); + /** + * returns all Templates. + */ + List<Template> getTemplates(); - /** - * returns all Templates, in the Range from start to end. - */ - List<Template> getTemplates(int start, int end); + /** + * returns all Templates, in the Range from start to end. + */ + List<Template> getTemplates(int start, int end); - /** - * returns all Templates, in the Range of s. 
- */ - List<Template> getTemplates(Span s); + /** + * returns all Templates, in the Range of s. + */ + List<Template> getTemplates(Span s); } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentContainer.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentContainer.java index 6ecd3877..9a39ee85 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentContainer.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentContainer.java @@ -23,239 +23,277 @@ /** * A ContentContainer is used to combine more than one Content element (not only * ContentElement.class!) in a new Content element.<br> - * For a description of the Functions of the Content Interface, take a look at - * the Content.class documentation.<br> + * For a description of the Functions of the Content Interface, take a look at the Content.class + * documentation.<br> */ -public abstract class ContentContainer extends ParsedPageObject implements Content { +public abstract class ContentContainer + extends ParsedPageObject + implements Content +{ - protected List<Content> ccl; + protected List<Content> ccl; - public boolean empty() { - return ccl.size() == 0; - } + public boolean empty() + { + return ccl.size() == 0; + } + + public String getText() + { - public String getText() { + StringBuilder result = new StringBuilder(); + for (Content cc : ccl) { + if (cc != null) + result.append(cc.getText() + " "); + } - StringBuilder result = new StringBuilder(); - for (Content cc : ccl) { - if (cc != null) result.append(cc.getText() + " "); + final int temp = result.length() - 1; + if (temp >= 0) + result.deleteCharAt(temp); + + return result.toString(); } - final int temp = result.length() - 1; - if (temp >= 0) result.deleteCharAt(temp); - - return result.toString(); - } - - /** - * Returns the Text in the Span List in a String...<br> - * all Spans must refer to the text returned by getText(). - */ - public String getText(List<Span> sl) { - final String temp = getText(); - StringBuilder result = new StringBuilder(); - for (Span s : sl) - result.append(s.getText(temp) + ' '); - result.deleteCharAt(result.length() - 1); - return result.toString(); - } - - public int length() { - int length = 0; - - for (Content cc : ccl) - if (cc != null) length += cc.length() + 1; - - if (length > 0) length--; - - return length; - } - - /** - * Retruns the Number of Content elements in this ContentContainer. - */ - public int size() { - return ccl.size(); - } - - public List<Span> getFormatSpans(FormatType t) { - List<Span> result = new ArrayList<>(); - int offset = 0; - for (Content c : ccl) { - for (Span b : c.getFormatSpans(t)) - result.add(b.clone().adjust(offset)); - - offset += 1 + c.length(); + /** + * Returns the Text in the Span List in a String...<br> + * all Spans must refer to the text returned by getText(). 
+ */ + public String getText(List<Span> sl) + { + final String temp = getText(); + StringBuilder result = new StringBuilder(); + for (Span s : sl) + result.append(s.getText(temp) + ' '); + result.deleteCharAt(result.length() - 1); + return result.toString(); } - return result; - } - public List<Span> getFormatSpans(FormatType t, int start, int end) { - return getFormatSpans(t, new Span(start, end)); - } + public int length() + { + int length = 0; - public List<Span> getFormatSpans(FormatType t, Span s) { - List<Span> result = new ArrayList<>(); + for (Content cc : ccl) + if (cc != null) + length += cc.length() + 1; - Span a = new Span(-1, -1); + if (length > 0) + length--; - for (Content c : ccl) { - int offset = a.getEnd() + 1; - a = new Span(offset, offset + c.length()); + return length; + } - if (a.hits(s)) { - for (Span b : c.getFormatSpans(t, s.clone().adjust(-offset))) - result.add(b.clone().adjust(offset)); - } + /** + * Retruns the Number of Content elements in this ContentContainer. + */ + public int size() + { + return ccl.size(); } - return result; - } - - public List<FormatType> getFormats() { - - boolean bold = false; - boolean italic = false; - boolean tag = false; - boolean math = false; - boolean nowiki = false; - - for (Content c : ccl) { - - for (FormatType t : c.getFormats()) - switch (t) { - case BOLD: - bold = true; - break; - case ITALIC: - italic = true; - break; - case TAG: - tag = true; - break; - case MATH: - math = true; - break; - case NOWIKI: - nowiki = true; - break; + + public List<Span> getFormatSpans(FormatType t) + { + List<Span> result = new ArrayList<>(); + int offset = 0; + for (Content c : ccl) { + for (Span b : c.getFormatSpans(t)) + result.add(b.clone().adjust(offset)); + + offset += 1 + c.length(); } + return result; + } - if (bold && italic && tag && math && nowiki) break; + public List<Span> getFormatSpans(FormatType t, int start, int end) + { + return getFormatSpans(t, new Span(start, end)); } - List<FormatType> result = new ArrayList<>(); - if (bold) result.add(FormatType.BOLD); - if (italic) result.add(FormatType.ITALIC); - if (tag) result.add(FormatType.TAG); - if (math) result.add(FormatType.MATH); - if (nowiki) result.add(FormatType.NOWIKI); - return result; - } - - public List<FormatType> getFormats(int start, int end) { - return getFormats(new Span(start, end)); - } - - public List<FormatType> getFormats(Span s) { - boolean bold = false; - boolean italic = false; - boolean tag = false; - boolean math = false; - boolean nowiki = false; - - Span a = new Span(-1, -1); - - for (Content c : ccl) { - int offset = a.getEnd() + 1; - a = new Span(offset, offset + c.length()); - - if (a.hits(s)) - for (FormatType t : c.getFormats(s.clone().adjust(-offset))) - switch (t) { - case BOLD: - bold = true; - break; - case ITALIC: - italic = true; - break; - case TAG: - tag = true; - break; - case MATH: - math = true; - break; - case NOWIKI: - nowiki = true; - break; - } - - if (bold && italic) break; + public List<Span> getFormatSpans(FormatType t, Span s) + { + List<Span> result = new ArrayList<>(); + + Span a = new Span(-1, -1); + + for (Content c : ccl) { + int offset = a.getEnd() + 1; + a = new Span(offset, offset + c.length()); + + if (a.hits(s)) { + for (Span b : c.getFormatSpans(t, s.clone().adjust(-offset))) + result.add(b.clone().adjust(offset)); + } + } + return result; } - List<FormatType> result = new ArrayList<>(); - if (bold) result.add(FormatType.BOLD); - if (italic) result.add(FormatType.ITALIC); - if (tag) result.add(FormatType.TAG); - if 
(math) result.add(FormatType.MATH); - if (nowiki) result.add(FormatType.NOWIKI); - return result; - } + public List<FormatType> getFormats() + { + + boolean bold = false; + boolean italic = false; + boolean tag = false; + boolean math = false; + boolean nowiki = false; + + for (Content c : ccl) { + + for (FormatType t : c.getFormats()) + switch (t) { + case BOLD: + bold = true; + break; + case ITALIC: + italic = true; + break; + case TAG: + tag = true; + break; + case MATH: + math = true; + break; + case NOWIKI: + nowiki = true; + break; + } + + if (bold && italic && tag && math && nowiki) + break; + } - public List<Link> getLinks(Link.type linkType) { - List<Link> result = new ArrayList<>(); - for (Content c : ccl) result.addAll(c.getLinks(linkType)); - return result; - } + List<FormatType> result = new ArrayList<>(); + if (bold) + result.add(FormatType.BOLD); + if (italic) + result.add(FormatType.ITALIC); + if (tag) + result.add(FormatType.TAG); + if (math) + result.add(FormatType.MATH); + if (nowiki) + result.add(FormatType.NOWIKI); + return result; + } - public List<Link> getLinks(Link.type linkType, int start, int end) { - return getLinks(linkType, new Span(start, end)); - } + public List<FormatType> getFormats(int start, int end) + { + return getFormats(new Span(start, end)); + } - public List<Link> getLinks(Link.type linkType, Span s) { - List<Link> result = new ArrayList<>(); + public List<FormatType> getFormats(Span s) + { + boolean bold = false; + boolean italic = false; + boolean tag = false; + boolean math = false; + boolean nowiki = false; + + Span a = new Span(-1, -1); + + for (Content c : ccl) { + int offset = a.getEnd() + 1; + a = new Span(offset, offset + c.length()); + + if (a.hits(s)) + for (FormatType t : c.getFormats(s.clone().adjust(-offset))) + switch (t) { + case BOLD: + bold = true; + break; + case ITALIC: + italic = true; + break; + case TAG: + tag = true; + break; + case MATH: + math = true; + break; + case NOWIKI: + nowiki = true; + break; + } + + if (bold && italic) + break; + } - Span a = new Span(-1, -1); + List<FormatType> result = new ArrayList<>(); + if (bold) + result.add(FormatType.BOLD); + if (italic) + result.add(FormatType.ITALIC); + if (tag) + result.add(FormatType.TAG); + if (math) + result.add(FormatType.MATH); + if (nowiki) + result.add(FormatType.NOWIKI); + return result; + } - for (Content c : ccl) { - int offset = a.getEnd() + 1; - a = new Span(offset, offset + c.length()); + public List<Link> getLinks(Link.type linkType) + { + List<Link> result = new ArrayList<>(); + for (Content c : ccl) + result.addAll(c.getLinks(linkType)); + return result; + } - if (a.hits(s)) - result.addAll(c.getLinks(linkType, s.clone().adjust(-offset))); + public List<Link> getLinks(Link.type linkType, int start, int end) + { + return getLinks(linkType, new Span(start, end)); } - return result; - } - - public List<Link> getLinks() { - List<Link> result = new ArrayList<>(); - for (Content c : ccl) - result.addAll(c.getLinks()); - return result; - } - - public List<Template> getTemplates() { - List<Template> result = new ArrayList<>(); - for (Content cc : ccl) - result.addAll(cc.getTemplates()); - return result; - } - - public List<Template> getTemplates(int start, int end) { - return getTemplates(new Span(start, end)); - } - - public List<Template> getTemplates(Span s) { - List<Template> result = new ArrayList<>(); - - Span a = new Span(-1, -1); - - for (Content c : ccl) { - int offset = a.getEnd() + 1; - a = new Span(offset, offset + c.length()); - - if (a.hits(s)) - 
result.addAll(c.getTemplates(s.clone().adjust(-offset))); + + public List<Link> getLinks(Link.type linkType, Span s) + { + List<Link> result = new ArrayList<>(); + + Span a = new Span(-1, -1); + + for (Content c : ccl) { + int offset = a.getEnd() + 1; + a = new Span(offset, offset + c.length()); + + if (a.hits(s)) + result.addAll(c.getLinks(linkType, s.clone().adjust(-offset))); + } + return result; + } + + public List<Link> getLinks() + { + List<Link> result = new ArrayList<>(); + for (Content c : ccl) + result.addAll(c.getLinks()); + return result; + } + + public List<Template> getTemplates() + { + List<Template> result = new ArrayList<>(); + for (Content cc : ccl) + result.addAll(cc.getTemplates()); + return result; + } + + public List<Template> getTemplates(int start, int end) + { + return getTemplates(new Span(start, end)); + } + + public List<Template> getTemplates(Span s) + { + List<Template> result = new ArrayList<>(); + + Span a = new Span(-1, -1); + + for (Content c : ccl) { + int offset = a.getEnd() + 1; + a = new Span(offset, offset + c.length()); + + if (a.hits(s)) + result.addAll(c.getTemplates(s.clone().adjust(-offset))); + } + return result; } - return result; - } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentElement.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentElement.java index 805378b7..d5e826fc 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentElement.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ContentElement.java @@ -21,294 +21,345 @@ import java.util.List; /** - * This is the Simple implementation of the Content Inteface, and is used - * for nearly all content containing classes... + * This is the Simple implementation of the Content Inteface, and is used for nearly all content + * containing classes... * <p> * Be aware, that all retured Spans refer to the String returned by getText()<br> */ -public class ContentElement extends ParsedPageObject implements Content { - - private String text; - private List<Span> boldSpans; - private List<Span> italicSpans; - private List<Link> links; - private List<Template> templates; - private List<Span> tags; - private List<Span> mathSpans; - private List<Span> noWikiSpans; - - public ContentElement() { - text = ""; - links = new ArrayList<>(); - templates = new ArrayList<>(); - boldSpans = new ArrayList<>(); - italicSpans = new ArrayList<>(); - tags = new ArrayList<>(); - mathSpans = new ArrayList<>(); - noWikiSpans = new ArrayList<>(); - } - - /** - * Look at getText() for Details... - */ - public void setText(String text) { - this.text = text; - } - - /** - * Returns the Text on wich all elements of this ContentElement are bases on. 
- */ - public String getText() { - return text; - } - - /** - * Returns the Text defined with the Spans in the List divided by a WS - */ - public String getText(List<Span> sl) { - StringBuilder result = new StringBuilder(); - - for (Span s : sl) { - result.append(s.getText(text) + ' '); +public class ContentElement + extends ParsedPageObject + implements Content +{ + + private String text; + private List<Span> boldSpans; + private List<Span> italicSpans; + private List<Link> links; + private List<Template> templates; + private List<Span> tags; + private List<Span> mathSpans; + private List<Span> noWikiSpans; + + public ContentElement() + { + text = ""; + links = new ArrayList<>(); + templates = new ArrayList<>(); + boldSpans = new ArrayList<>(); + italicSpans = new ArrayList<>(); + tags = new ArrayList<>(); + mathSpans = new ArrayList<>(); + noWikiSpans = new ArrayList<>(); } - int delChar = result.length() - 1; - if (delChar > 0) result.deleteCharAt(delChar); - - return result.toString(); - } - - /** - * Retruns the length of the Text. Alternativ you can use getText().length() - */ - public int length() { - return text.length(); - } - - /** - * Returns true if there is no Content in this ContentElement. - */ - public boolean empty() { - return - text.length() == 0 && - links.size() == 0 && - templates.size() == 0 && - tags.size() == 0 && - mathSpans.size() == 0; - } - - /** - * Look at getFormatSpans for Details... - */ - public void setFormatSpans(FormatType t, List<Span> spans) { - switch (t) { - case BOLD: - boldSpans = spans; - break; - - case ITALIC: - italicSpans = spans; - break; - - case TAG: - tags = spans; - break; - - case MATH: - mathSpans = spans; - break; - - case NOWIKI: - noWikiSpans = spans; - break; + + /** + * Look at getText() for Details... + */ + public void setText(String text) + { + this.text = text; + } + + /** + * Returns the Text on wich all elements of this ContentElement are bases on. + */ + public String getText() + { + return text; + } + + /** + * Returns the Text defined with the Spans in the List divided by a WS + */ + public String getText(List<Span> sl) + { + StringBuilder result = new StringBuilder(); + + for (Span s : sl) { + result.append(s.getText(text) + ' '); + } + int delChar = result.length() - 1; + if (delChar > 0) + result.deleteCharAt(delChar); + + return result.toString(); + } + + /** + * Retruns the length of the Text. Alternativ you can use getText().length() + */ + public int length() + { + return text.length(); + } + + /** + * Returns true if there is no Content in this ContentElement. + */ + public boolean empty() + { + return text.length() == 0 && links.size() == 0 && templates.size() == 0 && tags.size() == 0 + && mathSpans.size() == 0; + } + + /** + * Look at getFormatSpans for Details... + */ + public void setFormatSpans(FormatType t, List<Span> spans) + { + switch (t) { + case BOLD: + boldSpans = spans; + break; + + case ITALIC: + italicSpans = spans; + break; + + case TAG: + tags = spans; + break; + + case MATH: + mathSpans = spans; + break; + + case NOWIKI: + noWikiSpans = spans; + break; + } + } + + /** + * Returns all the Spans of the Format type t. 
+ */ + public List<Span> getFormatSpans(FormatType t) + { + switch (t) { + case BOLD: + return boldSpans; + case ITALIC: + return italicSpans; + case TAG: + return tags; + case MATH: + return mathSpans; + case NOWIKI: + return noWikiSpans; + default: + return null; + } + } + + /** + * Returns all the Spans of the Format type t in the Range of start to end + */ + public List<Span> getFormatSpans(FormatType t, int start, int end) + { + return getFormatSpans(t, new Span(start, end)); + } + + /** + * Returns all the Spans of the Format type t in the Range of the Span s + */ + public List<Span> getFormatSpans(FormatType t, Span s) + { + List<Span> result = new ArrayList<>(); + for (Span s2 : getFormatSpans(t)) + if (s2.hits(s)) + result.add(s2); + return result; + } + + /** + * Returns the Formats wich are used in this ContentElement in a List. + */ + public List<FormatType> getFormats() + { + List<FormatType> ftl = new ArrayList<>(); + if (boldSpans.size() != 0) + ftl.add(FormatType.BOLD); + if (italicSpans.size() != 0) + ftl.add(FormatType.ITALIC); + if (tags.size() != 0) + ftl.add(FormatType.TAG); + if (mathSpans.size() != 0) + ftl.add(FormatType.MATH); + if (noWikiSpans.size() != 0) + ftl.add(FormatType.NOWIKI); + return ftl; + } + + /** + * Returns the Formats wich are used in this ContentElement, in the Range from start to end, in + * a List. + */ + public List<FormatType> getFormats(int start, int end) + { + return getFormats(new Span(start, end)); } - } - - /** - * Returns all the Spans of the Format type t. - */ - public List<Span> getFormatSpans(FormatType t) { - switch (t) { - case BOLD: - return boldSpans; - case ITALIC: - return italicSpans; - case TAG: - return tags; - case MATH: - return mathSpans; - case NOWIKI: - return noWikiSpans; - default: - return null; + + /** + * Returns the Formats wich are used in this ContentElement, in the Range of the Span s, in a + * List. + */ + public List<FormatType> getFormats(Span s) + { + List<FormatType> result = new ArrayList<>(); + for (Span s2 : boldSpans) + if (s.hits(s2)) { + result.add(FormatType.BOLD); + break; + } + + for (Span s2 : italicSpans) + if (s.hits(s2)) { + result.add(FormatType.ITALIC); + break; + } + + return result; + } + + /** + * Look at getLinks() for Details... + */ + public void setLinks(List<Link> links) + { + this.links = links; + } + + /** + * Retruns a List of the Links of this ContentElement + */ + public List<Link> getLinks() + { + return links; + } + + /** + * Returns a List of the Links of this ContentElement of the Specified Link.type t + */ + public List<Link> getLinks(Link.type t) + { + List<Link> result = new ArrayList<>(); + for (Link l : links) + if (l.getType() == t) + result.add(l); + return result; + } + + /** + * Returns a List of the Links of this ContentElement of the Specified Link.type t in the Range + * of s + */ + public List<Link> getLinks(Link.type t, Span s) + { + List<Link> result = new ArrayList<>(); + for (Link l : links) + if (l.getType() == t && l.getPos().hits(s)) + result.add(l); + return result; + } + + /** + * Returns a List of the Links of this ContentElement of the Specified Link.type t in the Range + * of start to end + */ + public List<Link> getLinks(Link.type t, int begin, int end) + { + return getLinks(t, new Span(begin, end)); + } + + /** + * Look at getTemplates for Details... + */ + public void setTemplates(List<Template> templates) + { + this.templates = templates; + } + + /** + * Returns a List of the Templates of this ContentElement. 
+ */ + public List<Template> getTemplates() + { + return templates; + } + + /** + * Returns a List of the Templates of this ContentElement in the Range from start to end + */ + public List<Template> getTemplates(int start, int end) + { + return getTemplates(new Span(start, end)); + } + + /** + * Returns a List of the Templates of this ContentElement in the Range of s + */ + public List<Template> getTemplates(Span s) + { + List<Template> result = new ArrayList<>(); + for (Template t : templates) + if (t.getPos().hits(s)) + result.add(t); + return result; + } + + /** + * Try and find out ;-) + */ + public String toString() + { + StringBuilder result = new StringBuilder(); + result.append("CE_TEXT: \"" + text + "\""); + + result.append("\nCE_BOLD_SPANS: "); + if (boldSpans != null) { + result.append(boldSpans.size()); + for (Span s : boldSpans) + result.append("\n\t" + s + " : \"" + s.getText(text) + "\""); + } + else + result.append("ERROR: boldSpans == null"); + + result.append("\nCE_ITALIC_SPANS: "); + if (italicSpans != null) { + result.append(italicSpans.size()); + for (Span s : italicSpans) + result.append("\n\t" + s + " : \"" + s.getText(text) + "\""); + } + else + result.append("ERROR: italicSpans == null"); + + result.append("\nCE_LINKS: "); + if (links != null) { + result.append(links.size()); + for (Link l : links) + result.append("\n" + l); + } + else + result.append("ERROR: links == null"); + + result.append("\nCE_TEMPLATES: "); + if (templates != null) { + result.append(templates.size()); + for (Template t : templates) + result.append("\n" + t); + } + else + result.append("ERROR: templates == null"); + + result.append("\nCE_TAGS: "); + if (templates != null) { + result.append(tags.size()); + for (Span s : tags) + result.append("\n" + s); + } + else + result.append("ERROR: templates == null"); + + return result.toString(); } - } - - /** - * Returns all the Spans of the Format type t in the Range of start to end - */ - public List<Span> getFormatSpans(FormatType t, int start, int end) { - return getFormatSpans(t, new Span(start, end)); - } - - /** - * Returns all the Spans of the Format type t in the Range of the Span s - */ - public List<Span> getFormatSpans(FormatType t, Span s) { - List<Span> result = new ArrayList<>(); - for (Span s2 : getFormatSpans(t)) - if (s2.hits(s)) result.add(s2); - return result; - } - - /** - * Returns the Formats wich are used in this ContentElement in a List. - */ - public List<FormatType> getFormats() { - List<FormatType> ftl = new ArrayList<>(); - if (boldSpans.size() != 0) ftl.add(FormatType.BOLD); - if (italicSpans.size() != 0) ftl.add(FormatType.ITALIC); - if (tags.size() != 0) ftl.add(FormatType.TAG); - if (mathSpans.size() != 0) ftl.add(FormatType.MATH); - if (noWikiSpans.size() != 0) ftl.add(FormatType.NOWIKI); - return ftl; - } - - /** - * Returns the Formats wich are used in this ContentElement, in the Range from start to end, in a List. - */ - public List<FormatType> getFormats(int start, int end) { - return getFormats(new Span(start, end)); - } - - /** - * Returns the Formats wich are used in this ContentElement, in the Range of the Span s, in a List. - */ - public List<FormatType> getFormats(Span s) { - List<FormatType> result = new ArrayList<>(); - for (Span s2 : boldSpans) - if (s.hits(s2)) { - result.add(FormatType.BOLD); - break; - } - - for (Span s2 : italicSpans) - if (s.hits(s2)) { - result.add(FormatType.ITALIC); - break; - } - - return result; - } - - /** - * Look at getLinks() for Details... 
- */ - public void setLinks(List<Link> links) { - this.links = links; - } - - /** - * Retruns a List of the Links of this ContentElement - */ - public List<Link> getLinks() { - return links; - } - - /** - * Returns a List of the Links of this ContentElement of the Specified Link.type t - */ - public List<Link> getLinks(Link.type t) { - List<Link> result = new ArrayList<>(); - for (Link l : links) - if (l.getType() == t) result.add(l); - return result; - } - - /** - * Returns a List of the Links of this ContentElement of the Specified Link.type t in the Range of s - */ - public List<Link> getLinks(Link.type t, Span s) { - List<Link> result = new ArrayList<>(); - for (Link l : links) - if (l.getType() == t && l.getPos().hits(s)) result.add(l); - return result; - } - - /** - * Returns a List of the Links of this ContentElement of the Specified Link.type t in the Range of start to end - */ - public List<Link> getLinks(Link.type t, int begin, int end) { - return getLinks(t, new Span(begin, end)); - } - - /** - * Look at getTemplates for Details... - */ - public void setTemplates(List<Template> templates) { - this.templates = templates; - } - - /** - * Returns a List of the Templates of this ContentElement. - */ - public List<Template> getTemplates() { - return templates; - } - - /** - * Returns a List of the Templates of this ContentElement in the Range from start to end - */ - public List<Template> getTemplates(int start, int end) { - return getTemplates(new Span(start, end)); - } - - /** - * Returns a List of the Templates of this ContentElement in the Range of s - */ - public List<Template> getTemplates(Span s) { - List<Template> result = new ArrayList<>(); - for (Template t : templates) - if (t.getPos().hits(s)) result.add(t); - return result; - } - - /** - * Try and find out ;-) - */ - public String toString() { - StringBuilder result = new StringBuilder(); - result.append("CE_TEXT: \"" + text + "\""); - - result.append("\nCE_BOLD_SPANS: "); - if (boldSpans != null) { - result.append(boldSpans.size()); - for (Span s : boldSpans) result.append("\n\t" + s + " : \"" + s.getText(text) + "\""); - } else result.append("ERROR: boldSpans == null"); - - result.append("\nCE_ITALIC_SPANS: "); - if (italicSpans != null) { - result.append(italicSpans.size()); - for (Span s : italicSpans) result.append("\n\t" + s + " : \"" + s.getText(text) + "\""); - } else result.append("ERROR: italicSpans == null"); - - result.append("\nCE_LINKS: "); - if (links != null) { - result.append(links.size()); - for (Link l : links) result.append("\n" + l); - } else result.append("ERROR: links == null"); - - result.append("\nCE_TEMPLATES: "); - if (templates != null) { - result.append(templates.size()); - for (Template t : templates) result.append("\n" + t); - } else result.append("ERROR: templates == null"); - - result.append("\nCE_TAGS: "); - if (templates != null) { - result.append(tags.size()); - for (Span s : tags) result.append("\n" + s); - } else result.append("ERROR: templates == null"); - - return result.toString(); - } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/DefinitionList.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/DefinitionList.java index 8674126c..51c97f9e 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/DefinitionList.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/DefinitionList.java @@ -23,88 +23,110 @@ /** * In a definition List exist a Defined Term with Zero or more Definitions. 
*/ -public class DefinitionList extends ContentContainer { - - private ContentElement definedTerm; - private final List<ContentElement> definitions; - - public DefinitionList() { - this.ccl = new ArrayList<>(); - this.definedTerm = null; - this.definitions = new ArrayList<>(); - } - - public DefinitionList(ContentElement definedTerm, List<ContentElement> definitions) { - this.ccl = new ArrayList<>(); - this.definedTerm = definedTerm; - this.definitions = definitions; - ccl.add(definedTerm); - ccl.addAll(definitions); - } - - /** - * content = definedTerm[+definition]* - */ - public DefinitionList(List<ContentElement> content) { - this.ccl = new ArrayList<>(content); - this.definitions = new ArrayList<>(); - - if (content.size() > 0) { - this.definedTerm = content.get(0); - if (content.size() > 1) { - this.definitions.addAll(content); - this.definitions.remove(0); - } - } else this.definedTerm = null; - } - - public String toString() { - StringBuilder result = new StringBuilder(); - - result.append("DL_DEFINEDTERM:\n"); - result.append(definedTerm); - - if (definitions.size() != 0) { - result.append("\nDL_DEFINITIONS:"); - for (ContentElement ce : definitions) result.append("\n" + ce); +public class DefinitionList + extends ContentContainer +{ + + private ContentElement definedTerm; + private final List<ContentElement> definitions; + + public DefinitionList() + { + this.ccl = new ArrayList<>(); + this.definedTerm = null; + this.definitions = new ArrayList<>(); } - return result.toString(); - } + public DefinitionList(ContentElement definedTerm, List<ContentElement> definitions) + { + this.ccl = new ArrayList<>(); + this.definedTerm = definedTerm; + this.definitions = definitions; + ccl.add(definedTerm); + ccl.addAll(definitions); + } + + /** + * content = definedTerm[+definition]* + */ + public DefinitionList(List<ContentElement> content) + { + this.ccl = new ArrayList<>(content); + this.definitions = new ArrayList<>(); + + if (content.size() > 0) { + this.definedTerm = content.get(0); + if (content.size() > 1) { + this.definitions.addAll(content); + this.definitions.remove(0); + } + } + else + this.definedTerm = null; + } + + public String toString() + { + StringBuilder result = new StringBuilder(); - public ContentElement getDefinedTerm() { - return definedTerm; - } + result.append("DL_DEFINEDTERM:\n"); + result.append(definedTerm); + + if (definitions.size() != 0) { + result.append("\nDL_DEFINITIONS:"); + for (ContentElement ce : definitions) + result.append("\n" + ce); + } + + return result.toString(); + } - public void setDefinedTerm(ContentElement definedTerm) { - if (definedTerm != null) { - if (this.definedTerm == null) ccl.add(0, definedTerm); - else ccl.set(0, definedTerm); - } else if (this.definedTerm != null) ccl.remove(this.definedTerm); + public ContentElement getDefinedTerm() + { + return definedTerm; + } - this.definedTerm = definedTerm; - } + public void setDefinedTerm(ContentElement definedTerm) + { + if (definedTerm != null) { + if (this.definedTerm == null) + ccl.add(0, definedTerm); + else + ccl.set(0, definedTerm); + } + else if (this.definedTerm != null) + ccl.remove(this.definedTerm); + + this.definedTerm = definedTerm; + } - public int nrOfDefinitions() { - return definitions.size(); - } + public int nrOfDefinitions() + { + return definitions.size(); + } - public void removeDefinition(ContentElement ce) { - definitions.remove(ce); - ccl.remove(ce); - } + public void removeDefinition(ContentElement ce) + { + definitions.remove(ce); + ccl.remove(ce); + } - public 
void addDefiniton(ContentElement ce) { - definitions.add(ce); - ccl.add(ce); - } + public void addDefiniton(ContentElement ce) + { + definitions.add(ce); + ccl.add(ce); + } - public ContentElement getDefinition(int i) { - if (definitions.size() > i) return definitions.get(i); - else return null; - } + public ContentElement getDefinition(int i) + { + if (definitions.size() > i) + return definitions.get(i); + else + return null; + } - public List<ContentElement> getDefinitions() { - return new ArrayList<>(definitions); - } + public List<ContentElement> getDefinitions() + { + return new ArrayList<>(definitions); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Link.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Link.java index bd5a392a..4173f816 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Link.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Link.java @@ -20,127 +20,140 @@ import java.util.ArrayList; import java.util.List; -public class Link extends ParsedPageObject { - - private Content home_cc; - private final type t; - private final Span pos; - private final String target; - private final List<String> parameters; - - public enum type {EXTERNAL, INTERNAL, AUDIO, VIDEO, IMAGE, UNKNOWN} - - public Link(Content home_cc, Span linkPos, String target, type t, List<String> parameters) { - this.home_cc = home_cc; - this.pos = linkPos; - this.target = target; - this.t = t; - this.parameters = (parameters == null ? new ArrayList<>() : parameters); - } - - /** - * Returns the Content Element in wich the Link occures. - */ - public Content getHomeElement() { - return home_cc; - } - - public Link setHomeElement(Content home_cc) { - this.home_cc = home_cc; - return this; - } - - /** - * Returns the Type of the Link. - */ - public type getType() { - return t; - } - - /** - * Retruns the Position Span of the Link, wich refers to getHomeElement().getText(). - */ - public Span getPos() { - return pos; - } - - /** - * Retruns the Target of the Link. - */ - public String getTarget() { - return target; - } - - /** - * Returns a List of Parameters for this Link, in most cases the size of the list will be 0. - */ - public List<String> getParameters() { - return parameters; - } - - /** - * Retruns the Link text or link caption. - */ - public String getText() { - if (home_cc == null) { - return null; +public class Link + extends ParsedPageObject +{ + + private Content home_cc; + private final type t; + private final Span pos; + private final String target; + private final List<String> parameters; + + public enum type + { + EXTERNAL, INTERNAL, AUDIO, VIDEO, IMAGE, UNKNOWN } - return pos.getText(home_cc.getText()); - } - - /** - * Returns the Number of Words left and right of the Link, in the Bounds of the - * HomeElement of this Link. - */ - public String getContext(int wordsLeft, int wordsRight) { - final String text = home_cc.getText(); - int temp; - - // get the left start position - int posLeft = pos.getStart(); - temp = posLeft - 1; - while (posLeft != 0 && wordsLeft > 0) { - while (temp > 0 && text.charAt(temp) < 48) { - temp--; - } - while (temp > 0 && text.charAt(temp) >= 48) { - temp--; - } - posLeft = (temp > 0 ? temp + 1 : 0); - wordsLeft--; + + public Link(Content home_cc, Span linkPos, String target, type t, List<String> parameters) + { + this.home_cc = home_cc; + this.pos = linkPos; + this.target = target; + this.t = t; + this.parameters = (parameters == null ? 
new ArrayList<>() : parameters); + } + + /** + * Returns the Content Element in wich the Link occures. + */ + public Content getHomeElement() + { + return home_cc; + } + + public Link setHomeElement(Content home_cc) + { + this.home_cc = home_cc; + return this; + } + + /** + * Returns the Type of the Link. + */ + public type getType() + { + return t; + } + + /** + * Retruns the Position Span of the Link, wich refers to getHomeElement().getText(). + */ + public Span getPos() + { + return pos; + } + + /** + * Retruns the Target of the Link. + */ + public String getTarget() + { + return target; + } + + /** + * Returns a List of Parameters for this Link, in most cases the size of the list will be 0. + */ + public List<String> getParameters() + { + return parameters; + } + + /** + * Retruns the Link text or link caption. + */ + public String getText() + { + if (home_cc == null) { + return null; + } + return pos.getText(home_cc.getText()); } - // get the right end position - int posRight = pos.getEnd(); - temp = posRight; - while (posRight != text.length() && wordsRight > 0) { - while (temp < text.length() && text.charAt(temp) < 48) { - temp++; - } - while (temp < text.length() && text.charAt(temp) >= 48) { - temp++; - } - posRight = temp; - wordsRight--; + /** + * Returns the Number of Words left and right of the Link, in the Bounds of the HomeElement of + * this Link. + */ + public String getContext(int wordsLeft, int wordsRight) + { + final String text = home_cc.getText(); + int temp; + + // get the left start position + int posLeft = pos.getStart(); + temp = posLeft - 1; + while (posLeft != 0 && wordsLeft > 0) { + while (temp > 0 && text.charAt(temp) < 48) { + temp--; + } + while (temp > 0 && text.charAt(temp) >= 48) { + temp--; + } + posLeft = (temp > 0 ? temp + 1 : 0); + wordsLeft--; + } + + // get the right end position + int posRight = pos.getEnd(); + temp = posRight; + while (posRight != text.length() && wordsRight > 0) { + while (temp < text.length() && text.charAt(temp) < 48) { + temp++; + } + while (temp < text.length() && text.charAt(temp) >= 48) { + temp++; + } + posRight = temp; + wordsRight--; + } + + // retrun a string... + return text.substring(posLeft, pos.getStart()) + text.substring(pos.getEnd(), posRight); } - // retrun a string... 
- return - text.substring(posLeft, pos.getStart()) + - text.substring(pos.getEnd(), posRight); - } - - @Override - public String toString() { - StringBuilder result = new StringBuilder(); - result.append("LI_TYPE: " + t); - result.append("\nLI_TARGET: \"" + target + "\""); - result.append("\nLI_TEXT: \"" + getText() + "\""); - result.append("\nLI_POSITION: \"" + pos + "\""); - result.append("\nLI_PARAMETERS: " + parameters.size()); - for (String s : parameters) { - result.append("\nLI_PARAMETER: \"" + s + "\""); + @Override + public String toString() + { + StringBuilder result = new StringBuilder(); + result.append("LI_TYPE: " + t); + result.append("\nLI_TARGET: \"" + target + "\""); + result.append("\nLI_TEXT: \"" + getText() + "\""); + result.append("\nLI_POSITION: \"" + pos + "\""); + result.append("\nLI_PARAMETERS: " + parameters.size()); + for (String s : parameters) { + result.append("\nLI_PARAMETER: \"" + s + "\""); + } + return result.toString(); } - return result.toString(); - } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/LinkAnchorExtractor.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/LinkAnchorExtractor.java index 9726fd80..88b8a9e3 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/LinkAnchorExtractor.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/LinkAnchorExtractor.java @@ -29,93 +29,99 @@ import org.dkpro.jwpl.parser.mediawiki.MediaWikiParser; import org.dkpro.jwpl.parser.mediawiki.MediaWikiParserFactory; -public class LinkAnchorExtractor { +public class LinkAnchorExtractor +{ - private final MediaWikiParser parser; + private final MediaWikiParser parser; - public LinkAnchorExtractor() { - MediaWikiParserFactory pf = new MediaWikiParserFactory(Language.english); - parser = pf.createParser(); - } + public LinkAnchorExtractor() + { + MediaWikiParserFactory pf = new MediaWikiParserFactory(Language.english); + parser = pf.createParser(); + } - public LinkAnchorExtractor(Language lang) { - MediaWikiParserFactory pf = new MediaWikiParserFactory(lang); - parser = pf.createParser(); - } + public LinkAnchorExtractor(Language lang) + { + MediaWikiParserFactory pf = new MediaWikiParserFactory(lang); + parser = pf.createParser(); + } - public LinkAnchorExtractor(MediaWikiParser parser) { - this.parser = parser; - } + public LinkAnchorExtractor(MediaWikiParser parser) + { + this.parser = parser; + } - /** - * Note that this method only returns the anchors that are not equal to the page's title. - * Anchors might contain references to sections in an article in the form of "Page#Section". - * If you need the plain title, e.g. for checking whether the page exists in Wikipedia, the Title object can be used. - * - * @return A set of strings used as anchor texts in links pointing to that page. - * @throws WikiTitleParsingException - */ - public Set<String> getInlinkAnchors(Page page) - throws WikiTitleParsingException { - Set<String> inAnchors = new HashSet<>(); - for (Page p : page.getInlinks()) { - ParsedPage pp = parser.parse(p.getText()); - if (pp == null) { - return inAnchors; - } - for (Link l : pp.getLinks()) { - String pageTitle = page.getTitle().getPlainTitle(); + /** + * Note that this method only returns the anchors that are not equal to the page's title. + * Anchors might contain references to sections in an article in the form of "Page#Section". If + * you need the plain title, e.g. for checking whether the page exists in Wikipedia, the Title + * object can be used. 
+ * + * @return A set of strings used as anchor texts in links pointing to that page. + * @throws WikiTitleParsingException + */ + public Set<String> getInlinkAnchors(Page page) throws WikiTitleParsingException + { + Set<String> inAnchors = new HashSet<>(); + for (Page p : page.getInlinks()) { + ParsedPage pp = parser.parse(p.getText()); + if (pp == null) { + return inAnchors; + } + for (Link l : pp.getLinks()) { + String pageTitle = page.getTitle().getPlainTitle(); - String anchorText = l.getText(); - if (l.getTarget().equals(pageTitle) && !anchorText.equals(pageTitle)) { - inAnchors.add(anchorText); + String anchorText = l.getText(); + if (l.getTarget().equals(pageTitle) && !anchorText.equals(pageTitle)) { + inAnchors.add(anchorText); + } + } } - } + return inAnchors; } - return inAnchors; - } - /** - * Note that this method only returns the anchors that are not equal to the title of the page - * they are pointing to. - * Anchors might contain references to sections in an article in the form of "Page#Section". - * If you need the plain title, e.g. for checking whether the page exists in Wikipedia, the Title object can be used. - * - * @return A mapping from the page titles of links in that page to the anchor texts used in the - * links. - * @throws WikiTitleParsingException - */ - public Map<String, Set<String>> getOutlinkAnchors(Page page) - throws WikiTitleParsingException { - Map<String, Set<String>> outAnchors = new HashMap<>(); - ParsedPage pp = parser.parse(page.getText()); - if (pp == null) { - return outAnchors; - } - for (Link l : pp.getLinks()) { - if (l.getTarget().length() == 0) { - continue; - } + /** + * Note that this method only returns the anchors that are not equal to the title of the page + * they are pointing to. Anchors might contain references to sections in an article in the form + * of "Page#Section". If you need the plain title, e.g. for checking whether the page exists in + * Wikipedia, the Title object can be used. + * + * @return A mapping from the page titles of links in that page to the anchor texts used in the + * links. 
+ * @throws WikiTitleParsingException + */ + public Map<String, Set<String>> getOutlinkAnchors(Page page) throws WikiTitleParsingException + { + Map<String, Set<String>> outAnchors = new HashMap<>(); + ParsedPage pp = parser.parse(page.getText()); + if (pp == null) { + return outAnchors; + } + for (Link l : pp.getLinks()) { + if (l.getTarget().length() == 0) { + continue; + } - String targetTitle = new Title(l.getTarget()).getPlainTitle(); - if (!l.getType().equals(Link.type.EXTERNAL) && !l.getType().equals(Link.type.IMAGE) - && !l.getType().equals(Link.type.AUDIO) && !l.getType().equals(Link.type.VIDEO) - && !targetTitle.contains(":")) // Wikipedia titles only contain colons if they - // are categories or other meta data - { - String anchorText = l.getText(); - if (!anchorText.equals(targetTitle)) { - Set<String> anchors; - if (outAnchors.containsKey(targetTitle)) { - anchors = outAnchors.get(targetTitle); - } else { - anchors = new HashSet<>(); - } - anchors.add(anchorText); - outAnchors.put(targetTitle, anchors); + String targetTitle = new Title(l.getTarget()).getPlainTitle(); + if (!l.getType().equals(Link.type.EXTERNAL) && !l.getType().equals(Link.type.IMAGE) + && !l.getType().equals(Link.type.AUDIO) && !l.getType().equals(Link.type.VIDEO) + && !targetTitle.contains(":")) // Wikipedia titles only contain colons if they + // are categories or other meta data + { + String anchorText = l.getText(); + if (!anchorText.equals(targetTitle)) { + Set<String> anchors; + if (outAnchors.containsKey(targetTitle)) { + anchors = outAnchors.get(targetTitle); + } + else { + anchors = new HashSet<>(); + } + anchors.add(anchorText); + outAnchors.put(targetTitle, anchors); + } + } } - } + return outAnchors; } - return outAnchors; - } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedList.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedList.java index b56b7b40..a9a0f610 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedList.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedList.java @@ -18,12 +18,14 @@ package org.dkpro.jwpl.parser; /** - * A NestedList can contain ContentElements or other NestedLists, - * for this purpose and to avoid a improper use, this interface has been created.<br> + * A NestedList can contain ContentElements or other NestedLists, for this purpose and to avoid a + * improper use, this interface has been created.<br> * <p> * Now, we got a NestedListContainer wich contains NestedLists<br> * A NestedList can be a NestedListContainer or a NestedListElement. */ -public interface NestedList extends Content { - SrcSpan getSrcSpan(); +public interface NestedList + extends Content +{ + SrcSpan getSrcSpan(); } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListContainer.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListContainer.java index 42c66798..8bdb68d8 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListContainer.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListContainer.java @@ -23,52 +23,65 @@ /** * Take a Look a NestedList description first. 
*/ -public class NestedListContainer extends ContentContainer implements NestedList { +public class NestedListContainer + extends ContentContainer + implements NestedList +{ - private final List<NestedList> lists; - private final boolean numbered; + private final List<NestedList> lists; + private final boolean numbered; - public NestedListContainer(boolean numbered) { - this.ccl = new ArrayList<>(); - this.lists = new ArrayList<>(); - this.numbered = numbered; - } + public NestedListContainer(boolean numbered) + { + this.ccl = new ArrayList<>(); + this.lists = new ArrayList<>(); + this.numbered = numbered; + } - /** - * Returns if the NestedList is a numbered or a unnumbered/pointed NestedList - */ - public boolean isNumbered() { - return numbered; - } + /** + * Returns if the NestedList is a numbered or a unnumbered/pointed NestedList + */ + public boolean isNumbered() + { + return numbered; + } - /** - * Returns the NestedListContainer or NestedListElement at Positon i. - */ - public NestedList getNestedList(int i) { - if (i < lists.size()) return lists.get(i); - else return null; - } + /** + * Returns the NestedListContainer or NestedListElement at Positon i. + */ + public NestedList getNestedList(int i) + { + if (i < lists.size()) + return lists.get(i); + else + return null; + } - public void add(NestedList nl) { - lists.add(nl); - ccl.add(nl); - } + public void add(NestedList nl) + { + lists.add(nl); + ccl.add(nl); + } - public void remove(NestedList nl) { - lists.remove(nl); - ccl.remove(nl); - } + public void remove(NestedList nl) + { + lists.remove(nl); + ccl.remove(nl); + } - public List<NestedList> getNestedLists() { - return new ArrayList<>(lists); - } + public List<NestedList> getNestedLists() + { + return new ArrayList<>(lists); + } - public String toString() { - StringBuilder result = new StringBuilder(); - result.append("NLS_NUMBERD: " + numbered); - result.append("\nNLS_CONTENT: false"); - result.append("\nNLS_NESTEDTLISTS: " + lists.size()); - for (NestedList l : lists) result.append("\nNLS_NESTEDLIST:\n" + l); - return result.toString(); - } + public String toString() + { + StringBuilder result = new StringBuilder(); + result.append("NLS_NUMBERD: " + numbered); + result.append("\nNLS_CONTENT: false"); + result.append("\nNLS_NESTEDTLISTS: " + lists.size()); + for (NestedList l : lists) + result.append("\nNLS_NESTEDLIST:\n" + l); + return result.toString(); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListElement.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListElement.java index 7ab89a79..4b8a0d4c 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListElement.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/NestedListElement.java @@ -20,9 +20,13 @@ /** * This is a simple ContentElement, wich occures in a NestedList. 
*/ -public class NestedListElement extends ContentElement implements NestedList { +public class NestedListElement + extends ContentElement + implements NestedList +{ - public String toString() { - return "NLC_IS_CONTENT: true\n" + super.toString(); - } + public String toString() + { + return "NLC_IS_CONTENT: true\n" + super.toString(); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Paragraph.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Paragraph.java index 89694505..0e291bb5 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Paragraph.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Paragraph.java @@ -20,33 +20,43 @@ /** * This is a simple ContentElement extende with a Paragraph Type. */ -public class Paragraph extends ContentElement { - - public enum type {NORMAL, BOXED, INDENTED} - - private type t; - - public Paragraph() { - super(); - } - - public Paragraph(type t) { - super(); - this.t = t; - } - - public String toString() { - StringBuilder result = new StringBuilder(); - result.append(super.toString()); - result.append(System.getProperty("line.separator") + "PA_TYPE: " + t); - return result.toString(); - } - - public void setType(type t) { - this.t = t; - } - - public type getType() { - return t; - } +public class Paragraph + extends ContentElement +{ + + public enum type + { + NORMAL, BOXED, INDENTED + } + + private type t; + + public Paragraph() + { + super(); + } + + public Paragraph(type t) + { + super(); + this.t = t; + } + + public String toString() + { + StringBuilder result = new StringBuilder(); + result.append(super.toString()); + result.append(System.getProperty("line.separator") + "PA_TYPE: " + t); + return result.toString(); + } + + public void setType(type t) + { + this.t = t; + } + + public type getType() + { + return t; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPage.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPage.java index 7663b78d..7576d2a8 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPage.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPage.java @@ -25,384 +25,439 @@ /** * Provides access to structured information about a MediaWiki article page. */ -public class ParsedPage { - - private String name; - private int pageId; - private SectionContainer superSection; - private ContentElement categories; - private ContentElement languages; - - private int firstParagraphNr; -// private ContentElement aboutArticle; - - /** - * Constructor for a blank ParsedPage.<br> - * Only needed, if you want to create a Wikipedia article from scratch. - * <p> - * Creating a ParsedPage from a Wikipedia article requires to create a parser object first. - */ - public ParsedPage() { - this.superSection = new SectionContainer(null, 0); - } - - /** - * Sets the name of a parsed page. - * - * @param name A name for the parsed page. - */ - public void setName(String name) { - this.name = name; - } - - /** - * The name of a parsed page. - * - * @return The name of a parsed page. - */ - public String getName() { - return name; - } - - /** - * Set the pageId of a parsed page. - * - * @param pageId A pageId for the parsed page. - */ - public void setPageId(int pageId) { - this.pageId = pageId; - } - - /** - * The pageId of a parsed page. - * - * @return The pageId of a parsed page. - */ - public int getPageId() { - return pageId; - } - - /** - * Sets the category element of a parsed page. 
- * - * @param categories A ContentElement containg the categories of a page. - */ - public void setCategoryElement(ContentElement categories) { - this.categories = categories; - } - - /** - * The ContentElement with links to a page's categories. - * - * @return The ContentElement with links to a page's categories. - */ - public ContentElement getCategoryElement() { - return this.categories; - } - - /** - * Returns a list of category Link objects. - * This is a shortcut for writing getCategoryElemement.getLinks(); - * - * @return A list of category links. - */ - public List<Link> getCategories() { - if (categories == null) { - return new ArrayList<>(); - } - - return categories.getLinks(); - } - - /** - * Sets the number of the first paragraph. - * - * @param nr The number of the first paragraph. - */ - public void setFirstParagraphNr(int nr) { - this.firstParagraphNr = nr; - } - - /** - * Returns the number of the first paragraph. - * - * @return The number of the first paragraph. - */ - public int getFirstParagraphNr() { - return firstParagraphNr; - } - - /** - * Returns the first paragraph.<br> - * This is a shortcut for getParagraph( getFirstParagraphNr() ). - * It is <b>not</b> the same as getParagraph( 0 ), because the physically first paragraph often contain tables etc. - */ - public Paragraph getFirstParagraph() { - return this.getParagraph(firstParagraphNr); - } - - /** - * Sets the language element of a parsed page. - * - * @param languages A ContentElement containg the languages of a page. - */ - public void setLanguagesElement(ContentElement languages) { - this.languages = languages; - } - - /** - * Returns a ContentElement containing the languages that are linked inside the article. - * - * @return A ContentElement containing the languages that are linked inside the article. - */ - public ContentElement getLanguagesElement() { - return languages; - } - - /** - * Returns a list of language Link objects. - * This is a shortcut for writing getLanguagesElement().getLinks(); - */ - public List<Link> getLanguages() { - return languages.getLinks(); - } - -//// I do not think that this should be a core api method, as it is language and template dependend. (TZ) -// /** -// * Returns a ContentElement with the Content of "Dieser Artikel" Template -// */ -// public ContentElement aboutArticle(){ -// return aboutArticle; -// } -// -// /** -// * See aboutArticle() for Details... -// */ -// public void setAboutArticle(ContentElement aboutArticle){ -// this.aboutArticle = aboutArticle; -// } - - - /** - * Sets the Sections of a ParsedPage. - * - * @param sections A list of sections. - */ - public void setSections(List<Section> sections) { - for (Section s : sections) superSection.addSection(s); - } - - /** - * Set the Sections of the ParsedPage.<br> - * This function is used to upgrade a SectionContainer to a ParsedPage. - * - * @param s A sectionContainer. - */ - public void setSections(SectionContainer s) { - superSection = s; - } - -// TODO What means lowest level? => TZ: I think it means "highest" semantically and "lowest" in numbering (e.g. <h1>). - - /** - * Returns the requested Section of the lowest level. - * - * @param i The number of the section. - * @return The section with number i. - */ - public Section getSection(int i) { - return superSection.getSubSection(i); - } - - /** - * Retruns a list of all Sections of the lowest level. - * - * @return A list of sections. 
- */ - public List<Section> getSections() { - return superSection.getSubSections(); - } - - /* - * Returns pageId and name in a String - */ - public String toString() { - return "ParsedPage " + pageId + " " + name; - } - - /** - * Returns the number of paragraphs. - * - * @return The number of paragraphs. - */ - public int nrOfParagraphs() { - return superSection.nrOfParagraphs(); - } - - /** - * Returns the paragraph indicated by the parameter i. - * - * @param i The number of the paragraph to return. - * @return The paragraph with number i. - */ - public Paragraph getParagraph(int i) { - return superSection.getParagraph(i); - } - - /** - * Returns a list of paragraphs. - * - * @return A list of paragraphs. - */ - public List<Paragraph> getParagraphs() { - return superSection.getParagraphs(); - } - - /** - * Returns the number of tables. - * - * @return The number of tables. - */ - public int nrOfTables() { - return superSection.nrOfTables(); - } - - /** - * Returns the table indicated by the parameter i. - * - * @param i The number of the table to return. - * @return The table with number i. - */ - public Table getTable(int i) { - return superSection.getTable(i); - } - - /** - * Returns a list of tables. - * - * @return A list of tables. - */ - public List<Table> getTables() { - return superSection.getTables(); - } - - /** - * Returns the number of nested lists. - * - * @return The number of nested lists. - */ - public int nrOfNestedLists() { - return superSection.nrOfNestedLists(); - } - - /** - * Returns the nested list indicated by the parameter i. - * - * @param i The number of the nested list to return. - * @return The nested list with number i. - */ - public NestedList getNestedList(int i) { - return superSection.getNestedList(i); - } - - /** - * Returns a list of nested lists. - * - * @return A list of nested lists. - */ - public List<NestedListContainer> getNestedLists() { - return superSection.getNestedLists(); - } - - /** - * Returns the number of definition lists. - * - * @return The number of definition lists. - */ - public int nrOfDefinitionLists() { - return superSection.nrOfDefinitionLists(); - } - - /** - * Returns the definition list indicated by the parameter i. - * - * @param i The number of the definition list to return. - * @return The definition list with number i. - */ - public DefinitionList getDefinitionList(int i) { - return superSection.getDefinitionList(i); - } - - /** - * Returns a list of definition lists. - * - * @return A list of definition lists. - */ - public List<DefinitionList> getDefinitionLists() { - return superSection.getDefinitionLists(); - } - - /** - * Return the plain text. - * - * @return The plain text. - */ - public String getText() { - return superSection.getText(); - } - -//// TODO we should not need that as we could call getText on the span itself. -// /** -// * Look at the SAME function in SectionContainer for Details... -// */ -// public String getText( List<Span> sl ){ return superSection.getText( sl ); } - - - /** - * Returns the length of the text in characters. - * - * @return The length of the text in characters. - */ - public int length() { - return superSection.length(); - } - - public List<FormatType> getFormats() { - return superSection.getFormats(); - } - -////I do not know what these are for and they are never used (TZ). 
-// public List<FormatType> getFormats(int begin, int end){ return superSection.getFormats(begin,end); } -// public List<FormatType> getFormats(Span s){ return superSection.getFormats(s); } - - public List<Span> getFormatSpans(FormatType t) { - return superSection.getFormatSpans(t); - } - -////I do not know what these are for and they are never used (TZ). -// public List<Span> getFormatSpans(FormatType t, int start, int end ){ return superSection.getFormatSpans(t, start, end); } -// public List<Span> getFormatSpans(FormatType t, Span s){ return superSection.getFormatSpans(t, s); } - - - public List<Link> getLinks() { - return superSection.getLinks(); - } - -////I do not know what these are for and they are never used (TZ). -// public List<Link> getLinks(Link.type t){ return superSection.getLinks(t); } -// public List<Link> getLinks(Link.type t, int begin, int end){ return superSection.getLinks(t, begin, end); } -// public List<Link> getLinks(Link.type t, Span s){ return superSection.getLinks(t, s); } - - /** - * Returns a list of templates that are used in the page. - * - * @return A list of templates that are used in the page. - */ - public List<Template> getTemplates() { - return superSection.getTemplates(); - } - -//// I do not know what these are for and they are never used (TZ). -// public List<Template> getTemplates(int start, int end){ return superSection.getTemplates(start, end); } -// public List<Template> getTemplates(Span s){ return superSection.getTemplates(s); } +public class ParsedPage +{ + + private String name; + private int pageId; + private SectionContainer superSection; + private ContentElement categories; + private ContentElement languages; + + private int firstParagraphNr; + // private ContentElement aboutArticle; + + /** + * Constructor for a blank ParsedPage.<br> + * Only needed, if you want to create a Wikipedia article from scratch. + * <p> + * Creating a ParsedPage from a Wikipedia article requires to create a parser object first. + */ + public ParsedPage() + { + this.superSection = new SectionContainer(null, 0); + } + + /** + * Sets the name of a parsed page. + * + * @param name + * A name for the parsed page. + */ + public void setName(String name) + { + this.name = name; + } + + /** + * The name of a parsed page. + * + * @return The name of a parsed page. + */ + public String getName() + { + return name; + } + + /** + * Set the pageId of a parsed page. + * + * @param pageId + * A pageId for the parsed page. + */ + public void setPageId(int pageId) + { + this.pageId = pageId; + } + + /** + * The pageId of a parsed page. + * + * @return The pageId of a parsed page. + */ + public int getPageId() + { + return pageId; + } + + /** + * Sets the category element of a parsed page. + * + * @param categories + * A ContentElement containg the categories of a page. + */ + public void setCategoryElement(ContentElement categories) + { + this.categories = categories; + } + + /** + * The ContentElement with links to a page's categories. + * + * @return The ContentElement with links to a page's categories. + */ + public ContentElement getCategoryElement() + { + return this.categories; + } + + /** + * Returns a list of category Link objects. This is a shortcut for writing + * getCategoryElemement.getLinks(); + * + * @return A list of category links. + */ + public List<Link> getCategories() + { + if (categories == null) { + return new ArrayList<>(); + } + + return categories.getLinks(); + } + + /** + * Sets the number of the first paragraph. 
+ * + * @param nr + * The number of the first paragraph. + */ + public void setFirstParagraphNr(int nr) + { + this.firstParagraphNr = nr; + } + + /** + * Returns the number of the first paragraph. + * + * @return The number of the first paragraph. + */ + public int getFirstParagraphNr() + { + return firstParagraphNr; + } + + /** + * Returns the first paragraph.<br> + * This is a shortcut for getParagraph( getFirstParagraphNr() ). It is <b>not</b> the same as + * getParagraph( 0 ), because the physically first paragraph often contain tables etc. + */ + public Paragraph getFirstParagraph() + { + return this.getParagraph(firstParagraphNr); + } + + /** + * Sets the language element of a parsed page. + * + * @param languages + * A ContentElement containg the languages of a page. + */ + public void setLanguagesElement(ContentElement languages) + { + this.languages = languages; + } + + /** + * Returns a ContentElement containing the languages that are linked inside the article. + * + * @return A ContentElement containing the languages that are linked inside the article. + */ + public ContentElement getLanguagesElement() + { + return languages; + } + + /** + * Returns a list of language Link objects. This is a shortcut for writing + * getLanguagesElement().getLinks(); + */ + public List<Link> getLanguages() + { + return languages.getLinks(); + } + + //// I do not think that this should be a core api method, as it is language and template + //// dependend. (TZ) + // /** + // * Returns a ContentElement with the Content of "Dieser Artikel" Template + // */ + // public ContentElement aboutArticle(){ + // return aboutArticle; + // } + // + // /** + // * See aboutArticle() for Details... + // */ + // public void setAboutArticle(ContentElement aboutArticle){ + // this.aboutArticle = aboutArticle; + // } + + /** + * Sets the Sections of a ParsedPage. + * + * @param sections + * A list of sections. + */ + public void setSections(List<Section> sections) + { + for (Section s : sections) + superSection.addSection(s); + } + + /** + * Set the Sections of the ParsedPage.<br> + * This function is used to upgrade a SectionContainer to a ParsedPage. + * + * @param s + * A sectionContainer. + */ + public void setSections(SectionContainer s) + { + superSection = s; + } + + // TODO What means lowest level? => TZ: I think it means "highest" semantically and "lowest" in + // numbering (e.g. <h1>). + + /** + * Returns the requested Section of the lowest level. + * + * @param i + * The number of the section. + * @return The section with number i. + */ + public Section getSection(int i) + { + return superSection.getSubSection(i); + } + + /** + * Retruns a list of all Sections of the lowest level. + * + * @return A list of sections. + */ + public List<Section> getSections() + { + return superSection.getSubSections(); + } + + /* + * Returns pageId and name in a String + */ + public String toString() + { + return "ParsedPage " + pageId + " " + name; + } + + /** + * Returns the number of paragraphs. + * + * @return The number of paragraphs. + */ + public int nrOfParagraphs() + { + return superSection.nrOfParagraphs(); + } + + /** + * Returns the paragraph indicated by the parameter i. + * + * @param i + * The number of the paragraph to return. + * @return The paragraph with number i. + */ + public Paragraph getParagraph(int i) + { + return superSection.getParagraph(i); + } + + /** + * Returns a list of paragraphs. + * + * @return A list of paragraphs. 
+ */ + public List<Paragraph> getParagraphs() + { + return superSection.getParagraphs(); + } + + /** + * Returns the number of tables. + * + * @return The number of tables. + */ + public int nrOfTables() + { + return superSection.nrOfTables(); + } + + /** + * Returns the table indicated by the parameter i. + * + * @param i + * The number of the table to return. + * @return The table with number i. + */ + public Table getTable(int i) + { + return superSection.getTable(i); + } + + /** + * Returns a list of tables. + * + * @return A list of tables. + */ + public List<Table> getTables() + { + return superSection.getTables(); + } + + /** + * Returns the number of nested lists. + * + * @return The number of nested lists. + */ + public int nrOfNestedLists() + { + return superSection.nrOfNestedLists(); + } + + /** + * Returns the nested list indicated by the parameter i. + * + * @param i + * The number of the nested list to return. + * @return The nested list with number i. + */ + public NestedList getNestedList(int i) + { + return superSection.getNestedList(i); + } + + /** + * Returns a list of nested lists. + * + * @return A list of nested lists. + */ + public List<NestedListContainer> getNestedLists() + { + return superSection.getNestedLists(); + } + + /** + * Returns the number of definition lists. + * + * @return The number of definition lists. + */ + public int nrOfDefinitionLists() + { + return superSection.nrOfDefinitionLists(); + } + + /** + * Returns the definition list indicated by the parameter i. + * + * @param i + * The number of the definition list to return. + * @return The definition list with number i. + */ + public DefinitionList getDefinitionList(int i) + { + return superSection.getDefinitionList(i); + } + + /** + * Returns a list of definition lists. + * + * @return A list of definition lists. + */ + public List<DefinitionList> getDefinitionLists() + { + return superSection.getDefinitionLists(); + } + + /** + * Return the plain text. + * + * @return The plain text. + */ + public String getText() + { + return superSection.getText(); + } + + //// TODO we should not need that as we could call getText on the span itself. + // /** + // * Look at the SAME function in SectionContainer for Details... + // */ + // public String getText( List<Span> sl ){ return superSection.getText( sl ); } + + /** + * Returns the length of the text in characters. + * + * @return The length of the text in characters. + */ + public int length() + { + return superSection.length(); + } + + public List<FormatType> getFormats() + { + return superSection.getFormats(); + } + + //// I do not know what these are for and they are never used (TZ). + // public List<FormatType> getFormats(int begin, int end){ return + //// superSection.getFormats(begin,end); } + // public List<FormatType> getFormats(Span s){ return superSection.getFormats(s); } + + public List<Span> getFormatSpans(FormatType t) + { + return superSection.getFormatSpans(t); + } + + //// I do not know what these are for and they are never used (TZ). + // public List<Span> getFormatSpans(FormatType t, int start, int end ){ return + //// superSection.getFormatSpans(t, start, end); } + // public List<Span> getFormatSpans(FormatType t, Span s){ return superSection.getFormatSpans(t, + //// s); } + + public List<Link> getLinks() + { + return superSection.getLinks(); + } + + //// I do not know what these are for and they are never used (TZ). 
+ // public List<Link> getLinks(Link.type t){ return superSection.getLinks(t); } + // public List<Link> getLinks(Link.type t, int begin, int end){ return superSection.getLinks(t, + //// begin, end); } + // public List<Link> getLinks(Link.type t, Span s){ return superSection.getLinks(t, s); } + + /** + * Returns a list of templates that are used in the page. + * + * @return A list of templates that are used in the page. + */ + public List<Template> getTemplates() + { + return superSection.getTemplates(); + } + + //// I do not know what these are for and they are never used (TZ). + // public List<Template> getTemplates(int start, int end){ return + //// superSection.getTemplates(start, end); } + // public List<Template> getTemplates(Span s){ return superSection.getTemplates(s); } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPageObject.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPageObject.java index 9c13b3c3..4e08ec1c 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPageObject.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/ParsedPageObject.java @@ -18,21 +18,23 @@ package org.dkpro.jwpl.parser; /** - * All clases in parsedpage package, which can be created by a - * parser, extending this class. So it is possible for these - * classes to refer to a SourceCode. + * All clases in parsedpage package, which can be created by a parser, extending this class. So it + * is possible for these classes to refer to a SourceCode. */ -public abstract class ParsedPageObject { - private SrcSpan srcSpan; +public abstract class ParsedPageObject +{ + private SrcSpan srcSpan; - /** - * Returns a Span refering to a SourceCode. - */ - public SrcSpan getSrcSpan() { - return srcSpan; - } + /** + * Returns a Span refering to a SourceCode. + */ + public SrcSpan getSrcSpan() + { + return srcSpan; + } - public void setSrcSpan(SrcSpan srcSpan) { - this.srcSpan = srcSpan; - } + public void setSrcSpan(SrcSpan srcSpan) + { + this.srcSpan = srcSpan; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Section.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Section.java index 8514fc00..a4ec1360 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Section.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Section.java @@ -21,143 +21,153 @@ import java.util.List; /** - * A Section consists at first of a Title. In MediaWiki a Title can contain - * e.g. Links, Images or ItalicText. Therefore a simple ContentElement - * is used as Section Title.<br> + * A Section consists at first of a Title. In MediaWiki a Title can contain e.g. Links, Images or + * ItalicText. Therefore a simple ContentElement is used as Section Title.<br> * The next Point is the hirachical Section Level, which every Section has.<br> * <br> - * Further, a Section can contain other Sections or Content, but not - * both. This is a difference between the API and MediaWiki. In return, the - * accsess to the Elements is possible with just a few functions. This fact - * makes the accest to the provieded Structures very simple.<br> + * Further, a Section can contain other Sections or Content, but not both. This is a difference + * between the API and MediaWiki. In return, the accsess to the Elements is possible with just a few + * functions. This fact makes the accest to the provieded Structures very simple.<br> * <br> * These structure requirements are implemented as SectionContainer an SectionContent. 
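To show how the ParsedPage accessors above are typically combined, here is a hedged usage sketch. It assumes a ParsedPage instance obtained from a parser (as the Javadoc of the blank constructor notes), that Paragraph inherits getText() from ContentElement in the same way Section.getTitle() further down delegates to its title element, and that Link lives in the same org.dkpro.jwpl.parser package as the classes in this patch; the class name is illustrative only.

    import org.dkpro.jwpl.parser.Link;
    import org.dkpro.jwpl.parser.ParsedPage;
    import org.dkpro.jwpl.parser.Paragraph;
    import org.dkpro.jwpl.parser.Section;

    // Illustrative sketch, not part of JWPL.
    class ParsedPageTourSketch
    {
        // Walks the main access paths of a ParsedPage produced by a parser.
        static void tour(ParsedPage pp)
        {
            System.out.println("Page: " + pp.getName() + " (id " + pp.getPageId() + ")");

            // The "first" paragraph is the one flagged by the parser, not index 0.
            Paragraph first = pp.getFirstParagraph();
            if (first != null) {
                System.out.println("Lead: " + first.getText());
            }

            for (Section s : pp.getSections()) {
                System.out.println("Section (level " + s.getLevel() + "): " + s.getTitle());
            }

            for (Link l : pp.getLinks()) {
                System.out.println("Link target: " + l.getTarget());
            }

            System.out.println(pp.nrOfParagraphs() + " paragraphs, " + pp.nrOfTables()
                    + " tables, " + pp.nrOfNestedLists() + " nested lists");
        }
    }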
*/ -public abstract class Section extends ContentContainer { - - private int level; - private ContentElement title; - - public Section(ContentElement title, int level) { - this.ccl = new ArrayList<>(); - this.level = level; - this.title = title; - if (title != null) ccl.add(title); - } - - /** - * Look at getLevel() for Details... - */ - public void setLevel(int level) { - this.level = level; - } - - /** - * Retruns the hirachical Level of this Section. - */ - public int getLevel() { - return level; - } - - /** - * Returns getTitleElement().getText() without NullPointerException - */ - public String getTitle() { - if (title != null) - return title.getText(); - else - return null; - } - - /** - * Look at getTitleElement() for Details... - */ - public void setTitleElement(ContentElement title) { - if (title != null) { - if (this.title == null) ccl.add(0, title); - else ccl.set(0, title); - } else if (this.title != null) ccl.remove(this.title); - - this.title = title; - } - - /** - * Returns a ContentElement representing the content, originally given as - * MediaWiki SourcCode, beween one ore more equality chars at the beginning - * of a line. This is known as Title. - */ - public ContentElement getTitleElement() { - return title; - } - - /** - * Return a List with all Content of any Type in Order of appearance. - */ - public abstract List<Content> getContentList(); - - /** - * Returns the Number of Paragraphs in this Section. - */ - public abstract int nrOfParagraphs(); - - /** - * Returns the i-th Paragraph of this Section. - */ - public abstract Paragraph getParagraph(int i); - - /** - * Retuns a List of all Paragraphs of this Section. - */ - public abstract List<Paragraph> getParagraphs(); - - /** - * Returns the Number of Tables of this Section. - */ - public abstract int nrOfTables(); - - /** - * Returns the i-th Table of this Section. - */ - public abstract Table getTable(int i); - - /** - * Returns a List of all Tables of this Section. - */ - public abstract List<Table> getTables(); - - /** - * Returns the Number of NestedLists of this Section. - */ - public abstract int nrOfNestedLists(); - - /** - * Returns the i-th NestedList of this Section as NestedListContainer. - */ - public abstract NestedListContainer getNestedList(int i); - - /** - * Returns a List of all NestedLists of this Section. - */ - public abstract List<NestedListContainer> getNestedLists(); - - /** - * Returns the Number of DefinitionLists of this Section. - */ - public abstract int nrOfDefinitionLists(); - - /** - * Returns the i-th Table of this Section. - */ - public abstract DefinitionList getDefinitionList(int i); - - /** - * Returns a List of all DefinitionLists of this Section. - */ - public abstract List<DefinitionList> getDefinitionLists(); - - /** - * Returns a sequence of Chars followed by ZERO. - * For easy handling the result is of the Type String. - */ - public abstract String toString(); +public abstract class Section + extends ContentContainer +{ + + private int level; + private ContentElement title; + + public Section(ContentElement title, int level) + { + this.ccl = new ArrayList<>(); + this.level = level; + this.title = title; + if (title != null) + ccl.add(title); + } + + /** + * Look at getLevel() for Details... + */ + public void setLevel(int level) + { + this.level = level; + } + + /** + * Retruns the hirachical Level of this Section. 
+ */ + public int getLevel() + { + return level; + } + + /** + * Returns getTitleElement().getText() without NullPointerException + */ + public String getTitle() + { + if (title != null) + return title.getText(); + else + return null; + } + + /** + * Look at getTitleElement() for Details... + */ + public void setTitleElement(ContentElement title) + { + if (title != null) { + if (this.title == null) + ccl.add(0, title); + else + ccl.set(0, title); + } + else if (this.title != null) + ccl.remove(this.title); + + this.title = title; + } + + /** + * Returns a ContentElement representing the content, originally given as MediaWiki SourcCode, + * beween one ore more equality chars at the beginning of a line. This is known as Title. + */ + public ContentElement getTitleElement() + { + return title; + } + + /** + * Return a List with all Content of any Type in Order of appearance. + */ + public abstract List<Content> getContentList(); + + /** + * Returns the Number of Paragraphs in this Section. + */ + public abstract int nrOfParagraphs(); + + /** + * Returns the i-th Paragraph of this Section. + */ + public abstract Paragraph getParagraph(int i); + + /** + * Retuns a List of all Paragraphs of this Section. + */ + public abstract List<Paragraph> getParagraphs(); + + /** + * Returns the Number of Tables of this Section. + */ + public abstract int nrOfTables(); + + /** + * Returns the i-th Table of this Section. + */ + public abstract Table getTable(int i); + + /** + * Returns a List of all Tables of this Section. + */ + public abstract List<Table> getTables(); + + /** + * Returns the Number of NestedLists of this Section. + */ + public abstract int nrOfNestedLists(); + + /** + * Returns the i-th NestedList of this Section as NestedListContainer. + */ + public abstract NestedListContainer getNestedList(int i); + + /** + * Returns a List of all NestedLists of this Section. + */ + public abstract List<NestedListContainer> getNestedLists(); + + /** + * Returns the Number of DefinitionLists of this Section. + */ + public abstract int nrOfDefinitionLists(); + + /** + * Returns the i-th Table of this Section. + */ + public abstract DefinitionList getDefinitionList(int i); + + /** + * Returns a List of all DefinitionLists of this Section. + */ + public abstract List<DefinitionList> getDefinitionLists(); + + /** + * Returns a sequence of Chars followed by ZERO. For easy handling the result is of the Type + * String. + */ + public abstract String toString(); } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContainer.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContainer.java index 0b07b944..9085eaa4 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContainer.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContainer.java @@ -22,170 +22,209 @@ /** * This is the structure implementation of Section.<br> - * A SectionContainer contains SubSections of type Section, which - * can be either, a SectionContent or anoter SectionContainer.<br> - * For a description of the inherited functions of Section, take a - * look at the Documentation of Section. + * A SectionContainer contains SubSections of type Section, which can be either, a SectionContent or + * anoter SectionContainer.<br> + * For a description of the inherited functions of Section, take a look at the Documentation of + * Section. 
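The SectionContainer methods that follow answer index-based queries such as getParagraph(i) by walking the subsections and subtracting the number of elements each one contributes until the index falls into range. A small self-contained sketch of that flattened-indexing pattern, using plain string lists instead of the parser types (all names here are illustrative, not part of the API):

    import java.util.Arrays;
    import java.util.List;

    // Illustrative sketch, not part of JWPL.
    class FlattenedIndexSketch
    {
        // Mirrors SectionContainer.getParagraph(i): each child contributes
        // size() elements; whole children are skipped until the index fits.
        static String get(List<List<String>> children, int i)
        {
            int offset = 0;
            for (List<String> child : children) {
                int nr = child.size();
                if (nr + offset > i) {
                    return child.get(i - offset);
                }
                offset += nr;
            }
            return null; // same out-of-range behaviour as the parser classes
        }

        public static void main(String[] args)
        {
            List<List<String>> sections = Arrays.asList(
                    Arrays.asList("p0", "p1"), Arrays.asList("p2"));
            System.out.println(get(sections, 2)); // prints p2
        }
    }

The same offset pattern is reused below for tables, nested lists and definition lists.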
*/ -public class SectionContainer extends Section { - - private final List<Section> sections; - - public SectionContainer(int level) { - super(null, level); - sections = new ArrayList<>(); - } - - public SectionContainer(ContentElement title, int level) { - super(title, level); - sections = new ArrayList<>(); - } - - /** - * Returns the Number of SubSection of this Section. - */ - public int nrOfSubSections() { - return sections.size(); - } - - /** - * Adds a SubSection after the last SubSection. - */ - public void addSection(Section s) { - sections.add(s); - ccl.add(s); - } - - /** - * Removes the specified Section. - */ - public void removeSection(Section s) { - sections.remove(s); - ccl.remove(s); - } - - /** - * Returns the i�th SubSection of this Section. - */ - public Section getSubSection(int i) { - if (sections.size() > i) return sections.get(i); - else return null; - } - - /** - * Returns a List of all SubSections of the next level. - */ - public List<Section> getSubSections() { - return new ArrayList<>(sections); - } - - /* (non-Javadoc) - * @see org.tud.ukp.wikipedia.api.pageparser.Section#getContentList() - */ - public List<Content> getContentList() { - return new ArrayList<>(ccl); - } - - public int nrOfParagraphs() { - int result = 0; - for (Section s : sections) result += s.nrOfParagraphs(); - return result; - } - - public Paragraph getParagraph(int i) { - int nr; - int offset = 0; - for (Section s : sections) { - nr = s.nrOfParagraphs(); - if (nr + offset > i) return s.getParagraph(i - offset); - offset += nr; - } - return null; - } - - public List<Paragraph> getParagraphs() { - List<Paragraph> result = new ArrayList<>(); - for (Section s : sections) result.addAll(s.getParagraphs()); - return result; - } - - public int nrOfTables() { - int result = 0; - for (Section s : sections) result += s.nrOfTables(); - return result; - } - - public Table getTable(int i) { - int nr; - int offset = 0; - for (Section s : sections) { - nr = s.nrOfTables(); - if (nr + offset > i) return s.getTable(i - offset); - offset += nr; - } - return null; - } - - public List<Table> getTables() { - List<Table> result = new ArrayList<>(); - for (Section s : sections) result.addAll(s.getTables()); - return result; - } - - public int nrOfNestedLists() { - int result = 0; - for (Section s : sections) result += s.nrOfNestedLists(); - return result; - } - - public NestedListContainer getNestedList(int i) { - int nr; - int offset = 0; - for (Section s : sections) { - nr = s.nrOfNestedLists(); - if (nr + offset > i) return s.getNestedList(i - offset); - offset += nr; - } - return null; - } - - public List<NestedListContainer> getNestedLists() { - List<NestedListContainer> result = new ArrayList<>(); - for (Section s : sections) result.addAll(s.getNestedLists()); - return result; - } - - public int nrOfDefinitionLists() { - int result = 0; - for (Section s : sections) result += s.nrOfDefinitionLists(); - return result; - } - - public DefinitionList getDefinitionList(int i) { - int nr; - int offset = 0; - for (Section s : sections) { - nr = s.nrOfDefinitionLists(); - if (nr + offset > i) return s.getDefinitionList(i - offset); - offset += nr; - } - return null; - } - - public List<DefinitionList> getDefinitionLists() { - List<DefinitionList> result = new ArrayList<>(); - for (Section s : sections) result.addAll(s.getDefinitionLists()); - return result; - } - - public String toString() { - StringBuilder result = new StringBuilder(); - result.append("SS_TITLE:\n" + this.getTitleElement()); - 
result.append("\nSS_LEVEL: " + this.getLevel()); - result.append("\nSS_SUBSECTIONS: " + sections.size()); - for (Section s : sections) - result.append("\nSS_SUBSECTION:\n" + s.toString()); - - return result.toString(); - } +public class SectionContainer + extends Section +{ + + private final List<Section> sections; + + public SectionContainer(int level) + { + super(null, level); + sections = new ArrayList<>(); + } + + public SectionContainer(ContentElement title, int level) + { + super(title, level); + sections = new ArrayList<>(); + } + + /** + * Returns the Number of SubSection of this Section. + */ + public int nrOfSubSections() + { + return sections.size(); + } + + /** + * Adds a SubSection after the last SubSection. + */ + public void addSection(Section s) + { + sections.add(s); + ccl.add(s); + } + + /** + * Removes the specified Section. + */ + public void removeSection(Section s) + { + sections.remove(s); + ccl.remove(s); + } + + /** + * Returns the i�th SubSection of this Section. + */ + public Section getSubSection(int i) + { + if (sections.size() > i) + return sections.get(i); + else + return null; + } + + /** + * Returns a List of all SubSections of the next level. + */ + public List<Section> getSubSections() + { + return new ArrayList<>(sections); + } + + /* + * (non-Javadoc) + * + * @see org.tud.ukp.wikipedia.api.pageparser.Section#getContentList() + */ + public List<Content> getContentList() + { + return new ArrayList<>(ccl); + } + + public int nrOfParagraphs() + { + int result = 0; + for (Section s : sections) + result += s.nrOfParagraphs(); + return result; + } + + public Paragraph getParagraph(int i) + { + int nr; + int offset = 0; + for (Section s : sections) { + nr = s.nrOfParagraphs(); + if (nr + offset > i) + return s.getParagraph(i - offset); + offset += nr; + } + return null; + } + + public List<Paragraph> getParagraphs() + { + List<Paragraph> result = new ArrayList<>(); + for (Section s : sections) + result.addAll(s.getParagraphs()); + return result; + } + + public int nrOfTables() + { + int result = 0; + for (Section s : sections) + result += s.nrOfTables(); + return result; + } + + public Table getTable(int i) + { + int nr; + int offset = 0; + for (Section s : sections) { + nr = s.nrOfTables(); + if (nr + offset > i) + return s.getTable(i - offset); + offset += nr; + } + return null; + } + + public List<Table> getTables() + { + List<Table> result = new ArrayList<>(); + for (Section s : sections) + result.addAll(s.getTables()); + return result; + } + + public int nrOfNestedLists() + { + int result = 0; + for (Section s : sections) + result += s.nrOfNestedLists(); + return result; + } + + public NestedListContainer getNestedList(int i) + { + int nr; + int offset = 0; + for (Section s : sections) { + nr = s.nrOfNestedLists(); + if (nr + offset > i) + return s.getNestedList(i - offset); + offset += nr; + } + return null; + } + + public List<NestedListContainer> getNestedLists() + { + List<NestedListContainer> result = new ArrayList<>(); + for (Section s : sections) + result.addAll(s.getNestedLists()); + return result; + } + + public int nrOfDefinitionLists() + { + int result = 0; + for (Section s : sections) + result += s.nrOfDefinitionLists(); + return result; + } + + public DefinitionList getDefinitionList(int i) + { + int nr; + int offset = 0; + for (Section s : sections) { + nr = s.nrOfDefinitionLists(); + if (nr + offset > i) + return s.getDefinitionList(i - offset); + offset += nr; + } + return null; + } + + public List<DefinitionList> getDefinitionLists() + { + 
List<DefinitionList> result = new ArrayList<>(); + for (Section s : sections) + result.addAll(s.getDefinitionLists()); + return result; + } + + public String toString() + { + StringBuilder result = new StringBuilder(); + result.append("SS_TITLE:\n" + this.getTitleElement()); + result.append("\nSS_LEVEL: " + this.getLevel()); + result.append("\nSS_SUBSECTIONS: " + sections.size()); + for (Section s : sections) + result.append("\nSS_SUBSECTION:\n" + s.toString()); + + return result.toString(); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContent.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContent.java index c622106d..789bf23a 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContent.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SectionContent.java @@ -23,144 +23,183 @@ /** * This is a Implementation of the Content of a Section.<br> * For every content accsess function exists a setter and a remover function.<br> - * For a description of the inherited functions of Section, take a - * look at the Documentation of Section. + * For a description of the inherited functions of Section, take a look at the Documentation of + * Section. */ -public class SectionContent extends Section { - - private List<Paragraph> paragraphs; - private List<Table> tables; - private List<NestedListContainer> nestedLists; - private List<DefinitionList> definitionLists; - - public SectionContent(int level) { - super(null, level); - init(); - } - - public SectionContent(ContentElement title, int level) { - super(title, level); - init(); - } - - private void init() { - paragraphs = new ArrayList<>(); - tables = new ArrayList<>(); - nestedLists = new ArrayList<>(); - definitionLists = new ArrayList<>(); - } - - public List<Content> getContentList() { - return new ArrayList<>(ccl); - } - - public int nrOfParagraphs() { - return paragraphs.size(); - } - - public void addParagraph(Paragraph p) { - paragraphs.add(p); - ccl.add(p); - } - - public void removeParagraph(Paragraph p) { - paragraphs.remove(p); - ccl.remove(p); - } - - public Paragraph getParagraph(int i) { - if (paragraphs.size() > i) return paragraphs.get(i); - else return null; - } - - public List<Paragraph> getParagraphs() { - return new ArrayList<>(paragraphs); - } - - public int nrOfTables() { - return tables.size(); - } - - public void addTable(Table t) { - tables.add(t); - ccl.add(t); - } - - public void removeTable(Table t) { - tables.remove(t); - ccl.remove(t); - } - - public Table getTable(int i) { - if (tables.size() > i) return tables.get(i); - else return null; - } - - public List<Table> getTables() { - return new ArrayList<>(tables); - } - - public int nrOfNestedLists() { - return nestedLists.size(); - } - - public void addNestedList(NestedListContainer nl) { - nestedLists.add(nl); - ccl.add(nl); - } - - public void removeNestedList(NestedListContainer nl) { - nestedLists.remove(nl); - ccl.remove(nl); - } - - public NestedListContainer getNestedList(int i) { - if (nestedLists.size() > i) return nestedLists.get(i); - else return null; - } - - public List<NestedListContainer> getNestedLists() { - return new ArrayList<>(nestedLists); - } - - public int nrOfDefinitionLists() { - return definitionLists.size(); - } - - public void addDefinitionList(DefinitionList dl) { - definitionLists.add(dl); - ccl.add(dl); - } - - public void removeDefinitionList(DefinitionList dl) { - definitionLists.remove(dl); - ccl.remove(dl); - } - - public DefinitionList 
getDefinitionList(int i) { - if (definitionLists.size() > i) return definitionLists.get(i); - else return null; - } - - public List<DefinitionList> getDefinitionLists() { - return new ArrayList<>(definitionLists); - } - - public String toString() { - StringBuilder result = new StringBuilder(); - - result.append("SC_TITLE:\n" + this.getTitleElement()); - result.append("\nSC_LEVEL: " + this.getLevel()); - - result.append("\nSC_PARAGRAPHS: " + paragraphs.size()); - for (Paragraph p : paragraphs) result.append("\nSC_PARAGRAPH:\n" + p); - result.append("\nSC_TABLES: " + tables.size()); - for (Table t : tables) result.append("\nSC_TABLE:\n" + t); - result.append("\nSC_NESTED_LISTS: " + nestedLists.size()); - for (NestedList nl : nestedLists) result.append("\nSC_NESTED_LIST:\n" + nl); - result.append("\nSC_DEFINITON_LISTS: " + definitionLists.size()); - for (DefinitionList dl : definitionLists) result.append("\nSC_DEFINITION_LIST:\n" + dl); - - return result.toString(); - } +public class SectionContent + extends Section +{ + + private List<Paragraph> paragraphs; + private List<Table> tables; + private List<NestedListContainer> nestedLists; + private List<DefinitionList> definitionLists; + + public SectionContent(int level) + { + super(null, level); + init(); + } + + public SectionContent(ContentElement title, int level) + { + super(title, level); + init(); + } + + private void init() + { + paragraphs = new ArrayList<>(); + tables = new ArrayList<>(); + nestedLists = new ArrayList<>(); + definitionLists = new ArrayList<>(); + } + + public List<Content> getContentList() + { + return new ArrayList<>(ccl); + } + + public int nrOfParagraphs() + { + return paragraphs.size(); + } + + public void addParagraph(Paragraph p) + { + paragraphs.add(p); + ccl.add(p); + } + + public void removeParagraph(Paragraph p) + { + paragraphs.remove(p); + ccl.remove(p); + } + + public Paragraph getParagraph(int i) + { + if (paragraphs.size() > i) + return paragraphs.get(i); + else + return null; + } + + public List<Paragraph> getParagraphs() + { + return new ArrayList<>(paragraphs); + } + + public int nrOfTables() + { + return tables.size(); + } + + public void addTable(Table t) + { + tables.add(t); + ccl.add(t); + } + + public void removeTable(Table t) + { + tables.remove(t); + ccl.remove(t); + } + + public Table getTable(int i) + { + if (tables.size() > i) + return tables.get(i); + else + return null; + } + + public List<Table> getTables() + { + return new ArrayList<>(tables); + } + + public int nrOfNestedLists() + { + return nestedLists.size(); + } + + public void addNestedList(NestedListContainer nl) + { + nestedLists.add(nl); + ccl.add(nl); + } + + public void removeNestedList(NestedListContainer nl) + { + nestedLists.remove(nl); + ccl.remove(nl); + } + + public NestedListContainer getNestedList(int i) + { + if (nestedLists.size() > i) + return nestedLists.get(i); + else + return null; + } + + public List<NestedListContainer> getNestedLists() + { + return new ArrayList<>(nestedLists); + } + + public int nrOfDefinitionLists() + { + return definitionLists.size(); + } + + public void addDefinitionList(DefinitionList dl) + { + definitionLists.add(dl); + ccl.add(dl); + } + + public void removeDefinitionList(DefinitionList dl) + { + definitionLists.remove(dl); + ccl.remove(dl); + } + + public DefinitionList getDefinitionList(int i) + { + if (definitionLists.size() > i) + return definitionLists.get(i); + else + return null; + } + + public List<DefinitionList> getDefinitionLists() + { + return new 
ArrayList<>(definitionLists); + } + + public String toString() + { + StringBuilder result = new StringBuilder(); + + result.append("SC_TITLE:\n" + this.getTitleElement()); + result.append("\nSC_LEVEL: " + this.getLevel()); + + result.append("\nSC_PARAGRAPHS: " + paragraphs.size()); + for (Paragraph p : paragraphs) + result.append("\nSC_PARAGRAPH:\n" + p); + result.append("\nSC_TABLES: " + tables.size()); + for (Table t : tables) + result.append("\nSC_TABLE:\n" + t); + result.append("\nSC_NESTED_LISTS: " + nestedLists.size()); + for (NestedList nl : nestedLists) + result.append("\nSC_NESTED_LIST:\n" + nl); + result.append("\nSC_DEFINITON_LISTS: " + definitionLists.size()); + for (DefinitionList dl : definitionLists) + result.append("\nSC_DEFINITION_LIST:\n" + dl); + + return result.toString(); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Span.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Span.java index 8ccec238..e5e6e5b6 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Span.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Span.java @@ -20,155 +20,187 @@ /** * Provides a Start and End Position... */ -public class Span extends ParsedPageObject { - - private int start; - private int end; - - public Span(int start, int end) { - this.start = start; - this.end = end; - } - - public int getStart() { - return start; - } - - public Span setStart(int start) { - this.start = start; - return this; - } - - public Span adjustStart(int n) { - start += n; - return this; - } - - public int getEnd() { - return end; - } - - public Span setEnd(int end) { - this.end = end; - return this; - } - - public Span adjustEnd(int n) { - end += n; - return this; - } - - public Span adjust(int n) { - return adjust(0, n); - } - - /** - * Adjusts the start and end Position of the Span, if they are - * larger than the offset. - */ - public Span adjust(int offset, int n) { - if (offset < 0) return this; //null - - if (offset < end) { - end += n; - if (end < offset) end = offset; - } else return this; //null - - if (offset < start) { - start += n; - if (start < offset) start = offset; - } - return this; - } - - public boolean equals(int start, int end) { - return ((this.start == start) && (this.end == end)); - } - - public boolean equals(Span s) { - return ((this.start == s.getStart()) && (this.end == s.getEnd())); - } - - /** - * returns true if this Span is in the range of the Span s. - */ - public boolean hits(Span s) { - return start < s.getEnd() && s.getStart() < end; - } - - public String toString() { - return "(" + start + ", " + end + ")"; - } - - /** - * simply src.substring( this.getStart(), this.getEnd ); - */ - public String getText(String src) { - if (end > src.length()) { - end = src.length(); - } - return src.substring(start, end); - } - - /** - * A defined ErrorChar which will be returnd when an error occures.<br> - * An ErrorChar seems to be more easy to handle than e.g. an IndexOutOfBoundsException. - */ - public static final char ERRORCHAR = 0; - - public char charAt(int pos, CharSequence cs) { - if (pos + start < end) return cs.charAt(start + pos); - else return ERRORCHAR; - } - - public int nonWSCharPos(CharSequence cs) { - int pos = 0; - while (charAt(pos, cs) == ' ') pos++; - return pos; - } - - /** - * Returns the Span, with trailing whitespaces omitted. 
- */ - public Span trimTrail(CharSequence src) { - if (start < end) { - while (src.charAt(end - 1) == 32) { - end--; - if (start == end) break; - } - } - return this; - } - - /** - * Returns the Span, with leading and trailing whitespaces omitted. - */ - public Span trim(CharSequence src) { - if (start < end) - while (src.charAt(end - 1) == 32) { - end--; - if (start == end) break; - } - - if (start < end) - while (src.charAt(start) == 32) { - start++; - if (start == end) break; - } - - return this; - } - - /** - * returns this.getEnd()-this.getStart() - */ - public int length() { - return end - start; - } - - public Span clone() { - Span result = new Span(start, end); - result.setSrcSpan(this.getSrcSpan()); - return result; - } +public class Span + extends ParsedPageObject +{ + + private int start; + private int end; + + public Span(int start, int end) + { + this.start = start; + this.end = end; + } + + public int getStart() + { + return start; + } + + public Span setStart(int start) + { + this.start = start; + return this; + } + + public Span adjustStart(int n) + { + start += n; + return this; + } + + public int getEnd() + { + return end; + } + + public Span setEnd(int end) + { + this.end = end; + return this; + } + + public Span adjustEnd(int n) + { + end += n; + return this; + } + + public Span adjust(int n) + { + return adjust(0, n); + } + + /** + * Adjusts the start and end Position of the Span, if they are larger than the offset. + */ + public Span adjust(int offset, int n) + { + if (offset < 0) + return this; // null + + if (offset < end) { + end += n; + if (end < offset) + end = offset; + } + else + return this; // null + + if (offset < start) { + start += n; + if (start < offset) + start = offset; + } + return this; + } + + public boolean equals(int start, int end) + { + return ((this.start == start) && (this.end == end)); + } + + public boolean equals(Span s) + { + return ((this.start == s.getStart()) && (this.end == s.getEnd())); + } + + /** + * returns true if this Span is in the range of the Span s. + */ + public boolean hits(Span s) + { + return start < s.getEnd() && s.getStart() < end; + } + + public String toString() + { + return "(" + start + ", " + end + ")"; + } + + /** + * simply src.substring( this.getStart(), this.getEnd ); + */ + public String getText(String src) + { + if (end > src.length()) { + end = src.length(); + } + return src.substring(start, end); + } + + /** + * A defined ErrorChar which will be returnd when an error occures.<br> + * An ErrorChar seems to be more easy to handle than e.g. an IndexOutOfBoundsException. + */ + public static final char ERRORCHAR = 0; + + public char charAt(int pos, CharSequence cs) + { + if (pos + start < end) + return cs.charAt(start + pos); + else + return ERRORCHAR; + } + + public int nonWSCharPos(CharSequence cs) + { + int pos = 0; + while (charAt(pos, cs) == ' ') + pos++; + return pos; + } + + /** + * Returns the Span, with trailing whitespaces omitted. + */ + public Span trimTrail(CharSequence src) + { + if (start < end) { + while (src.charAt(end - 1) == 32) { + end--; + if (start == end) + break; + } + } + return this; + } + + /** + * Returns the Span, with leading and trailing whitespaces omitted. 
+ */ + public Span trim(CharSequence src) + { + if (start < end) + while (src.charAt(end - 1) == 32) { + end--; + if (start == end) + break; + } + + if (start < end) + while (src.charAt(start) == 32) { + start++; + if (start == end) + break; + } + + return this; + } + + /** + * returns this.getEnd()-this.getStart() + */ + public int length() + { + return end - start; + } + + public Span clone() + { + Span result = new Span(start, end); + result.setSrcSpan(this.getSrcSpan()); + return result; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SrcSpan.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SrcSpan.java index 707cab78..a881cde8 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SrcSpan.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/SrcSpan.java @@ -20,48 +20,57 @@ /** * */ -public class SrcSpan { - private int start; - private int end; +public class SrcSpan +{ + private int start; + private int end; - /** - * @param start is the startposition of the Object in the original MediaWikiSource - * @param end is the endposition of the Object in the original MediaWikiSource - */ - public SrcSpan(int start, int end) { - this.start = start; - this.end = end; - } + /** + * @param start + * is the startposition of the Object in the original MediaWikiSource + * @param end + * is the endposition of the Object in the original MediaWikiSource + */ + public SrcSpan(int start, int end) + { + this.start = start; + this.end = end; + } - /** - * Look at Constructor for Details... - */ - public int getEnd() { - return end; - } + /** + * Look at Constructor for Details... + */ + public int getEnd() + { + return end; + } - /** - * Look at Constructor for Details... - */ - public void setEnd(int end) { - this.end = end; - } + /** + * Look at Constructor for Details... + */ + public void setEnd(int end) + { + this.end = end; + } - /** - * Look at Constructor for Details... - */ - public int getStart() { - return start; - } + /** + * Look at Constructor for Details... + */ + public int getStart() + { + return start; + } - /** - * Look at Constructor for Details... - */ - public void setStart(int start) { - this.start = start; - } + /** + * Look at Constructor for Details... + */ + public void setStart(int start) + { + this.start = start; + } - public String toString() { - return "(" + start + ", " + end + ")"; - } + public String toString() + { + return "(" + start + ", " + end + ")"; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Table.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Table.java index 33d3aad4..f3410539 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Table.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Table.java @@ -22,152 +22,191 @@ /** * A Table has a Title and contains TableElements.<br> - * This Class provides all needed functions simmilar to the other classes in - * this package. + * This Class provides all needed functions simmilar to the other classes in this package. 
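The Span class above is essentially a mutable (start, end) pair over the article source text. A short hedged sketch of how its accessors compose, using only the methods visible in the class; the class name and the sample string are illustrative.

    import org.dkpro.jwpl.parser.Span;

    // Illustrative sketch, not part of JWPL.
    class SpanSketch
    {
        public static void main(String[] args)
        {
            String src = "  Hello Wikipedia  ";

            // Cover the whole string, then strip leading and trailing blanks.
            Span s = new Span(0, src.length()).trim(src);
            System.out.println(s + " -> '" + s.getText(src) + "'"); // (2, 17) -> 'Hello Wikipedia'

            // Spans hit each other if their ranges intersect.
            Span hello = new Span(2, 7);
            System.out.println(s.hits(hello)); // true

            // Shift both ends, e.g. after text was inserted before the span.
            hello.adjust(0, 3);
            System.out.println(hello); // (5, 10)
        }
    }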
*/ -public class Table extends ContentContainer { - - private final List<TableElement> tableElements; - private ContentElement title; - - public Table() { - ccl = new ArrayList<>(); - tableElements = new ArrayList<>(); - } - - public String toString() { - StringBuilder result = new StringBuilder(); - - result.append("TB_TableElements: " + tableElements.size()); - for (TableElement td : tableElements) result.append("\n" + td); - - return result.toString(); - } - - public void addTableElement(TableElement te) { - tableElements.add(te); - ccl.add(te); - } - - public void removeTableElement(TableElement te) { - tableElements.remove(te); - ccl.remove(te); - } - - public TableElement getTableElement(int i) { - return tableElements.get(i); - } - - public ContentElement getTitleElement() { - return this.title; - } - - public void setTitleElement(ContentElement title) { - if (title != null) { - if (this.title == null) ccl.add(0, title); - else ccl.set(0, title); - } else if (this.title != null) ccl.remove(this.title); - - this.title = title; - } - - public int nrOfTableElements() { - return tableElements.size(); - } - - public List<Content> getContentList() { - return new ArrayList<>(ccl); - } - - public int nrOfParagraphs() { - int result = 0; - for (TableElement td : tableElements) result += td.nrOfParagraphs(); - return result; - } - - public Paragraph getParagraph(int i) { - int nr; - int offset = 0; - for (TableElement td : tableElements) { - nr = td.nrOfParagraphs(); - if (nr + offset > i) return td.getParagraph(i - offset); - offset += nr; - } - return null; - } - - public List<Paragraph> getParagraphs() { - List<Paragraph> result = new ArrayList<>(); - for (TableElement td : tableElements) result.addAll(td.getParagraphs()); - return result; - } - - public int nrOfTables() { - int result = 0; - for (TableElement td : tableElements) result += td.nrOfTables(); - return result; - } - - public Table getTable(int i) { - int nr; - int offset = 0; - for (TableElement td : tableElements) { - nr = td.nrOfTables(); - if (nr + offset > i) return td.getTable(i - offset); - offset += nr; - } - return null; - } - - public List<Table> getTables() { - List<Table> result = new ArrayList<>(); - for (TableElement td : tableElements) result.addAll(td.getTables()); - return result; - } - - public int nrOfNestedLists() { - int result = 0; - for (TableElement td : tableElements) result += td.nrOfNestedLists(); - return result; - } - - public NestedList getNestedList(int i) { - int nr; - int offset = 0; - for (TableElement td : tableElements) { - nr = td.nrOfNestedLists(); - if (nr + offset > i) return td.getNestedList(i - offset); - offset += nr; - } - return null; - } - - public List<NestedList> getNestedLists() { - List<NestedList> result = new ArrayList<>(); - for (TableElement td : tableElements) result.addAll(td.getNestedLists()); - return result; - } - - public int nrOfDefinitionLists() { - int result = 0; - for (TableElement td : tableElements) result += td.nrOfDefinitionLists(); - return result; - } - - public DefinitionList getDefinitionList(int i) { - int nr; - int offset = 0; - for (TableElement td : tableElements) { - nr = td.nrOfDefinitionLists(); - if (nr + offset > i) return td.getDefinitionList(i - offset); - offset += nr; - } - return null; - } - - public List<DefinitionList> getDefinitionLists() { - List<DefinitionList> result = new ArrayList<>(); - for (TableElement td : tableElements) result.addAll(td.getDefinitionLists()); - return result; - } +public class Table + extends ContentContainer +{ 
+ + private final List<TableElement> tableElements; + private ContentElement title; + + public Table() + { + ccl = new ArrayList<>(); + tableElements = new ArrayList<>(); + } + + public String toString() + { + StringBuilder result = new StringBuilder(); + + result.append("TB_TableElements: " + tableElements.size()); + for (TableElement td : tableElements) + result.append("\n" + td); + + return result.toString(); + } + + public void addTableElement(TableElement te) + { + tableElements.add(te); + ccl.add(te); + } + + public void removeTableElement(TableElement te) + { + tableElements.remove(te); + ccl.remove(te); + } + + public TableElement getTableElement(int i) + { + return tableElements.get(i); + } + + public ContentElement getTitleElement() + { + return this.title; + } + + public void setTitleElement(ContentElement title) + { + if (title != null) { + if (this.title == null) + ccl.add(0, title); + else + ccl.set(0, title); + } + else if (this.title != null) + ccl.remove(this.title); + + this.title = title; + } + + public int nrOfTableElements() + { + return tableElements.size(); + } + + public List<Content> getContentList() + { + return new ArrayList<>(ccl); + } + + public int nrOfParagraphs() + { + int result = 0; + for (TableElement td : tableElements) + result += td.nrOfParagraphs(); + return result; + } + + public Paragraph getParagraph(int i) + { + int nr; + int offset = 0; + for (TableElement td : tableElements) { + nr = td.nrOfParagraphs(); + if (nr + offset > i) + return td.getParagraph(i - offset); + offset += nr; + } + return null; + } + + public List<Paragraph> getParagraphs() + { + List<Paragraph> result = new ArrayList<>(); + for (TableElement td : tableElements) + result.addAll(td.getParagraphs()); + return result; + } + + public int nrOfTables() + { + int result = 0; + for (TableElement td : tableElements) + result += td.nrOfTables(); + return result; + } + + public Table getTable(int i) + { + int nr; + int offset = 0; + for (TableElement td : tableElements) { + nr = td.nrOfTables(); + if (nr + offset > i) + return td.getTable(i - offset); + offset += nr; + } + return null; + } + + public List<Table> getTables() + { + List<Table> result = new ArrayList<>(); + for (TableElement td : tableElements) + result.addAll(td.getTables()); + return result; + } + + public int nrOfNestedLists() + { + int result = 0; + for (TableElement td : tableElements) + result += td.nrOfNestedLists(); + return result; + } + + public NestedList getNestedList(int i) + { + int nr; + int offset = 0; + for (TableElement td : tableElements) { + nr = td.nrOfNestedLists(); + if (nr + offset > i) + return td.getNestedList(i - offset); + offset += nr; + } + return null; + } + + public List<NestedList> getNestedLists() + { + List<NestedList> result = new ArrayList<>(); + for (TableElement td : tableElements) + result.addAll(td.getNestedLists()); + return result; + } + + public int nrOfDefinitionLists() + { + int result = 0; + for (TableElement td : tableElements) + result += td.nrOfDefinitionLists(); + return result; + } + + public DefinitionList getDefinitionList(int i) + { + int nr; + int offset = 0; + for (TableElement td : tableElements) { + nr = td.nrOfDefinitionLists(); + if (nr + offset > i) + return td.getDefinitionList(i - offset); + offset += nr; + } + return null; + } + + public List<DefinitionList> getDefinitionLists() + { + List<DefinitionList> result = new ArrayList<>(); + for (TableElement td : tableElements) + result.addAll(td.getDefinitionLists()); + return result; + } } diff --git 
a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/TableElement.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/TableElement.java index dafbb0ae..37e85568 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/TableElement.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/TableElement.java @@ -22,103 +22,126 @@ /** * This Class implements a Field in a Table...<br> * it simply has an int for col and row, and a SectionContainer for the Content.<br> - * This implementation is needed, because a Table in MediaWiki can contain neary - * everything. + * This implementation is needed, because a Table in MediaWiki can contain neary everything. */ -public class TableElement extends ContentContainer { - - private final int col; - private final int row; - private final SectionContainer s; - - public TableElement(SectionContainer s, int row, int col) { - this.ccl = s.ccl; - this.s = s; - this.row = row; - this.col = col; - } - - public int getCol() { - return col; - } - - public int getRow() { - return row; - } - - public int nrOfSections() { - return s.nrOfSubSections(); - } - - public Section getSection(int i) { - return s.getSubSection(i); - } - - public void removeSection(Section s) { - this.s.removeSection(s); - } - - public List<Section> getSubSections() { - return s.getSubSections(); - } - - public List<Content> getContentList() { - return s.getContentList(); - } - - public int nrOfParagraphs() { - return s.nrOfParagraphs(); - } - - public Paragraph getParagraph(int i) { - return s.getParagraph(i); - } - - public List<Paragraph> getParagraphs() { - return s.getParagraphs(); - } - - public int nrOfTables() { - return s.nrOfTables(); - } - - public Table getTable(int i) { - return s.getTable(i); - } - - public List<Table> getTables() { - return s.getTables(); - } - - public int nrOfNestedLists() { - return s.nrOfNestedLists(); - } - - public NestedList getNestedList(int i) { - return s.getNestedList(i); - } - - public List<NestedListContainer> getNestedLists() { - return s.getNestedLists(); - } - - public int nrOfDefinitionLists() { - return s.nrOfDefinitionLists(); - } - - public DefinitionList getDefinitionList(int i) { - return s.getDefinitionList(i); - } - - public List<DefinitionList> getDefinitionLists() { - return s.getDefinitionLists(); - } - - public SectionContainer getSectionContainer() { - return s; - } - - public String toString() { - return "TABLE_DATA: \n" + s; - } +public class TableElement + extends ContentContainer +{ + + private final int col; + private final int row; + private final SectionContainer s; + + public TableElement(SectionContainer s, int row, int col) + { + this.ccl = s.ccl; + this.s = s; + this.row = row; + this.col = col; + } + + public int getCol() + { + return col; + } + + public int getRow() + { + return row; + } + + public int nrOfSections() + { + return s.nrOfSubSections(); + } + + public Section getSection(int i) + { + return s.getSubSection(i); + } + + public void removeSection(Section s) + { + this.s.removeSection(s); + } + + public List<Section> getSubSections() + { + return s.getSubSections(); + } + + public List<Content> getContentList() + { + return s.getContentList(); + } + + public int nrOfParagraphs() + { + return s.nrOfParagraphs(); + } + + public Paragraph getParagraph(int i) + { + return s.getParagraph(i); + } + + public List<Paragraph> getParagraphs() + { + return s.getParagraphs(); + } + + public int nrOfTables() + { + return s.nrOfTables(); + } + + public Table getTable(int i) + { + return 
s.getTable(i); + } + + public List<Table> getTables() + { + return s.getTables(); + } + + public int nrOfNestedLists() + { + return s.nrOfNestedLists(); + } + + public NestedList getNestedList(int i) + { + return s.getNestedList(i); + } + + public List<NestedListContainer> getNestedLists() + { + return s.getNestedLists(); + } + + public int nrOfDefinitionLists() + { + return s.nrOfDefinitionLists(); + } + + public DefinitionList getDefinitionList(int i) + { + return s.getDefinitionList(i); + } + + public List<DefinitionList> getDefinitionLists() + { + return s.getDefinitionLists(); + } + + public SectionContainer getSectionContainer() + { + return s; + } + + public String toString() + { + return "TABLE_DATA: \n" + s; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Template.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Template.java index 97642ec4..5f958e6b 100755 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Template.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/Template.java @@ -19,57 +19,67 @@ import java.util.List; -public class Template extends ParsedPageObject { +public class Template + extends ParsedPageObject +{ - private Span pos; - private String name; - private List<String> parameters; + private Span pos; + private String name; + private List<String> parameters; - public Template(Span pos, String name, List<String> parameters) { - this.pos = pos; - this.name = name; - this.parameters = parameters; - } + public Template(Span pos, String name, List<String> parameters) + { + this.pos = pos; + this.name = name; + this.parameters = parameters; + } - public String getName() { - return name; - } + public String getName() + { + return name; + } - public void setName(String name) { - this.name = name; - } + public void setName(String name) + { + this.name = name; + } - public List<String> getParameters() { - return parameters; - } + public List<String> getParameters() + { + return parameters; + } - public void setParameters(List<String> parameters) { - this.parameters = parameters; - } + public void setParameters(List<String> parameters) + { + this.parameters = parameters; + } - /** - * Returns the Position Span of this Template refering to the ContentElement - * in which the Template occures. This is mainly the same like Link.getPos(), - * but a Template does�n know it�s HomeElement. - */ - public Span getPos() { - return pos; - } + /** + * Returns the Position Span of this Template refering to the ContentElement in which the + * Template occures. This is mainly the same like Link.getPos(), but a Template does�n know it�s + * HomeElement. + */ + public Span getPos() + { + return pos; + } - /** - * Look at getPos for Details... - */ - public void setPos(Span pos) { - this.pos = pos; - } + /** + * Look at getPos for Details... 
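// ------------------------------------------------------------------
// Illustrative sketch, not part of the patch: walking a parsed Table
// and its TableElement cells with the accessors shown above. The
// Table instance is assumed to come from an already parsed page.
// ------------------------------------------------------------------
import org.dkpro.jwpl.parser.Paragraph;
import org.dkpro.jwpl.parser.Table;
import org.dkpro.jwpl.parser.TableElement;

class TableDumpSketch
{
    static void dump(Table table)
    {
        if (table.getTitleElement() != null) {
            System.out.println("Title: " + table.getTitleElement().getText());
        }
        for (int i = 0; i < table.nrOfTableElements(); i++) {
            TableElement cell = table.getTableElement(i);
            System.out.println("Cell at row " + cell.getRow() + ", col " + cell.getCol());
            for (Paragraph p : cell.getParagraphs()) {
                System.out.println("  " + p.getText());
            }
        }
    }
}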
+ */ + public void setPos(Span pos) + { + this.pos = pos; + } - public String toString() { - StringBuilder result = new StringBuilder(); - result.append("TE_NAME: \"" + name + "\""); - result.append("\nTE_PARAMETERS: " + parameters.size()); - for (String parameter : parameters) result.append("\nTE_PARAMETER: \"" + parameter + "\""); - result.append("\nTE_POS: " + pos); - return result.toString(); - } + public String toString() + { + StringBuilder result = new StringBuilder(); + result.append("TE_NAME: \"" + name + "\""); + result.append("\nTE_PARAMETERS: " + parameters.size()); + for (String parameter : parameters) + result.append("\nTE_PARAMETER: \"" + parameter + "\""); + result.append("\nTE_POS: " + pos); + return result.toString(); + } } - diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/HtmlWriter.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/HtmlWriter.java index 1b08d20c..f6dc1660 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/HtmlWriter.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/HtmlWriter.java @@ -55,440 +55,460 @@ * There is a ParsedPage.css for formatting the HTML Tags.<br> * Look at the {@code T7_HtmlFileDemo.java} in the 'tutorial' module for a better introduction. */ -public class HtmlWriter { - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - /** - * Generates HTML Output for a {@link ParsedPage}. - * - * @param pp The page that shall be parsed. - * @return A string containing the HTML rendering of the {@link ParsedPage}. - */ - public static String parsedPageToHtml(ParsedPage pp) { - StringBuilder result = new StringBuilder(); - result.append(getHtmlHeader()); - - if (pp != null) { - //Title - result.append( - "<table class=\"ParsedPage\">\n" + - "<tr><th class=\"ParsedPage\">ParsedPage: \n" + - pp.getName() + - "</th></tr>\n"); - -// if( pp.aboutArticle()!=null ){ -// result.append("<tr><td class=\"ParsedPage\">\n"); -// result.append("About Article:" + contentElementToHtml( pp.aboutArticle() )); -// result.append("</td></tr>\n"); -// } - - //Sections - result.append( - "<tr><td class=\"ParsedPage\">\n"); - for (Section s : pp.getSections()) { - result.append(sectionToHtml(s)); - } - result.append( - "</td></tr>\n"); - - //Categories - if (pp.getCategoryElement() != null) { - result.append("<tr><td class=\"ParsedPage\">\n"); - result.append("Categories:\n" + contentElementToHtml(pp.getCategoryElement())); - result.append("</td></tr>\n"); - } +public class HtmlWriter +{ + + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + /** + * Generates HTML Output for a {@link ParsedPage}. + * + * @param pp + * The page that shall be parsed. + * @return A string containing the HTML rendering of the {@link ParsedPage}. 
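// ------------------------------------------------------------------
// Illustrative sketch, not part of the patch: reading the Template
// objects attached to a ContentElement via the accessors shown above.
// The ContentElement is assumed to come from a parsed page.
// ------------------------------------------------------------------
import org.dkpro.jwpl.parser.ContentElement;
import org.dkpro.jwpl.parser.Template;

class TemplateDumpSketch
{
    static void dump(ContentElement ce)
    {
        for (Template t : ce.getTemplates()) {
            // getPos() is relative to the text of the ContentElement in which
            // the template occurs.
            System.out.println(t.getName() + " " + t.getPos() + " " + t.getParameters());
        }
    }
}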
+ */ + public static String parsedPageToHtml(ParsedPage pp) + { + StringBuilder result = new StringBuilder(); + result.append(getHtmlHeader()); + + if (pp != null) { + // Title + result.append( + "<table class=\"ParsedPage\">\n" + "<tr><th class=\"ParsedPage\">ParsedPage: \n" + + pp.getName() + "</th></tr>\n"); + + // if( pp.aboutArticle()!=null ){ + // result.append("<tr><td class=\"ParsedPage\">\n"); + // result.append("About Article:" + contentElementToHtml( pp.aboutArticle() )); + // result.append("</td></tr>\n"); + // } + + // Sections + result.append("<tr><td class=\"ParsedPage\">\n"); + for (Section s : pp.getSections()) { + result.append(sectionToHtml(s)); + } + result.append("</td></tr>\n"); + + // Categories + if (pp.getCategoryElement() != null) { + result.append("<tr><td class=\"ParsedPage\">\n"); + result.append("Categories:\n" + contentElementToHtml(pp.getCategoryElement())); + result.append("</td></tr>\n"); + } + + // Languages + if (pp.getLanguagesElement() != null) { + result.append("<tr><td class=\"ParsedPage\">\n"); + result.append("Languages:\n" + contentElementToHtml(pp.getLanguagesElement())); + result.append("</td></tr>\n"); + } + + // Finalize + result.append("</table>\n"); + } - //Languages - if (pp.getLanguagesElement() != null) { - result.append("<tr><td class=\"ParsedPage\">\n"); - result.append("Languages:\n" + contentElementToHtml(pp.getLanguagesElement())); - result.append("</td></tr>\n"); - } + result.append(getHtmlFooter()); - //Finalize - result.append("</table>\n"); + return result.toString(); } - result.append(getHtmlFooter()); - - return result.toString(); - } - - /** - * @return Creates and returns the header of the HTML page - */ - private static String getHtmlHeader() { - StringBuilder header = new StringBuilder(); - header.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); - header.append("<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">"); - header.append("<html>"); - header.append("<head>"); - header.append(getCSS()); -// header.append(" <link href=\""+cssFileName+"\" type=\"text/css\" rel=\"stylesheet\"/>"); - header.append("</head>"); - header.append("<body>"); - - return header.toString(); - } - - /** - * @return Creates and returns the footer of the HTML page - */ - private static String getHtmlFooter() { - StringBuilder footer = new StringBuilder(); - footer.append("</body>"); - footer.append("</html>"); - - return footer.toString(); - } - - /** - * @return Creates and returns the CSS definitions of the HTML page - */ - private static String getCSS() { - StringBuilder css = new StringBuilder(); - css.append("<style>"); - css.append(ParsedPageCSS.getFileText()); - css.append("</style>"); - - return css.toString(); - } - - /** - * Generates HTML Output for a {@link SectionContainer} or {@link SectionContent}. - */ - private static String sectionToHtml(Section s) { - - return "<table class=\"Section\">\n" + - "<tr><th class=\"Section\">\n" + - - "<table class=\"SectionTh\"><tr>\n" + - "<th class=\"SectionTh\">\n" + - (s.getClass() == SectionContainer.class ? "SectionStructure" : "SectionContent") + ":<br>\n" + - "Level: " + s.getLevel() + "\n" + - "</th><th class=\"SectionTh\">\n" + - (s.getTitleElement() != null ? 
contentElementToHtml(s.getTitleElement()) : "") + - "</th>\n" + - "</tr></table>\n" + - - "</th></tr>\n" + - "<tr><td class=\"Section\">\n" + - sectionCCLToHtml(s) + - "</td></tr>\n" + - "</table>\n"; - } - - private static String sectionCCLToHtml(Section s) { - StringBuilder result = new StringBuilder(); - - if (s.getClass() == SectionContainer.class) { - for (Section ss : ((SectionContainer) s).getSubSections()) { - result.append(sectionToHtml(ss)); - } - } else { - List<Content> ccl = s.getContentList(); - for (int i = (s.getTitleElement() != null ? 1 : 0); i < ccl.size(); i++) { - Content c = ccl.get(i); - Class<? extends Content> cc = c.getClass(); - if (cc == Paragraph.class) { - result.append(paragraphToHtml((Paragraph) c)); - } else if (cc == DefinitionList.class) { - result.append(definitionListToHtml((DefinitionList) c)); - } else if (cc == NestedListContainer.class) { - result.append(nestedListToHtml((NestedList) c)); - } else if (cc == Table.class) { - result.append(tableToHtml((Table) c)); - } else { - result.append("\n<pre>UNKNOWN CLASS: " + cc + "\n" + convertTags(c.toString()) + "</pre>\n"); - } - } + /** + * @return Creates and returns the header of the HTML page + */ + private static String getHtmlHeader() + { + StringBuilder header = new StringBuilder(); + header.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>"); + header.append( + "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\" http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">"); + header.append("<html>"); + header.append("<head>"); + header.append(getCSS()); + // header.append(" <link href=\""+cssFileName+"\" type=\"text/css\" rel=\"stylesheet\"/>"); + header.append("</head>"); + header.append("<body>"); + + return header.toString(); } - return result.toString(); - } - - /** - * Generates HTML Output for a {@link Paragraph}. - */ - private static String paragraphToHtml(Paragraph p) { - return contentElementToHtml(p, "Paragraph", "Paragraph: " + p.getType()); - } - - /** - * Generates HTML Output for a {@link ContentElement}. 
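// ------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the two public entry
// points of HtmlWriter used together. How the ParsedPage is obtained
// (normally from a MediaWikiParser implementation) is left out; the
// file name and encoding are examples only.
// ------------------------------------------------------------------
import org.dkpro.jwpl.parser.ParsedPage;
import org.dkpro.jwpl.parser.html.HtmlWriter;

class HtmlExportSketch
{
    static void export(ParsedPage page)
    {
        // Complete XHTML document, including the inline CSS from ParsedPageCSS.
        String html = HtmlWriter.parsedPageToHtml(page);
        HtmlWriter.writeFile("parsed-page.html", "UTF-8", html);
    }
}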
- */ - private static String contentElementToHtml(ContentElement ce) { - return contentElementToHtml(ce, "ContentElement", "ContentElement"); - } - - private static String contentElementToHtml(ContentElement ce, String cssClass, String headline) { - - StringBuilder result = new StringBuilder(); - - result.append( - "<table class=\"" + cssClass + "\">\n" + - "<tr><th class=\"" + cssClass + "\">" + headline + "</th></tr>\n" + - "<tr><td class=\"" + cssClass + "\">\n" + - "\"" + convertTags(ce.getText()) + "\"\n" + - "</td></tr>\n"); - - String BoldWords = ce.getText(ce.getFormatSpans(FormatType.BOLD)); - if (BoldWords.length() > 0) { - result.append("<tr><td class=\"" + cssClass + "\">BoldWords: " + convertTags(BoldWords) + "</td></tr>\n"); + /** + * @return Creates and returns the footer of the HTML page + */ + private static String getHtmlFooter() + { + StringBuilder footer = new StringBuilder(); + footer.append("</body>"); + footer.append("</html>"); + + return footer.toString(); } - String ItalicWords = ce.getText(ce.getFormatSpans(FormatType.ITALIC)); - if (ItalicWords.length() > 0) { - result.append("<tr><td class=\"" + cssClass + "\">italicWords: " + convertTags(ItalicWords) + "</td></tr>\n"); + /** + * @return Creates and returns the CSS definitions of the HTML page + */ + private static String getCSS() + { + StringBuilder css = new StringBuilder(); + css.append("<style>"); + css.append(ParsedPageCSS.getFileText()); + css.append("</style>"); + + return css.toString(); } - if (ce.getFormatSpans(FormatType.MATH).size() != 0) { - result.append("<tr><td class=\"" + cssClass + "\">MathTags\n"); - for (Span s : ce.getFormatSpans(FormatType.MATH)) { - result.append(s.toString() + "\n"); - } - result.append("</td></tr>\n"); + /** + * Generates HTML Output for a {@link SectionContainer} or {@link SectionContent}. + */ + private static String sectionToHtml(Section s) + { + + return "<table class=\"Section\">\n" + "<tr><th class=\"Section\">\n" + + + "<table class=\"SectionTh\"><tr>\n" + "<th class=\"SectionTh\">\n" + + (s.getClass() == SectionContainer.class ? "SectionStructure" : "SectionContent") + + ":<br>\n" + "Level: " + s.getLevel() + "\n" + "</th><th class=\"SectionTh\">\n" + + (s.getTitleElement() != null ? contentElementToHtml(s.getTitleElement()) : "") + + "</th>\n" + "</tr></table>\n" + + + "</th></tr>\n" + "<tr><td class=\"Section\">\n" + sectionCCLToHtml(s) + + "</td></tr>\n" + "</table>\n"; } - if (ce.getFormatSpans(FormatType.TAG).size() != 0) { - result.append("<tr><td class=\"" + cssClass + "\">Tags:\n"); - for (Span s : ce.getFormatSpans(FormatType.TAG)) { - result.append(s.toString() + "\n"); - } - result.append("</td></tr>\n"); + private static String sectionCCLToHtml(Section s) + { + StringBuilder result = new StringBuilder(); + + if (s.getClass() == SectionContainer.class) { + for (Section ss : ((SectionContainer) s).getSubSections()) { + result.append(sectionToHtml(ss)); + } + } + else { + List<Content> ccl = s.getContentList(); + for (int i = (s.getTitleElement() != null ? 1 : 0); i < ccl.size(); i++) { + Content c = ccl.get(i); + Class<? 
extends Content> cc = c.getClass(); + if (cc == Paragraph.class) { + result.append(paragraphToHtml((Paragraph) c)); + } + else if (cc == DefinitionList.class) { + result.append(definitionListToHtml((DefinitionList) c)); + } + else if (cc == NestedListContainer.class) { + result.append(nestedListToHtml((NestedList) c)); + } + else if (cc == Table.class) { + result.append(tableToHtml((Table) c)); + } + else { + result.append("\n<pre>UNKNOWN CLASS: " + cc + "\n" + convertTags(c.toString()) + + "</pre>\n"); + } + } + } + + return result.toString(); } - if (ce.getLinks().size() != 0) { - result.append("<tr><td class=\"" + cssClass + "\">\n"); - for (Link l : ce.getLinks()) { - result.append(linkToHtml(l)); - } - result.append("</td></tr>\n"); + /** + * Generates HTML Output for a {@link Paragraph}. + */ + private static String paragraphToHtml(Paragraph p) + { + return contentElementToHtml(p, "Paragraph", "Paragraph: " + p.getType()); } - if (ce.getTemplates().size() != 0) { - result.append("<tr><td class=\"" + cssClass + "\">\n"); - for (Template t : ce.getTemplates()) { - result.append(templateToHtml(t)); - } - result.append("</td></tr>\n"); + /** + * Generates HTML Output for a {@link ContentElement}. + */ + private static String contentElementToHtml(ContentElement ce) + { + return contentElementToHtml(ce, "ContentElement", "ContentElement"); } - result.append("</table>\n"); + private static String contentElementToHtml(ContentElement ce, String cssClass, String headline) + { - return result.toString(); - } + StringBuilder result = new StringBuilder(); - /** - * Generates HTML Output for a {@link DefinitionList}. - */ - private static String definitionListToHtml(DefinitionList dl) { - if (dl == null) { - return "null"; - } + result.append("<table class=\"" + cssClass + "\">\n" + "<tr><th class=\"" + cssClass + "\">" + + headline + "</th></tr>\n" + "<tr><td class=\"" + cssClass + "\">\n" + "\"" + + convertTags(ce.getText()) + "\"\n" + "</td></tr>\n"); - StringBuilder result = new StringBuilder(); + String BoldWords = ce.getText(ce.getFormatSpans(FormatType.BOLD)); + if (BoldWords.length() > 0) { + result.append("<tr><td class=\"" + cssClass + "\">BoldWords: " + convertTags(BoldWords) + + "</td></tr>\n"); + } - result.append("<table class=\"DefinitionList\">\n" + - "<tr><th class=\"DefinitionList\">DefinitionList</th></tr>\n" + - "<tr><td class=\"DefinitionList\">"); + String ItalicWords = ce.getText(ce.getFormatSpans(FormatType.ITALIC)); + if (ItalicWords.length() > 0) { + result.append("<tr><td class=\"" + cssClass + "\">italicWords: " + + convertTags(ItalicWords) + "</td></tr>\n"); + } - if (dl.getDefinedTerm() != null) { - result.append(contentElementToHtml(dl.getDefinedTerm()) + "\n"); - } + if (ce.getFormatSpans(FormatType.MATH).size() != 0) { + result.append("<tr><td class=\"" + cssClass + "\">MathTags\n"); + for (Span s : ce.getFormatSpans(FormatType.MATH)) { + result.append(s.toString() + "\n"); + } + result.append("</td></tr>\n"); + } - result.append("<ul>"); - for (ContentElement ce : dl.getDefinitions()) { - result.append("<li>" + contentElementToHtml(ce) + "</li>"); - } + if (ce.getFormatSpans(FormatType.TAG).size() != 0) { + result.append("<tr><td class=\"" + cssClass + "\">Tags:\n"); + for (Span s : ce.getFormatSpans(FormatType.TAG)) { + result.append(s.toString() + "\n"); + } + result.append("</td></tr>\n"); + } + + if (ce.getLinks().size() != 0) { + result.append("<tr><td class=\"" + cssClass + "\">\n"); + for (Link l : ce.getLinks()) { + result.append(linkToHtml(l)); + } + 
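// ------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the same ContentElement
// accessors used by contentElementToHtml can be called directly, e.g.
// to pull out bold phrases and link targets. The import path of
// FormatType (taken to be nested in Content) is an assumption.
// ------------------------------------------------------------------
import org.dkpro.jwpl.parser.Content.FormatType;
import org.dkpro.jwpl.parser.ContentElement;
import org.dkpro.jwpl.parser.Link;

class FormattingSketch
{
    static void report(ContentElement ce)
    {
        // Text covered by the BOLD format spans of this element.
        String boldWords = ce.getText(ce.getFormatSpans(FormatType.BOLD));
        System.out.println("bold: " + boldWords);

        for (Link l : ce.getLinks()) {
            System.out.println(l.getType() + ": " + l.getText() + " -> " + l.getTarget());
        }
    }
}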
result.append("</td></tr>\n"); + } - result.append("</ul>\n"); - result.append("</td></tr>\n"); - result.append("</table>\n"); + if (ce.getTemplates().size() != 0) { + result.append("<tr><td class=\"" + cssClass + "\">\n"); + for (Template t : ce.getTemplates()) { + result.append(templateToHtml(t)); + } + result.append("</td></tr>\n"); + } - return result.toString(); - } + result.append("</table>\n"); - /** - * Generates HTML Output for a {@link NestedList}. - */ - private static String nestedListToHtml(NestedList nl) { - if (nl == null) { - return "null"; + return result.toString(); } - StringBuilder result = new StringBuilder(); + /** + * Generates HTML Output for a {@link DefinitionList}. + */ + private static String definitionListToHtml(DefinitionList dl) + { + if (dl == null) { + return "null"; + } - if (nl.getClass() == NestedListElement.class) { - result.append("<li>\n" + contentElementToHtml((NestedListElement) nl) + "</li>\n"); - } else { - result.append("<table class=\"NestedList\">\n" + - "<tr><th class=\"NestedList\">NestedList</th></tr>\n" + - "<tr><td class=\"NestedList\">"); + StringBuilder result = new StringBuilder(); - result.append((((NestedListContainer) nl).isNumbered() ? "<ol>" : "<ul>") + "\n"); - for (NestedList nl2 : ((NestedListContainer) nl).getNestedLists()) { - result.append(nestedListToHtml(nl2)); - } - result.append((((NestedListContainer) nl).isNumbered() ? "</ol>" : "</ul>") + "\n"); + result.append("<table class=\"DefinitionList\">\n" + + "<tr><th class=\"DefinitionList\">DefinitionList</th></tr>\n" + + "<tr><td class=\"DefinitionList\">"); - result.append("</td></tr>\n"); - result.append("</table>\n"); - } + if (dl.getDefinedTerm() != null) { + result.append(contentElementToHtml(dl.getDefinedTerm()) + "\n"); + } - return result.toString(); - } + result.append("<ul>"); + for (ContentElement ce : dl.getDefinitions()) { + result.append("<li>" + contentElementToHtml(ce) + "</li>"); + } - /** - * Generates HTML Output for a {@link Table}. - */ - private static String tableToHtml(Table t) { + result.append("</ul>\n"); + result.append("</td></tr>\n"); + result.append("</table>\n"); - if (t == null) { - return "null"; + return result.toString(); } - StringBuilder result = new StringBuilder(); + /** + * Generates HTML Output for a {@link NestedList}. + */ + private static String nestedListToHtml(NestedList nl) + { + if (nl == null) { + return "null"; + } - int colspan; - try { - colspan = t.getTableElement(t.nrOfTableElements() - 1).getCol() + 1; - } catch (Exception e) { - colspan = 1; - } + StringBuilder result = new StringBuilder(); - result.append("<table class=\"Table\">\n<tr><th colspan=" + colspan + " class=\"Table\">Table"); + if (nl.getClass() == NestedListElement.class) { + result.append("<li>\n" + contentElementToHtml((NestedListElement) nl) + "</li>\n"); + } + else { + result.append("<table class=\"NestedList\">\n" + + "<tr><th class=\"NestedList\">NestedList</th></tr>\n" + + "<tr><td class=\"NestedList\">"); + + result.append((((NestedListContainer) nl).isNumbered() ? "<ol>" : "<ul>") + "\n"); + for (NestedList nl2 : ((NestedListContainer) nl).getNestedLists()) { + result.append(nestedListToHtml(nl2)); + } + result.append((((NestedListContainer) nl).isNumbered() ? 
"</ol>" : "</ul>") + "\n"); + + result.append("</td></tr>\n"); + result.append("</table>\n"); + } - if (t.getTitleElement() != null) { - result.append(contentElementToHtml(t.getTitleElement())); + return result.toString(); } - result.append("</th></tr>\n<tr>\n"); + /** + * Generates HTML Output for a {@link Table}. + */ + private static String tableToHtml(Table t) + { + + if (t == null) { + return "null"; + } - int row = 0; - for (int i = 0; i < t.nrOfTableElements(); i++) { - TableElement td = t.getTableElement(i); - if (td.getRow() > row) { - result.append("</tr><tr>\n"); - row = td.getRow(); - } + StringBuilder result = new StringBuilder(); - result.append("<td class=\"Table\">\n" + tableElementToHtml(td) + "</td>\n"); - } + int colspan; + try { + colspan = t.getTableElement(t.nrOfTableElements() - 1).getCol() + 1; + } + catch (Exception e) { + colspan = 1; + } - result.append("</tr>\n</table>\n"); - return result.toString(); - } + result.append( + "<table class=\"Table\">\n<tr><th colspan=" + colspan + " class=\"Table\">Table"); - /** - * Generates HTML Output for a {@link TableElement}. - */ - private static String tableElementToHtml(TableElement td) { - StringBuilder result = new StringBuilder(); + if (t.getTitleElement() != null) { + result.append(contentElementToHtml(t.getTitleElement())); + } - result.append("Row: " + td.getRow() + " Col: " + td.getCol() + "\n"); + result.append("</th></tr>\n<tr>\n"); - if (td.nrOfSections() == 1 && td.getSection(0).getTitleElement() == null) { - result.append(sectionCCLToHtml(td.getSection(0))); - } else { - for (int i = 0; i < td.nrOfSections(); i++) { - result.append(sectionToHtml(td.getSection(i))); - } - } + int row = 0; + for (int i = 0; i < t.nrOfTableElements(); i++) { + TableElement td = t.getTableElement(i); + if (td.getRow() > row) { + result.append("</tr><tr>\n"); + row = td.getRow(); + } - return result.toString(); - } + result.append("<td class=\"Table\">\n" + tableElementToHtml(td) + "</td>\n"); + } - /** - * Generates HTML Output for a {@link Link}. - */ - private static String linkToHtml(Link l) { - if (l == null) { - return "null"; + result.append("</tr>\n</table>\n"); + return result.toString(); } - StringBuilder result = new StringBuilder(); + /** + * Generates HTML Output for a {@link TableElement}. + */ + private static String tableElementToHtml(TableElement td) + { + StringBuilder result = new StringBuilder(); + + result.append("Row: " + td.getRow() + " Col: " + td.getCol() + "\n"); - result.append("<div class=\"Link\"><b class=\"Link\">Link:</b>" + - l.getType() + ": \"" + - convertTags(l.getText()) + "\" -> \"" + convertTags(l.getTarget()) + "\""); + if (td.nrOfSections() == 1 && td.getSection(0).getTitleElement() == null) { + result.append(sectionCCLToHtml(td.getSection(0))); + } + else { + for (int i = 0; i < td.nrOfSections(); i++) { + result.append(sectionToHtml(td.getSection(i))); + } + } - if (l.getParameters().size() != 0) { - for (String parameter : l.getParameters()) { - result.append("<br>\nPARAMETER: \"" + convertTags(parameter) + "\""); - } + return result.toString(); } - result.append("</div>\n"); + /** + * Generates HTML Output for a {@link Link}. + */ + private static String linkToHtml(Link l) + { + if (l == null) { + return "null"; + } - return result.toString(); - } + StringBuilder result = new StringBuilder(); - /** - * Generates HTML Output for a {@link Template}. 
- */ - private static String templateToHtml(Template t) { - if (t == null) { - return "null"; - } + result.append("<div class=\"Link\"><b class=\"Link\">Link:</b>" + l.getType() + ": \"" + + convertTags(l.getText()) + "\" -> \"" + convertTags(l.getTarget()) + "\""); - StringBuilder result = new StringBuilder(); - - result.append( - "<table class=\"Template\">\n" + - "<tr><th class=\"Template\">Template</th></tr>\n" + - "<tr><td class=\"Template\">" + - "Name: \"" + convertTags(t.getName()) + "\"<br>" + - "</td></tr>\n"); - - if (t.getParameters().size() != 0) { - result.append("<tr><td class=\"Template\">"); - for (String parameter : t.getParameters()) { - result.append("Parameter: \"" + convertTags(parameter) + "\"<br>"); - } - result.append("</td></tr>\n"); + if (l.getParameters().size() != 0) { + for (String parameter : l.getParameters()) { + result.append("<br>\nPARAMETER: \"" + convertTags(parameter) + "\""); + } + } + + result.append("</div>\n"); + + return result.toString(); } - result.append("</table>"); + /** + * Generates HTML Output for a {@link Template}. + */ + private static String templateToHtml(Template t) + { + if (t == null) { + return "null"; + } - return result.toString(); - } + StringBuilder result = new StringBuilder(); - private static String convertTags(String s) { - if (s == null) { - return null; - } + result.append("<table class=\"Template\">\n" + + "<tr><th class=\"Template\">Template</th></tr>\n" + "<tr><td class=\"Template\">" + + "Name: \"" + convertTags(t.getName()) + "\"<br>" + "</td></tr>\n"); - StringBuilder result = new StringBuilder(s); + if (t.getParameters().size() != 0) { + result.append("<tr><td class=\"Template\">"); + for (String parameter : t.getParameters()) { + result.append("Parameter: \"" + convertTags(parameter) + "\"<br>"); + } + result.append("</td></tr>\n"); + } - int temp; + result.append("</table>"); - temp = 0; - while ((temp = result.indexOf("<", temp)) != -1) { - result.replace(temp, temp + 1, "<"); + return result.toString(); } - temp = 0; - while ((temp = result.indexOf(">", temp)) != -1) { - result.replace(temp, temp + 1, ">"); - } + private static String convertTags(String s) + { + if (s == null) { + return null; + } - return result.toString(); - } + StringBuilder result = new StringBuilder(s); - public static void writeFile(String filename, String encoding, String text) { + int temp; + + temp = 0; + while ((temp = result.indexOf("<", temp)) != -1) { + result.replace(temp, temp + 1, "<"); + } + + temp = 0; + while ((temp = result.indexOf(">", temp)) != -1) { + result.replace(temp, temp + 1, ">"); + } - File outFile = new File(filename); - try (Writer destFile = new BufferedWriter(new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(outFile)), encoding))) { - destFile.write(text); - } catch (UnsupportedEncodingException e1) { - logger.error("Unsupported encoding exception while opening file '{}'", outFile.getAbsolutePath(), e1); - } catch (FileNotFoundException e1) { - logger.error("File '{}' not found.", outFile.getAbsolutePath(), e1); - } catch (IOException e) { - logger.error("IO exception while writing file '{}", outFile.getAbsolutePath(), e); + return result.toString(); + } + + public static void writeFile(String filename, String encoding, String text) + { + + File outFile = new File(filename); + try (Writer destFile = new BufferedWriter(new OutputStreamWriter( + new BufferedOutputStream(new FileOutputStream(outFile)), encoding))) { + destFile.write(text); + } + catch (UnsupportedEncodingException e1) { + 
logger.error("Unsupported encoding exception while opening file '{}'", + outFile.getAbsolutePath(), e1); + } + catch (FileNotFoundException e1) { + logger.error("File '{}' not found.", outFile.getAbsolutePath(), e1); + } + catch (IOException e) { + logger.error("IO exception while writing file '{}", outFile.getAbsolutePath(), e); + } } - } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/ParsedPageCSS.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/ParsedPageCSS.java index 55b621eb..8874838d 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/ParsedPageCSS.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/html/ParsedPageCSS.java @@ -17,100 +17,103 @@ */ package org.dkpro.jwpl.parser.html; -public class ParsedPageCSS { +public class ParsedPageCSS +{ - private static final String LF = "\n"; + private static final String LF = "\n"; - public static String getFileText() { - StringBuilder sb = new StringBuilder(); + public static String getFileText() + { + StringBuilder sb = new StringBuilder(); - sb.append("body"); - sb.append("{"); - sb.append(" font-size: 10pt;"); - sb.append(" font-family: Arial;"); - sb.append("}"); - sb.append(LF); - sb.append("table"); - sb.append("{"); - sb.append(" border-collapse: collapse;"); - sb.append(" border-spacing: 10px;"); - sb.append(" margin: 10px;"); - sb.append(" vertical-align: top;"); - sb.append("}"); - sb.append(LF); - sb.append("th{"); - sb.append(" text-align: left;"); - sb.append(" border-width: 1px;"); - sb.append(" border-color: #000000;"); - sb.append(" border-style: solid;"); - sb.append(LF); - sb.append(" font-size: 10pt;"); - sb.append(" font-family: Arial;"); - sb.append(" font-weight: normal;"); - sb.append(" "); - sb.append(" padding: 10px;"); - sb.append("}"); - sb.append(LF); - sb.append("td{"); - sb.append(" border-width: 1px;"); - sb.append(" border-color: #000000;"); - sb.append(" border-style: solid;"); - sb.append(" "); - sb.append(" font-size: 10pt;"); - sb.append(" font-family: monospace;"); - sb.append(" vertical-align: top;"); - sb.append(" "); - sb.append(" padding: 10px;"); - sb.append("}"); - sb.append(LF); - sb.append("table.ParsedPage{}"); - sb.append("th.ParsedPage{ background-color: #FF8900; }"); - sb.append("td.ParsedPage{ background-color: #FFD29E; }"); - sb.append(LF); - sb.append("table.Section{ width: 100%; }"); - sb.append("th.Section{ margin: 0px; padding: 0px; background-color: #FFFF00; }"); - sb.append(" table.SectionTh{ margin: 0px;}"); - sb.append(" th.SectionTh{ border-width: 0px; border-style:none; background-color: #FFFF00; vertical-align: middle; }"); - sb.append("td.Section{ background-color: #EEEEEE; }"); - sb.append(LF); - sb.append("table.Template{ margin: 2px; }"); - sb.append("th.Template{ font-size: 7pt; padding: 1px; background-color: #99CCCC; }"); - sb.append("td.Template{ padding: 5px; }"); - sb.append(""); - sb.append("table.Table{ margin: 2px; background-color: #EEEEEE; }"); - sb.append("th.Table{ font-size: 7pt; padding: 1px; background-color: #FF0000; }"); - sb.append("td.Table{ padding: 5px; background-color: #FFCCCC;}"); - sb.append(LF); - sb.append(LF); - sb.append("b.Link{ color: #0000FF; }"); - sb.append("div.Link{"); - sb.append(" padding-left: 5px;"); - sb.append(" padding-right: 5px;"); - sb.append(" margin: 1px;"); - sb.append(" border-width: 1px;"); - sb.append(" border-color: #999999;"); - sb.append(" border-style: solid; "); - sb.append(" background-color: #EEEEEE;"); - sb.append("}"); - sb.append(LF); - 
sb.append("table.ContentElement{ margin: 2px; }"); - sb.append("th.ContentElement{ font-size: 7pt; padding: 1px; background-color: #6699CC; }"); - sb.append("td.ContentElement{ padding: 5px; background-color: #FFFFFF;}"); - sb.append(LF); - sb.append("table.Paragraph{ margin: 2px; }"); - sb.append("th.Paragraph{ font-size: 7pt; padding: 1px; background-color: #66CC00; }"); - sb.append("td.Paragraph{ padding: 5px; background-color: #FFFFFF; }"); - sb.append(LF); - sb.append("table.NestedList{ margin: 2px; }"); - sb.append("th.NestedList{ font-size: 7pt; padding: 1px; background-color: #66CC00; }"); - sb.append("td.NestedList{ padding: 5px; background-color: #CCFFCC; }"); - sb.append(LF); - sb.append("table.DefinitionList{ margin: 2px; }"); - sb.append("th.DefinitionList{ font-size: 7pt; padding: 1px; background-color: #66CC00; }"); - sb.append("td.DefinitionList{ padding: 5px; background-color: #CCFFCC; }"); - sb.append(LF); + sb.append("body"); + sb.append("{"); + sb.append(" font-size: 10pt;"); + sb.append(" font-family: Arial;"); + sb.append("}"); + sb.append(LF); + sb.append("table"); + sb.append("{"); + sb.append(" border-collapse: collapse;"); + sb.append(" border-spacing: 10px;"); + sb.append(" margin: 10px;"); + sb.append(" vertical-align: top;"); + sb.append("}"); + sb.append(LF); + sb.append("th{"); + sb.append(" text-align: left;"); + sb.append(" border-width: 1px;"); + sb.append(" border-color: #000000;"); + sb.append(" border-style: solid;"); + sb.append(LF); + sb.append(" font-size: 10pt;"); + sb.append(" font-family: Arial;"); + sb.append(" font-weight: normal;"); + sb.append(" "); + sb.append(" padding: 10px;"); + sb.append("}"); + sb.append(LF); + sb.append("td{"); + sb.append(" border-width: 1px;"); + sb.append(" border-color: #000000;"); + sb.append(" border-style: solid;"); + sb.append(" "); + sb.append(" font-size: 10pt;"); + sb.append(" font-family: monospace;"); + sb.append(" vertical-align: top;"); + sb.append(" "); + sb.append(" padding: 10px;"); + sb.append("}"); + sb.append(LF); + sb.append("table.ParsedPage{}"); + sb.append("th.ParsedPage{ background-color: #FF8900; }"); + sb.append("td.ParsedPage{ background-color: #FFD29E; }"); + sb.append(LF); + sb.append("table.Section{ width: 100%; }"); + sb.append("th.Section{ margin: 0px; padding: 0px; background-color: #FFFF00; }"); + sb.append(" table.SectionTh{ margin: 0px;}"); + sb.append( + " th.SectionTh{ border-width: 0px; border-style:none; background-color: #FFFF00; vertical-align: middle; }"); + sb.append("td.Section{ background-color: #EEEEEE; }"); + sb.append(LF); + sb.append("table.Template{ margin: 2px; }"); + sb.append("th.Template{ font-size: 7pt; padding: 1px; background-color: #99CCCC; }"); + sb.append("td.Template{ padding: 5px; }"); + sb.append(""); + sb.append("table.Table{ margin: 2px; background-color: #EEEEEE; }"); + sb.append("th.Table{ font-size: 7pt; padding: 1px; background-color: #FF0000; }"); + sb.append("td.Table{ padding: 5px; background-color: #FFCCCC;}"); + sb.append(LF); + sb.append(LF); + sb.append("b.Link{ color: #0000FF; }"); + sb.append("div.Link{"); + sb.append(" padding-left: 5px;"); + sb.append(" padding-right: 5px;"); + sb.append(" margin: 1px;"); + sb.append(" border-width: 1px;"); + sb.append(" border-color: #999999;"); + sb.append(" border-style: solid; "); + sb.append(" background-color: #EEEEEE;"); + sb.append("}"); + sb.append(LF); + sb.append("table.ContentElement{ margin: 2px; }"); + sb.append("th.ContentElement{ font-size: 7pt; padding: 1px; background-color: #6699CC; 
}"); + sb.append("td.ContentElement{ padding: 5px; background-color: #FFFFFF;}"); + sb.append(LF); + sb.append("table.Paragraph{ margin: 2px; }"); + sb.append("th.Paragraph{ font-size: 7pt; padding: 1px; background-color: #66CC00; }"); + sb.append("td.Paragraph{ padding: 5px; background-color: #FFFFFF; }"); + sb.append(LF); + sb.append("table.NestedList{ margin: 2px; }"); + sb.append("th.NestedList{ font-size: 7pt; padding: 1px; background-color: #66CC00; }"); + sb.append("td.NestedList{ padding: 5px; background-color: #CCFFCC; }"); + sb.append(LF); + sb.append("table.DefinitionList{ margin: 2px; }"); + sb.append("th.DefinitionList{ font-size: 7pt; padding: 1px; background-color: #66CC00; }"); + sb.append("td.DefinitionList{ padding: 5px; background-color: #CCFFCC; }"); + sb.append(LF); - return sb.toString(); - } + return sb.toString(); + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/EmptyStructureRemover.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/EmptyStructureRemover.java index 4ecb30fa..31423eed 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/EmptyStructureRemover.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/EmptyStructureRemover.java @@ -28,130 +28,146 @@ import org.dkpro.jwpl.parser.Table; import org.dkpro.jwpl.parser.TableElement; - /** - * It is possible that some Elements which has been parsed are empty after - * the Parsing process becaus of the options which has been set. This class - * can remove these empty elmentens. + * It is possible that some Elements which has been parsed are empty after the Parsing process + * becaus of the options which has been set. This class can remove these empty elmentens. */ -class EmptyStructureRemover { - - /** - * Removes all empty Structures from a SectionContainer and all substructures. - */ - public static SectionContainer eliminateEmptyStructures(SectionContainer sc) { - - for (int i = sc.nrOfSubSections() - 1; i >= 0; i--) { - Section ss = sc.getSubSection(i); - - if (ss.getClass() == SectionContainer.class) { - SectionContainer sci = (SectionContainer) ss; - eliminateEmptyStructures(sci); - } else if (ss.getClass() == SectionContent.class) - eliminateEmptyStructures((SectionContent) ss); - - if (ss.empty()) sc.removeSection(ss); +class EmptyStructureRemover +{ + + /** + * Removes all empty Structures from a SectionContainer and all substructures. 
+ */ + public static SectionContainer eliminateEmptyStructures(SectionContainer sc) + { + + for (int i = sc.nrOfSubSections() - 1; i >= 0; i--) { + Section ss = sc.getSubSection(i); + + if (ss.getClass() == SectionContainer.class) { + SectionContainer sci = (SectionContainer) ss; + eliminateEmptyStructures(sci); + } + else if (ss.getClass() == SectionContent.class) + eliminateEmptyStructures((SectionContent) ss); + + if (ss.empty()) + sc.removeSection(ss); + } + + // encapsulating Sections + if (sc.nrOfSubSections() == 1 && sc.getSubSection(0).getClass() == SectionContainer.class) { + SectionContainer sc0 = (SectionContainer) sc.getSubSection(0); + if (sc0.getTitleElement() == null) { + sc.removeSection(sc0); + for (int i = 0; i < sc0.nrOfSubSections(); i++) + sc.addSection(sc0.getSubSection(i)); + } + } + + return sc; } - //encapsulating Sections - if (sc.nrOfSubSections() == 1 && sc.getSubSection(0).getClass() == SectionContainer.class) { - SectionContainer sc0 = (SectionContainer) sc.getSubSection(0); - if (sc0.getTitleElement() == null) { - sc.removeSection(sc0); - for (int i = 0; i < sc0.nrOfSubSections(); i++) - sc.addSection(sc0.getSubSection(i)); - } - } - - return sc; - } - - /** - * Removes all empty Structures from a SectionContent and all substructures. - */ - public static SectionContent eliminateEmptyStructures(SectionContent sc) { - - for (int i = sc.nrOfParagraphs() - 1; i >= 0; i--) { - Paragraph p = sc.getParagraph(i); - if (p.empty()) sc.removeParagraph(p); + /** + * Removes all empty Structures from a SectionContent and all substructures. + */ + public static SectionContent eliminateEmptyStructures(SectionContent sc) + { + + for (int i = sc.nrOfParagraphs() - 1; i >= 0; i--) { + Paragraph p = sc.getParagraph(i); + if (p.empty()) + sc.removeParagraph(p); + } + + for (int i = sc.nrOfDefinitionLists() - 1; i >= 0; i--) { + DefinitionList dl = sc.getDefinitionList(i); + eliminateEmptyStructures(dl); + if (dl.empty()) + sc.removeDefinitionList(dl); + } + + for (int i = sc.nrOfNestedLists() - 1; i >= 0; i--) { + NestedListContainer nl = sc.getNestedList(i); + eliminateEmptyStructures(nl); + if (nl.empty()) + sc.removeNestedList(nl); + } + + for (int i = sc.nrOfTables() - 1; i >= 0; i--) { + Table t = sc.getTable(i); + eliminateEmptyStructures(t); + if (t.empty()) + sc.removeTable(t); + } + + return sc; } - for (int i = sc.nrOfDefinitionLists() - 1; i >= 0; i--) { - DefinitionList dl = sc.getDefinitionList(i); - eliminateEmptyStructures(dl); - if (dl.empty()) sc.removeDefinitionList(dl); + /** + * Removes all empty Structures from a NestedListContainer and all substructures. + */ + public static NestedListContainer eliminateEmptyStructures(NestedListContainer nlc) + { + for (int i = nlc.size() - 1; i >= 0; i--) { + NestedList nl = nlc.getNestedList(i); + if (nl.getClass() == NestedListContainer.class) + eliminateEmptyStructures((NestedListContainer) nl); + + if (nl.empty()) + nlc.remove(nl); + } + return nlc; } - for (int i = sc.nrOfNestedLists() - 1; i >= 0; i--) { - NestedListContainer nl = sc.getNestedList(i); - eliminateEmptyStructures(nl); - if (nl.empty()) sc.removeNestedList(nl); + /** + * Removes all empty Structures from a Table and all substructures. 
+ */ + public static Table eliminateEmptyStructures(Table t) + { + for (int i = t.nrOfTableElements() - 1; i >= 0; i--) { + TableElement te = t.getTableElement(i); + eliminateEmptyStructures(te); + if (te.empty()) + t.removeTableElement(te); + } + return t; } - for (int i = sc.nrOfTables() - 1; i >= 0; i--) { - Table t = sc.getTable(i); - eliminateEmptyStructures(t); - if (t.empty()) sc.removeTable(t); + /** + * Removes all empty Structures from a TableElement and all substructures. + */ + public static TableElement eliminateEmptyStructures(TableElement te) + { + for (int i = te.nrOfSections() - 1; i >= 0; i--) { + Section s = te.getSection(i); + + if (s.getClass() == SectionContainer.class) + eliminateEmptyStructures((SectionContainer) s); + else if (s.getClass() == SectionContent.class) + eliminateEmptyStructures((SectionContent) s); + + if (s.empty()) + te.removeSection(s); + } + return te; } - return sc; - } - - /** - * Removes all empty Structures from a NestedListContainer and all substructures. - */ - public static NestedListContainer eliminateEmptyStructures(NestedListContainer nlc) { - for (int i = nlc.size() - 1; i >= 0; i--) { - NestedList nl = nlc.getNestedList(i); - if (nl.getClass() == NestedListContainer.class) - eliminateEmptyStructures((NestedListContainer) nl); - - if (nl.empty()) nlc.remove(nl); - } - return nlc; - } - - /** - * Removes all empty Structures from a Table and all substructures. - */ - public static Table eliminateEmptyStructures(Table t) { - for (int i = t.nrOfTableElements() - 1; i >= 0; i--) { - TableElement te = t.getTableElement(i); - eliminateEmptyStructures(te); - if (te.empty()) t.removeTableElement(te); - } - return t; - } - - /** - * Removes all empty Structures from a TableElement and all substructures. - */ - public static TableElement eliminateEmptyStructures(TableElement te) { - for (int i = te.nrOfSections() - 1; i >= 0; i--) { - Section s = te.getSection(i); - - if (s.getClass() == SectionContainer.class) - eliminateEmptyStructures((SectionContainer) s); - else if (s.getClass() == SectionContent.class) - eliminateEmptyStructures((SectionContent) s); - - if (s.empty()) te.removeSection(s); - } - return te; - } - - /** - * Removes all empty Structures from a DefinitionList and all substructures. - */ - public static DefinitionList eliminateEmptyStructures(DefinitionList dl) { - - ContentElement dt = dl.getDefinedTerm(); - if (dt != null && dt.empty()) dl.setDefinedTerm(null); - - for (int i = dl.nrOfDefinitions() - 1; i >= 0; i--) { - ContentElement ce = dl.getDefinition(i); - if (ce.empty()) dl.removeDefinition(ce); + /** + * Removes all empty Structures from a DefinitionList and all substructures. 
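// ------------------------------------------------------------------
// Illustrative sketch, not part of the patch: EmptyStructureRemover is
// package-private, so a caller has to live in
// org.dkpro.jwpl.parser.mediawiki. A parser in this package would hand
// its root SectionContainer through it once the page structure exists.
// ------------------------------------------------------------------
package org.dkpro.jwpl.parser.mediawiki;

import org.dkpro.jwpl.parser.SectionContainer;

class CleanupSketch
{
    static SectionContainer cleanUp(SectionContainer root)
    {
        // Recursively drops empty paragraphs, lists and tables and unwraps a
        // single untitled SectionContainer child.
        return EmptyStructureRemover.eliminateEmptyStructures(root);
    }
}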
+ */ + public static DefinitionList eliminateEmptyStructures(DefinitionList dl) + { + + ContentElement dt = dl.getDefinedTerm(); + if (dt != null && dt.empty()) + dl.setDefinedTerm(null); + + for (int i = dl.nrOfDefinitions() - 1; i >= 0; i--) { + ContentElement ce = dl.getDefinition(i); + if (ce.empty()) + dl.removeDefinition(ce); + } + return dl; } - return dl; - } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/FlushTemplates.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/FlushTemplates.java index 95208060..c5107253 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/FlushTemplates.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/FlushTemplates.java @@ -23,17 +23,21 @@ /** * This TemplateParser will delete ALL templates, whitout any exception! */ -public final class FlushTemplates implements MediaWikiTemplateParser { +public final class FlushTemplates + implements MediaWikiTemplateParser +{ - public ResolvedTemplate parseTemplate(Template t, ParsedPage pp) { - ResolvedTemplate result = new ResolvedTemplate(t); - result.setPreParseReplacement(ResolvedTemplate.TEMPLATESPACER); - result.setPostParseReplacement(""); - result.setParsedObject(null); - return result; - } + public ResolvedTemplate parseTemplate(Template t, ParsedPage pp) + { + ResolvedTemplate result = new ResolvedTemplate(t); + result.setPreParseReplacement(ResolvedTemplate.TEMPLATESPACER); + result.setPostParseReplacement(""); + result.setParsedObject(null); + return result; + } - public String configurationInfo() { - return "All Templates will be Deleted"; - } + public String configurationInfo() + { + return "All Templates will be Deleted"; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/GermanTemplateParser.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/GermanTemplateParser.java index df112b3f..29f04657 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/GermanTemplateParser.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/GermanTemplateParser.java @@ -27,114 +27,128 @@ import org.slf4j.LoggerFactory; /** - * This is the TemplateParser for the german language, with special treatment - * for all the german templates, like "Dieser Artikel" or "Deutschlandlastig". + * This is the TemplateParser for the german language, with special treatment for all the german + * templates, like "Dieser Artikel" or "Deutschlandlastig". 
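// ------------------------------------------------------------------
// Illustrative sketch, not part of the patch: the two template parser
// implementations shown here differ only in policy. FlushTemplates
// drops every template, GermanTemplateParser gets explicit name lists.
// The sketch lives in the same package so it can pass null for the
// package-private MediaWikiContentElementParser (which the current
// constructor does not use); the template names are examples taken
// from the code above.
// ------------------------------------------------------------------
package org.dkpro.jwpl.parser.mediawiki;

import java.util.List;

class TemplateParserSketch
{
    static void show()
    {
        FlushTemplates flush = new FlushTemplates();
        System.out.println(flush.configurationInfo()); // "All Templates will be Deleted"

        GermanTemplateParser german = new GermanTemplateParser(
                null,                                          // content element parser (currently unused)
                List.of("Deutschlandlastig"),                  // templates to delete
                List.of("Dieser Artikel", "Audio", "Video"));  // templates with special handling
        System.out.println(german.configurationInfo());
    }
}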
*/ -public class GermanTemplateParser implements MediaWikiTemplateParser { - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private final String templatePrefix = "TEMPLATE["; - private final String templatePostfix = "]"; - private final String parameterDivisor = ", "; - private final String templateNotImplementedPrefix = "TEMPLATE NOT IMPLEMENTED["; - private final String templateNotImplementedPostfix = "]"; - private final String emptyLinkText = "[ ]"; - - // private MediaWikiContentElementParser parser; - private final List<String> deleteTemplates; - private final List<String> parseTemplates; - - public GermanTemplateParser(MediaWikiContentElementParser parser, List<String> deleteTemplates, List<String> parseTemplates) { - this.deleteTemplates = deleteTemplates; - this.parseTemplates = parseTemplates; -// this.parser = parser; - } - - public String configurationInfo() { - StringBuilder result = new StringBuilder(); - result.append("Standard Template treatment: ShowNameAndParameters"); - result.append("\nDelete Templates: "); - for (String s : deleteTemplates) { - result.append("\"" + s + "\" "); +public class GermanTemplateParser + implements MediaWikiTemplateParser +{ + + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + private final String templatePrefix = "TEMPLATE["; + private final String templatePostfix = "]"; + private final String parameterDivisor = ", "; + private final String templateNotImplementedPrefix = "TEMPLATE NOT IMPLEMENTED["; + private final String templateNotImplementedPostfix = "]"; + private final String emptyLinkText = "[ ]"; + + // private MediaWikiContentElementParser parser; + private final List<String> deleteTemplates; + private final List<String> parseTemplates; + + public GermanTemplateParser(MediaWikiContentElementParser parser, List<String> deleteTemplates, + List<String> parseTemplates) + { + this.deleteTemplates = deleteTemplates; + this.parseTemplates = parseTemplates; + // this.parser = parser; } - result.append("\nParse Templates: "); - for (String s : parseTemplates) { - result.append("\"" + s + "\" "); + + public String configurationInfo() + { + StringBuilder result = new StringBuilder(); + result.append("Standard Template treatment: ShowNameAndParameters"); + result.append("\nDelete Templates: "); + for (String s : deleteTemplates) { + result.append("\"" + s + "\" "); + } + result.append("\nParse Templates: "); + for (String s : parseTemplates) { + result.append("\"" + s + "\" "); + } + return result.toString(); } - return result.toString(); - } - public ResolvedTemplate parseTemplate(Template t, ParsedPage pp) { + public ResolvedTemplate parseTemplate(Template t, ParsedPage pp) + { - final String templateName = t.getName(); + final String templateName = t.getName(); - //Show Name and Parameters as Standart treatment. - ResolvedTemplate result = new ResolvedTemplate(t); - result.setPreParseReplacement(ResolvedTemplate.TEMPLATESPACER); - StringBuilder sb = new StringBuilder(); - sb.append(templatePrefix); - sb.append(t.getName() + parameterDivisor); - for (String s : t.getParameters()) { - sb.append(s + parameterDivisor); - } - sb.delete(sb.length() - parameterDivisor.length(), sb.length()); - sb.append(templatePostfix); - result.setPostParseReplacement(sb.toString()); + // Show Name and Parameters as Standart treatment. 
+ ResolvedTemplate result = new ResolvedTemplate(t); + result.setPreParseReplacement(ResolvedTemplate.TEMPLATESPACER); + StringBuilder sb = new StringBuilder(); + sb.append(templatePrefix); + sb.append(t.getName() + parameterDivisor); + for (String s : t.getParameters()) { + sb.append(s + parameterDivisor); + } + sb.delete(sb.length() - parameterDivisor.length(), sb.length()); + sb.append(templatePostfix); + result.setPostParseReplacement(sb.toString()); + + result.setParsedObject(t); + + // Delete Template if it is in the List + for (String s : deleteTemplates) { + if (s.equals(templateName)) { + result.setPostParseReplacement(""); + result.setParsedObject(null); + return result; + } + } - result.setParsedObject(t); + // Parse Template if it is in the List + for (String s : parseTemplates) { + List<String> templateParameters = t.getParameters(); + + if (s.equals(templateName)) { + logger.info("ParseTemplate: {}", templateName); + if (templateName.equals("Dieser Artikel")) { + + // I removed that from the core API, as it is not likely to be present in most + // non-German articles. (TZ) + // pp.setAboutArticle( parser.parseContentElement( templateParameters.get(0) )); + + result.setPostParseReplacement(""); + result.setParsedObject(null); + return result; + } + else if (templateName.equals("Audio") || templateName.equals("Audio genau")) { + if (templateParameters.size() == 0) { + break; + } + if (templateParameters.size() == 1) { + templateParameters.add(emptyLinkText); + } + result.setPostParseReplacement(t.getParameters().get(1)); + result.setParsedObject(new Link(null, t.getPos(), templateParameters.get(0), + Link.type.AUDIO, null)); + + return result; + } + else if (templateName.equals("Video")) { + if (templateParameters.size() == 0) { + break; + } + if (templateParameters.size() == 1) { + templateParameters.add(emptyLinkText); + } + result.setPostParseReplacement(t.getParameters().get(1)); + result.setParsedObject(new Link(null, t.getPos(), t.getParameters().get(0), + Link.type.VIDEO, null)); + return result; + } + else { + result.setPostParseReplacement(templateNotImplementedPrefix + templateName + + templateNotImplementedPostfix); + return result; + } + } + } - //Delete Template if it is in the List - for (String s : deleteTemplates) { - if (s.equals(templateName)) { - result.setPostParseReplacement(""); - result.setParsedObject(null); return result; - } } - - //Parse Template if it is in the List - for (String s : parseTemplates) { - List<String> templateParameters = t.getParameters(); - - if (s.equals(templateName)) { - logger.info("ParseTemplate: {}", templateName); - if (templateName.equals("Dieser Artikel")) { - -// I removed that from the core API, as it is not likely to be present in most non-German articles. 
(TZ) -// pp.setAboutArticle( parser.parseContentElement( templateParameters.get(0) )); - - result.setPostParseReplacement(""); - result.setParsedObject(null); - return result; - } else if (templateName.equals("Audio") || templateName.equals("Audio genau")) { - if (templateParameters.size() == 0) { - break; - } - if (templateParameters.size() == 1) { - templateParameters.add(emptyLinkText); - } - result.setPostParseReplacement(t.getParameters().get(1)); - result.setParsedObject(new Link(null, t.getPos(), templateParameters.get(0), Link.type.AUDIO, null)); - - return result; - } else if (templateName.equals("Video")) { - if (templateParameters.size() == 0) { - break; - } - if (templateParameters.size() == 1) { - templateParameters.add(emptyLinkText); - } - result.setPostParseReplacement(t.getParameters().get(1)); - result.setParsedObject(new Link(null, t.getPos(), t.getParameters().get(0), Link.type.VIDEO, null)); - return result; - } else { - result.setPostParseReplacement(templateNotImplementedPrefix + templateName + templateNotImplementedPostfix); - return result; - } - } - } - - return result; - } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiContentElementParser.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiContentElementParser.java index 38d97715..ef8ebb8a 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiContentElementParser.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiContentElementParser.java @@ -20,12 +20,13 @@ import org.dkpro.jwpl.parser.ContentElement; /** - * This Interface makes it possible to parse a single content element. - * Some TemplateParses might uses this Feauture. + * This Interface makes it possible to parse a single content element. Some TemplateParses might + * uses this Feauture. */ -interface MediaWikiContentElementParser { - /** - * Parses a ContentElement from a String. - */ - ContentElement parseContentElement(String src); +interface MediaWikiContentElementParser +{ + /** + * Parses a ContentElement from a String. + */ + ContentElement parseContentElement(String src); } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParser.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParser.java index 6b1643f8..913de4d9 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParser.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParser.java @@ -20,23 +20,23 @@ import org.dkpro.jwpl.parser.ParsedPage; /** - * This is an Interface for MediaWiki Parsers. Which simply "converts" - * MediaWiki Source, given as a String, to a ParsedPage + * This is an Interface for MediaWiki Parsers. Which simply "converts" MediaWiki Source, given as a + * String, to a ParsedPage */ -public interface MediaWikiParser { - /** - * Parses MediaWiki Source, given as parameter src, and returns a ParsedPage. - */ - ParsedPage parse(String src); +public interface MediaWikiParser +{ + /** + * Parses MediaWiki Source, given as parameter src, and returns a ParsedPage. + */ + ParsedPage parse(String src); - /** - * Retruns information abour the configuration of the parser. - */ - String configurationInfo(); + /** + * Retruns information abour the configuration of the parser. 
+ */ + String configurationInfo(); - /** - * Retruns the String which is uses as line separator, usually it - * will be "\n" or "\r\n" - */ - String getLineSeparator(); + /** + * Retruns the String which is uses as line separator, usually it will be "\n" or "\r\n" + */ + String getLineSeparator(); } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParserFactory.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParserFactory.java index f0663fb3..6b7d00e7 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParserFactory.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiParserFactory.java @@ -28,598 +28,634 @@ /** * A factory for easy creation of a configured {@link MediaWikiParser}. */ -public class MediaWikiParserFactory { - - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); - - private Class parserClass; - private Class templateParserClass; - private String lineSeparator; - private List<String> deleteTemplates; - private List<String> parseTemplates; - private List<String> categoryIdentifers; - private List<String> languageIdentifers; - private List<String> imageIdentifers; - private boolean showImageText; - private boolean deleteTags; - private boolean showMathTagContent; - private boolean calculateSrcSpans; - - /** - * Creates a new un-configured {@link MediaWikiParserFactory}. - */ - public MediaWikiParserFactory() { - initVariables(); - } - - /** - * Creates a fully configured {@link MediaWikiParserFactory} for the specified {@link Language}.<br> - * Next step is {@link MediaWikiParserFactory#createParser()}. - */ - public MediaWikiParserFactory(Language language) { - initVariables(); - if (language.equals(Language.german)) { - initGermanVariables(); - } else if (language.equals(Language.english)) { - initEnglishVariables(); - } else { - logger.warn("No language specific parser for '{}' available. Using default values.", language); - } - } - - private void initVariables() { - lineSeparator = "LF"; - parserClass = ModularParser.class; - imageIdentifers = new ArrayList<>(); - categoryIdentifers = new ArrayList<>(); - languageIdentifers = new ArrayList<>(); - deleteTemplates = new ArrayList<>(); - parseTemplates = new ArrayList<>(); - showImageText = false; - deleteTags = true; - showMathTagContent = true; - calculateSrcSpans = false; - templateParserClass = ShowTemplateNamesAndParameters.class; - - initLanguages(); - } - - private void initLanguages() { - //Init the Languages... 
- languageIdentifers.add("aa"); - languageIdentifers.add("ab"); - languageIdentifers.add("af"); - languageIdentifers.add("am"); - languageIdentifers.add("an"); - languageIdentifers.add("ar"); - languageIdentifers.add("as"); - languageIdentifers.add("av"); - languageIdentifers.add("ay"); - languageIdentifers.add("az"); - - languageIdentifers.add("ba"); - languageIdentifers.add("be"); - languageIdentifers.add("bg"); - languageIdentifers.add("bh"); - languageIdentifers.add("bi"); - languageIdentifers.add("bm"); - languageIdentifers.add("bn"); - languageIdentifers.add("bo"); - languageIdentifers.add("br"); - languageIdentifers.add("bs"); - - languageIdentifers.add("ca"); - languageIdentifers.add("ce"); - languageIdentifers.add("ch"); - languageIdentifers.add("co"); - languageIdentifers.add("cr"); - languageIdentifers.add("cs"); - languageIdentifers.add("cv"); - languageIdentifers.add("cy"); - - languageIdentifers.add("da"); - languageIdentifers.add("de"); - languageIdentifers.add("dk"); - languageIdentifers.add("dv"); - languageIdentifers.add("dz"); - - languageIdentifers.add("ee"); - languageIdentifers.add("el"); - languageIdentifers.add("en"); - languageIdentifers.add("eo"); - languageIdentifers.add("es"); - languageIdentifers.add("et"); - languageIdentifers.add("eu"); - - languageIdentifers.add("fa"); - languageIdentifers.add("ff"); - languageIdentifers.add("fi"); - languageIdentifers.add("fj"); - languageIdentifers.add("fo"); - languageIdentifers.add("fr"); - languageIdentifers.add("fy"); - - languageIdentifers.add("ga"); - languageIdentifers.add("gd"); - languageIdentifers.add("gl"); - languageIdentifers.add("gn"); - languageIdentifers.add("gu"); - languageIdentifers.add("gv"); - - languageIdentifers.add("ha"); - languageIdentifers.add("he"); - languageIdentifers.add("hi"); - languageIdentifers.add("hr"); - languageIdentifers.add("ht"); - languageIdentifers.add("hu"); - languageIdentifers.add("hy"); - - languageIdentifers.add("ia"); - languageIdentifers.add("id"); - languageIdentifers.add("ie"); - languageIdentifers.add("ig"); - languageIdentifers.add("ii"); - languageIdentifers.add("ik"); - languageIdentifers.add("io"); - languageIdentifers.add("is"); - languageIdentifers.add("it"); - languageIdentifers.add("iu"); - - languageIdentifers.add("ja"); - languageIdentifers.add("jv"); - - languageIdentifers.add("ka"); - languageIdentifers.add("kg"); - languageIdentifers.add("ki"); - languageIdentifers.add("kk"); - languageIdentifers.add("kl"); - languageIdentifers.add("km"); - languageIdentifers.add("kn"); - languageIdentifers.add("ko"); - languageIdentifers.add("ks"); - languageIdentifers.add("ku"); - languageIdentifers.add("kv"); - languageIdentifers.add("kw"); - languageIdentifers.add("ky"); - - languageIdentifers.add("la"); - languageIdentifers.add("lb"); - languageIdentifers.add("li"); - languageIdentifers.add("ln"); - languageIdentifers.add("lo"); - languageIdentifers.add("lt"); - languageIdentifers.add("lv"); - - languageIdentifers.add("mg"); - languageIdentifers.add("mh"); - languageIdentifers.add("mi"); - languageIdentifers.add("mk"); - languageIdentifers.add("ml"); - languageIdentifers.add("mn"); - languageIdentifers.add("mo"); - languageIdentifers.add("mr"); - languageIdentifers.add("ms"); - languageIdentifers.add("mt"); - languageIdentifers.add("my"); - - languageIdentifers.add("na"); - languageIdentifers.add("nb"); - languageIdentifers.add("ne"); - languageIdentifers.add("ng"); - languageIdentifers.add("nl"); - languageIdentifers.add("nn"); - languageIdentifers.add("no"); - 
languageIdentifers.add("nv"); - languageIdentifers.add("ny"); - - languageIdentifers.add("oc"); - languageIdentifers.add("os"); - languageIdentifers.add("pa"); - languageIdentifers.add("pl"); - languageIdentifers.add("ps"); - languageIdentifers.add("pt"); - - languageIdentifers.add("qu"); - - languageIdentifers.add("rm"); - languageIdentifers.add("rn"); - languageIdentifers.add("ro"); - languageIdentifers.add("ru"); - languageIdentifers.add("rw"); - - languageIdentifers.add("sa"); - languageIdentifers.add("sc"); - languageIdentifers.add("sd"); - languageIdentifers.add("se"); - languageIdentifers.add("sg"); - languageIdentifers.add("sh"); - languageIdentifers.add("si"); - languageIdentifers.add("sk"); - languageIdentifers.add("sl"); - languageIdentifers.add("sm"); - languageIdentifers.add("sn"); - languageIdentifers.add("so"); - languageIdentifers.add("sq"); - languageIdentifers.add("sr"); - languageIdentifers.add("ss"); - languageIdentifers.add("st"); - languageIdentifers.add("su"); - languageIdentifers.add("sv"); - languageIdentifers.add("sw"); - - languageIdentifers.add("ta"); - languageIdentifers.add("te"); - languageIdentifers.add("tg"); - languageIdentifers.add("th"); - languageIdentifers.add("ti"); - languageIdentifers.add("tk"); - languageIdentifers.add("tl"); - languageIdentifers.add("tn"); - languageIdentifers.add("to"); - languageIdentifers.add("tr"); - languageIdentifers.add("ts"); - languageIdentifers.add("tt"); - languageIdentifers.add("tw"); - languageIdentifers.add("ty"); - - languageIdentifers.add("ug"); - languageIdentifers.add("uk"); - languageIdentifers.add("ur"); - languageIdentifers.add("uz"); - - languageIdentifers.add("ve"); - languageIdentifers.add("vi"); - languageIdentifers.add("vo"); - - languageIdentifers.add("wa"); - languageIdentifers.add("wo"); - - languageIdentifers.add("xh"); - - languageIdentifers.add("yi"); - languageIdentifers.add("yo"); - - languageIdentifers.add("za"); - languageIdentifers.add("zh"); - languageIdentifers.add("zu"); - - languageIdentifers.add("als"); - languageIdentifers.add("ang"); - languageIdentifers.add("arc"); - languageIdentifers.add("ast"); - languageIdentifers.add("bug"); - languageIdentifers.add("ceb"); - languageIdentifers.add("chr"); - languageIdentifers.add("chy"); - languageIdentifers.add("csb"); - languageIdentifers.add("frp"); - languageIdentifers.add("fur"); - languageIdentifers.add("got"); - languageIdentifers.add("haw"); - languageIdentifers.add("ilo"); - languageIdentifers.add("jbo"); - languageIdentifers.add("ksh"); - languageIdentifers.add("lad"); - languageIdentifers.add("lmo"); - languageIdentifers.add("nah"); - languageIdentifers.add("nap"); - languageIdentifers.add("nds"); - languageIdentifers.add("nrm"); - languageIdentifers.add("pam"); - languageIdentifers.add("pap"); - languageIdentifers.add("pdc"); - languageIdentifers.add("pih"); - languageIdentifers.add("pms"); - languageIdentifers.add("rmy"); - languageIdentifers.add("scn"); - languageIdentifers.add("sco"); - languageIdentifers.add("tet"); - languageIdentifers.add("tpi"); - languageIdentifers.add("tum"); - languageIdentifers.add("udm"); - languageIdentifers.add("vec"); - languageIdentifers.add("vls"); - languageIdentifers.add("war"); - languageIdentifers.add("xal"); - - languageIdentifers.add("simple"); - } - - private void initGermanVariables() { - templateParserClass = FlushTemplates.class; - //deleteTemplates.add( "Prettytable" ); - //parseTemplates.add( "Dieser Artikel" ); - //parseTemplates.add( "Audio" ); - //parseTemplates.add( "Video" ); - 
imageIdentifers.add("Bild"); - imageIdentifers.add("Image"); - imageIdentifers.add("Datei"); - categoryIdentifers.add("Kategorie"); - languageIdentifers.remove("de"); - } - - private void initEnglishVariables() { - templateParserClass = FlushTemplates.class; - - imageIdentifers.add("Image"); - imageIdentifers.add("File"); - imageIdentifers.add("media"); - categoryIdentifers.add("Category"); - languageIdentifers.remove("en"); - } - - private String resolveLineSeparator() { - if (lineSeparator.equals("CRLF")) { - return "\r\n"; - } - if (lineSeparator.equals("LF")) { - return "\n"; - } - - logger.error( - "LineSeparator is UNKNOWN: \"" + lineSeparator + "\"\n" + - "Set LineSeparator to \"LF\" or \"CRLF\" for a Error free configuration"); - - return lineSeparator; - } - - /** - * Creates a MediaWikiParser with the configurations which has been set. - */ - public MediaWikiParser createParser() { - logger.debug("Selected Parser: {}", parserClass); - - if (parserClass == ModularParser.class) { - ModularParser mwgp = new ModularParser( -// resolveLineSeparator(), - "\n", - languageIdentifers, - categoryIdentifers, - imageIdentifers, - showImageText, - deleteTags, - showMathTagContent, - calculateSrcSpans, - null); - - StringBuilder sb = new StringBuilder(); - sb.append(lineSeparator + "languageIdentifers: "); - for (String s : languageIdentifers) { - sb.append(s + " "); - } - sb.append(lineSeparator + "categoryIdentifers: "); - for (String s : categoryIdentifers) { - sb.append(s + " "); - } - sb.append(lineSeparator + "imageIdentifers: "); - for (String s : imageIdentifers) { - sb.append(s + " "); - } - logger.debug(sb.toString()); - - MediaWikiTemplateParser mwtp; - - logger.debug("Selected TemplateParser: {}", templateParserClass); - if (templateParserClass == GermanTemplateParser.class) { - for (String s : deleteTemplates) { - logger.debug("DeleteTemplate: '{}'", s); +public class MediaWikiParserFactory +{ + + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + private Class parserClass; + private Class templateParserClass; + private String lineSeparator; + private List<String> deleteTemplates; + private List<String> parseTemplates; + private List<String> categoryIdentifers; + private List<String> languageIdentifers; + private List<String> imageIdentifers; + private boolean showImageText; + private boolean deleteTags; + private boolean showMathTagContent; + private boolean calculateSrcSpans; + + /** + * Creates a new un-configured {@link MediaWikiParserFactory}. + */ + public MediaWikiParserFactory() + { + initVariables(); + } + + /** + * Creates a fully configured {@link MediaWikiParserFactory} for the specified + * {@link Language}.<br> + * Next step is {@link MediaWikiParserFactory#createParser()}. + */ + public MediaWikiParserFactory(Language language) + { + initVariables(); + if (language.equals(Language.german)) { + initGermanVariables(); + } + else if (language.equals(Language.english)) { + initEnglishVariables(); + } + else { + logger.warn("No language specific parser for '{}' available. 
Using default values.", + language); + } + } + + private void initVariables() + { + lineSeparator = "LF"; + parserClass = ModularParser.class; + imageIdentifers = new ArrayList<>(); + categoryIdentifers = new ArrayList<>(); + languageIdentifers = new ArrayList<>(); + deleteTemplates = new ArrayList<>(); + parseTemplates = new ArrayList<>(); + showImageText = false; + deleteTags = true; + showMathTagContent = true; + calculateSrcSpans = false; + templateParserClass = ShowTemplateNamesAndParameters.class; + + initLanguages(); + } + + private void initLanguages() + { + // Init the Languages... + languageIdentifers.add("aa"); + languageIdentifers.add("ab"); + languageIdentifers.add("af"); + languageIdentifers.add("am"); + languageIdentifers.add("an"); + languageIdentifers.add("ar"); + languageIdentifers.add("as"); + languageIdentifers.add("av"); + languageIdentifers.add("ay"); + languageIdentifers.add("az"); + + languageIdentifers.add("ba"); + languageIdentifers.add("be"); + languageIdentifers.add("bg"); + languageIdentifers.add("bh"); + languageIdentifers.add("bi"); + languageIdentifers.add("bm"); + languageIdentifers.add("bn"); + languageIdentifers.add("bo"); + languageIdentifers.add("br"); + languageIdentifers.add("bs"); + + languageIdentifers.add("ca"); + languageIdentifers.add("ce"); + languageIdentifers.add("ch"); + languageIdentifers.add("co"); + languageIdentifers.add("cr"); + languageIdentifers.add("cs"); + languageIdentifers.add("cv"); + languageIdentifers.add("cy"); + + languageIdentifers.add("da"); + languageIdentifers.add("de"); + languageIdentifers.add("dk"); + languageIdentifers.add("dv"); + languageIdentifers.add("dz"); + + languageIdentifers.add("ee"); + languageIdentifers.add("el"); + languageIdentifers.add("en"); + languageIdentifers.add("eo"); + languageIdentifers.add("es"); + languageIdentifers.add("et"); + languageIdentifers.add("eu"); + + languageIdentifers.add("fa"); + languageIdentifers.add("ff"); + languageIdentifers.add("fi"); + languageIdentifers.add("fj"); + languageIdentifers.add("fo"); + languageIdentifers.add("fr"); + languageIdentifers.add("fy"); + + languageIdentifers.add("ga"); + languageIdentifers.add("gd"); + languageIdentifers.add("gl"); + languageIdentifers.add("gn"); + languageIdentifers.add("gu"); + languageIdentifers.add("gv"); + + languageIdentifers.add("ha"); + languageIdentifers.add("he"); + languageIdentifers.add("hi"); + languageIdentifers.add("hr"); + languageIdentifers.add("ht"); + languageIdentifers.add("hu"); + languageIdentifers.add("hy"); + + languageIdentifers.add("ia"); + languageIdentifers.add("id"); + languageIdentifers.add("ie"); + languageIdentifers.add("ig"); + languageIdentifers.add("ii"); + languageIdentifers.add("ik"); + languageIdentifers.add("io"); + languageIdentifers.add("is"); + languageIdentifers.add("it"); + languageIdentifers.add("iu"); + + languageIdentifers.add("ja"); + languageIdentifers.add("jv"); + + languageIdentifers.add("ka"); + languageIdentifers.add("kg"); + languageIdentifers.add("ki"); + languageIdentifers.add("kk"); + languageIdentifers.add("kl"); + languageIdentifers.add("km"); + languageIdentifers.add("kn"); + languageIdentifers.add("ko"); + languageIdentifers.add("ks"); + languageIdentifers.add("ku"); + languageIdentifers.add("kv"); + languageIdentifers.add("kw"); + languageIdentifers.add("ky"); + + languageIdentifers.add("la"); + languageIdentifers.add("lb"); + languageIdentifers.add("li"); + languageIdentifers.add("ln"); + languageIdentifers.add("lo"); + languageIdentifers.add("lt"); + 
languageIdentifers.add("lv"); + + languageIdentifers.add("mg"); + languageIdentifers.add("mh"); + languageIdentifers.add("mi"); + languageIdentifers.add("mk"); + languageIdentifers.add("ml"); + languageIdentifers.add("mn"); + languageIdentifers.add("mo"); + languageIdentifers.add("mr"); + languageIdentifers.add("ms"); + languageIdentifers.add("mt"); + languageIdentifers.add("my"); + + languageIdentifers.add("na"); + languageIdentifers.add("nb"); + languageIdentifers.add("ne"); + languageIdentifers.add("ng"); + languageIdentifers.add("nl"); + languageIdentifers.add("nn"); + languageIdentifers.add("no"); + languageIdentifers.add("nv"); + languageIdentifers.add("ny"); + + languageIdentifers.add("oc"); + languageIdentifers.add("os"); + languageIdentifers.add("pa"); + languageIdentifers.add("pl"); + languageIdentifers.add("ps"); + languageIdentifers.add("pt"); + + languageIdentifers.add("qu"); + + languageIdentifers.add("rm"); + languageIdentifers.add("rn"); + languageIdentifers.add("ro"); + languageIdentifers.add("ru"); + languageIdentifers.add("rw"); + + languageIdentifers.add("sa"); + languageIdentifers.add("sc"); + languageIdentifers.add("sd"); + languageIdentifers.add("se"); + languageIdentifers.add("sg"); + languageIdentifers.add("sh"); + languageIdentifers.add("si"); + languageIdentifers.add("sk"); + languageIdentifers.add("sl"); + languageIdentifers.add("sm"); + languageIdentifers.add("sn"); + languageIdentifers.add("so"); + languageIdentifers.add("sq"); + languageIdentifers.add("sr"); + languageIdentifers.add("ss"); + languageIdentifers.add("st"); + languageIdentifers.add("su"); + languageIdentifers.add("sv"); + languageIdentifers.add("sw"); + + languageIdentifers.add("ta"); + languageIdentifers.add("te"); + languageIdentifers.add("tg"); + languageIdentifers.add("th"); + languageIdentifers.add("ti"); + languageIdentifers.add("tk"); + languageIdentifers.add("tl"); + languageIdentifers.add("tn"); + languageIdentifers.add("to"); + languageIdentifers.add("tr"); + languageIdentifers.add("ts"); + languageIdentifers.add("tt"); + languageIdentifers.add("tw"); + languageIdentifers.add("ty"); + + languageIdentifers.add("ug"); + languageIdentifers.add("uk"); + languageIdentifers.add("ur"); + languageIdentifers.add("uz"); + + languageIdentifers.add("ve"); + languageIdentifers.add("vi"); + languageIdentifers.add("vo"); + + languageIdentifers.add("wa"); + languageIdentifers.add("wo"); + + languageIdentifers.add("xh"); + + languageIdentifers.add("yi"); + languageIdentifers.add("yo"); + + languageIdentifers.add("za"); + languageIdentifers.add("zh"); + languageIdentifers.add("zu"); + + languageIdentifers.add("als"); + languageIdentifers.add("ang"); + languageIdentifers.add("arc"); + languageIdentifers.add("ast"); + languageIdentifers.add("bug"); + languageIdentifers.add("ceb"); + languageIdentifers.add("chr"); + languageIdentifers.add("chy"); + languageIdentifers.add("csb"); + languageIdentifers.add("frp"); + languageIdentifers.add("fur"); + languageIdentifers.add("got"); + languageIdentifers.add("haw"); + languageIdentifers.add("ilo"); + languageIdentifers.add("jbo"); + languageIdentifers.add("ksh"); + languageIdentifers.add("lad"); + languageIdentifers.add("lmo"); + languageIdentifers.add("nah"); + languageIdentifers.add("nap"); + languageIdentifers.add("nds"); + languageIdentifers.add("nrm"); + languageIdentifers.add("pam"); + languageIdentifers.add("pap"); + languageIdentifers.add("pdc"); + languageIdentifers.add("pih"); + languageIdentifers.add("pms"); + languageIdentifers.add("rmy"); + 
languageIdentifers.add("scn"); + languageIdentifers.add("sco"); + languageIdentifers.add("tet"); + languageIdentifers.add("tpi"); + languageIdentifers.add("tum"); + languageIdentifers.add("udm"); + languageIdentifers.add("vec"); + languageIdentifers.add("vls"); + languageIdentifers.add("war"); + languageIdentifers.add("xal"); + + languageIdentifers.add("simple"); + } + + private void initGermanVariables() + { + templateParserClass = FlushTemplates.class; + // deleteTemplates.add( "Prettytable" ); + // parseTemplates.add( "Dieser Artikel" ); + // parseTemplates.add( "Audio" ); + // parseTemplates.add( "Video" ); + imageIdentifers.add("Bild"); + imageIdentifers.add("Image"); + imageIdentifers.add("Datei"); + categoryIdentifers.add("Kategorie"); + languageIdentifers.remove("de"); + } + + private void initEnglishVariables() + { + templateParserClass = FlushTemplates.class; + + imageIdentifers.add("Image"); + imageIdentifers.add("File"); + imageIdentifers.add("media"); + categoryIdentifers.add("Category"); + languageIdentifers.remove("en"); + } + + private String resolveLineSeparator() + { + if (lineSeparator.equals("CRLF")) { + return "\r\n"; + } + if (lineSeparator.equals("LF")) { + return "\n"; + } + + logger.error("LineSeparator is UNKNOWN: \"" + lineSeparator + "\"\n" + + "Set LineSeparator to \"LF\" or \"CRLF\" for a Error free configuration"); + + return lineSeparator; + } + + /** + * Creates a MediaWikiParser with the configurations which has been set. + */ + public MediaWikiParser createParser() + { + logger.debug("Selected Parser: {}", parserClass); + + if (parserClass == ModularParser.class) { + ModularParser mwgp = new ModularParser( + // resolveLineSeparator(), + "\n", languageIdentifers, categoryIdentifers, imageIdentifers, showImageText, + deleteTags, showMathTagContent, calculateSrcSpans, null); + + StringBuilder sb = new StringBuilder(); + sb.append(lineSeparator + "languageIdentifers: "); + for (String s : languageIdentifers) { + sb.append(s + " "); + } + sb.append(lineSeparator + "categoryIdentifers: "); + for (String s : categoryIdentifers) { + sb.append(s + " "); + } + sb.append(lineSeparator + "imageIdentifers: "); + for (String s : imageIdentifers) { + sb.append(s + " "); + } + logger.debug(sb.toString()); + + MediaWikiTemplateParser mwtp; + + logger.debug("Selected TemplateParser: {}", templateParserClass); + if (templateParserClass == GermanTemplateParser.class) { + for (String s : deleteTemplates) { + logger.debug("DeleteTemplate: '{}'", s); + } + for (String s : parseTemplates) { + logger.debug("ParseTemplate: '{}'", s); + } + mwtp = new GermanTemplateParser(mwgp, deleteTemplates, parseTemplates); + } + else if (templateParserClass == FlushTemplates.class) { + mwtp = new FlushTemplates(); + } + else if (templateParserClass == ShowTemplateNamesAndParameters.class) { + mwtp = new ShowTemplateNamesAndParameters(); + } + else { + logger.error("TemplateParser Class Not Found!"); + return null; + } + + mwgp.setTemplateParser(mwtp); + + return mwgp; } - for (String s : parseTemplates) { - logger.debug("ParseTemplate: '{}'", s); + else { + logger.error("Parser Class Not Found!"); + return null; } - mwtp = new GermanTemplateParser(mwgp, deleteTemplates, parseTemplates); - } else if (templateParserClass == FlushTemplates.class) { - mwtp = new FlushTemplates(); - } else if (templateParserClass == ShowTemplateNamesAndParameters.class) { - mwtp = new ShowTemplateNamesAndParameters(); - } else { - logger.error("TemplateParser Class Not Found!"); - return null; - } - - 
mwgp.setTemplateParser(mwtp); - - return mwgp; - } else { - logger.error("Parser Class Not Found!"); - return null; - } - } - - /** - * Adds a Template which should be deleted while the parsing process. - */ - public void addDeleteTemplate(String deleteTemplate) { - deleteTemplates.add(deleteTemplate); - } - - /** - * Adds a Template which should be "parsed" while the parsing process. - */ - public void addParseTemplate(String parseTemplate) { - parseTemplates.add(parseTemplate); - } - - /** - * Retuns the Class of the selected Parser. - */ - public Class getParserClass() { - return parserClass; - } - - /** - * Set the Parser which should be configurated and returned by createParser(). - */ - public void setParserClass(Class parserClass) { - this.parserClass = parserClass; - } - - /** - * Returns the Class of the selected TemplateParser. - */ - public Class getTemplateParserClass() { - return templateParserClass; - } - - /** - * Set the Parser which should be used for Template parsing. - */ - public void setTemplateParserClass(Class templateParserClass) { - this.templateParserClass = templateParserClass; - } - - /** - * Retuns the List of templates which should be deleted in the parseing process. - */ - public List<String> getDeleteTemplates() { - return deleteTemplates; - } - - /** - * Set the List of templates which should be deleted in the parseing process. - */ - public void setDeleteTemplates(List<String> deleteTemplates) { - this.deleteTemplates = deleteTemplates; - } - - /** - * Returns the CharSequence/String which should be used as line separator. - */ - public String getLineSeparator() { - return lineSeparator; - } - - /** - * Sets the CharSequence/String which should be used as line separator. - */ - public void setLineSeparator(String lineSeparator) { - this.lineSeparator = lineSeparator; - } - - /** - * Returns the List of templates which should be "parsed" in the parseing process. - */ - public List<String> getParseTemplates() { - return parseTemplates; - } - - /** - * Sets the List of templates which should be "parsed" in the parseing process. - */ - public void setParseTemplates(List<String> parseTemplates) { - this.parseTemplates = parseTemplates; - } - - /** - * Returns the List of Strings which are used to specifiy that a link is a link to a - * wikipedia i another language. - */ - public List<String> getLanguageIdentifers() { - return languageIdentifers; - } - - /** - * Sets the list of language identifiers. - */ - public void setLanguageIdentifers(List<String> languageIdentifers) { - this.languageIdentifers = languageIdentifers; - } - - /** - * Returns the List of Strings which are used to specifiy that a link is a link to a - * cathegory. E.g. in german "Kathegorie" is used. But it could be usefull to use more - * than one identifier, mainly the english identifier "cathegory" should be used too. - */ - public List<String> getCategoryIdentifers() { - return categoryIdentifers; - } - - /** - * Set the list of cathegory identifers. - */ - public void setCategoryIdentifers(List<String> categoryIdentifers) { - this.categoryIdentifers = categoryIdentifers; - } - - /** - * Returns the List of Strings which are used to specifiy that a link is an Image. - */ - public List<String> getImageIdentifers() { - return imageIdentifers; - } - - /** - * Sets the image identifer list. - */ - public void setImageIdentifers(List<String> imageIdentifers) { - this.imageIdentifers = imageIdentifers; - } - - /** - * Returns if the Parser should show the Text of an Image, or delete it. 
If the Text is deleted, - * it will be added as a Parameter to the Link. - * - * @return true, if the Text should be shown. - */ - public boolean getShowImageText() { - return showImageText; - } - - /** - * Sets if the Parser should show the Text of an Image, or delete it. - */ - public void setShowImageText(boolean showImageText) { - this.showImageText = showImageText; - } - - /** - * Returns if < * > tags should be deleted or annotaded. - * - * @return true if the tags should be deleted. - */ - public boolean getDeleteTags() { - return deleteTags; - } - - /** - * Sets if < * > tags should be deleted or annotaded. - */ - public void setDeleteTags(boolean deleteTags) { - this.deleteTags = deleteTags; - } - - /** - * Retruns if the Content of math tags (<math><CONTENT/math>) should be deleted or - * annotated. - * - * @return true, if the tag content should be annotated. - */ - public boolean getShowMathTagContent() { - return showMathTagContent; - } - - /** - * Set if the Contetn of math tags should be deleted or annotated. - */ - public void setShowMathTagContent(boolean showMathTagContent) { - this.showMathTagContent = showMathTagContent; - } - - /** - * Returns if the Parser should calculate the positions in the original source of the elements - * which are parsed. - * - * @return true, if the positions should be calulated. - */ - public boolean getCalculateSrcSpans() { - return calculateSrcSpans; - } - - /** - * Sets if the Parser should calculate the positions in the original source of the elements - * which are parsed. - */ - public void setCalculateSrcSpans(boolean calculateSrcSpans) { - this.calculateSrcSpans = calculateSrcSpans; - } + } + + /** + * Adds a Template which should be deleted while the parsing process. + */ + public void addDeleteTemplate(String deleteTemplate) + { + deleteTemplates.add(deleteTemplate); + } + + /** + * Adds a Template which should be "parsed" while the parsing process. + */ + public void addParseTemplate(String parseTemplate) + { + parseTemplates.add(parseTemplate); + } + + /** + * Retuns the Class of the selected Parser. + */ + public Class getParserClass() + { + return parserClass; + } + + /** + * Set the Parser which should be configurated and returned by createParser(). + */ + public void setParserClass(Class parserClass) + { + this.parserClass = parserClass; + } + + /** + * Returns the Class of the selected TemplateParser. + */ + public Class getTemplateParserClass() + { + return templateParserClass; + } + + /** + * Set the Parser which should be used for Template parsing. + */ + public void setTemplateParserClass(Class templateParserClass) + { + this.templateParserClass = templateParserClass; + } + + /** + * Retuns the List of templates which should be deleted in the parseing process. + */ + public List<String> getDeleteTemplates() + { + return deleteTemplates; + } + + /** + * Set the List of templates which should be deleted in the parseing process. + */ + public void setDeleteTemplates(List<String> deleteTemplates) + { + this.deleteTemplates = deleteTemplates; + } + + /** + * Returns the CharSequence/String which should be used as line separator. + */ + public String getLineSeparator() + { + return lineSeparator; + } + + /** + * Sets the CharSequence/String which should be used as line separator. + */ + public void setLineSeparator(String lineSeparator) + { + this.lineSeparator = lineSeparator; + } + + /** + * Returns the List of templates which should be "parsed" in the parseing process. 
+     */
+    public List<String> getParseTemplates()
+    {
+        return parseTemplates;
+    }
+
+    /**
+     * Sets the List of templates which should be "parsed" in the parsing process.
+     */
+    public void setParseTemplates(List<String> parseTemplates)
+    {
+        this.parseTemplates = parseTemplates;
+    }
+
+    /**
+     * Returns the List of Strings which are used to specify that a link is a link to a Wikipedia in
+     * another language.
+     */
+    public List<String> getLanguageIdentifers()
+    {
+        return languageIdentifers;
+    }
+
+    /**
+     * Sets the list of language identifiers.
+     */
+    public void setLanguageIdentifers(List<String> languageIdentifers)
+    {
+        this.languageIdentifers = languageIdentifers;
+    }
+
+    /**
+     * Returns the List of Strings which are used to specify that a link is a link to a category.
+     * E.g. in German "Kategorie" is used. But it could be useful to use more than one identifier,
+     * mainly the English identifier "Category" should be used too.
+     */
+    public List<String> getCategoryIdentifers()
+    {
+        return categoryIdentifers;
+    }
+
+    /**
+     * Set the list of category identifiers.
+     */
+    public void setCategoryIdentifers(List<String> categoryIdentifers)
+    {
+        this.categoryIdentifers = categoryIdentifers;
+    }
+
+    /**
+     * Returns the List of Strings which are used to specify that a link is an Image.
+     */
+    public List<String> getImageIdentifers()
+    {
+        return imageIdentifers;
+    }
+
+    /**
+     * Sets the image identifier list.
+     */
+    public void setImageIdentifers(List<String> imageIdentifers)
+    {
+        this.imageIdentifers = imageIdentifers;
+    }
+
+    /**
+     * Returns if the Parser should show the Text of an Image, or delete it. If the Text is deleted,
+     * it will be added as a Parameter to the Link.
+     *
+     * @return true, if the Text should be shown.
+     */
+    public boolean getShowImageText()
+    {
+        return showImageText;
+    }
+
+    /**
+     * Sets if the Parser should show the Text of an Image, or delete it.
+     */
+    public void setShowImageText(boolean showImageText)
+    {
+        this.showImageText = showImageText;
+    }
+
+    /**
+     * Returns if < * > tags should be deleted or annotated.
+     *
+     * @return true if the tags should be deleted.
+     */
+    public boolean getDeleteTags()
+    {
+        return deleteTags;
+    }
+
+    /**
+     * Sets if < * > tags should be deleted or annotated.
+     */
+    public void setDeleteTags(boolean deleteTags)
+    {
+        this.deleteTags = deleteTags;
+    }
+
+    /**
+     * Returns if the Content of math tags (<math><CONTENT/math>) should be deleted or
+     * annotated.
+     *
+     * @return true, if the tag content should be annotated.
+     */
+    public boolean getShowMathTagContent()
+    {
+        return showMathTagContent;
+    }
+
+    /**
+     * Set if the Content of math tags should be deleted or annotated.
+     */
+    public void setShowMathTagContent(boolean showMathTagContent)
+    {
+        this.showMathTagContent = showMathTagContent;
+    }
+
+    /**
+     * Returns if the Parser should calculate the positions in the original source of the elements
+     * which are parsed.
+     *
+     * @return true, if the positions should be calculated.
+     */
+    public boolean getCalculateSrcSpans()
+    {
+        return calculateSrcSpans;
+    }
+
+    /**
+     * Sets if the Parser should calculate the positions in the original source of the elements
+     * which are parsed.
+     */
+    public void setCalculateSrcSpans(boolean calculateSrcSpans)
+    {
+        this.calculateSrcSpans = calculateSrcSpans;
+    }
 }
diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiTemplateParser.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiTemplateParser.java
index f14cf580..09a5b763 100644
--- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiTemplateParser.java
+++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/MediaWikiTemplateParser.java
@@ -21,23 +21,21 @@
 import org.dkpro.jwpl.parser.Template;
 
 /**
- * Because template parsing is a special task, it is usesfull to use
- * a special parser.
+ * Because template parsing is a special task, it is useful to use a special parser.
  */
-public interface MediaWikiTemplateParser {
+public interface MediaWikiTemplateParser
+{
 
-  /**
-   * Takes a Template and do whatever is required for handling this Template.
-   * It is possible to delete this template, to parse it to e.g a Link or
-   * to return MediaWiki code which can be parsed by a MediaWiki parser.<br>
-   * If you are interested how this works, you shoud read the documentation
-   * of ResolvedTemplate.
-   */
-  ResolvedTemplate parseTemplate(Template t, ParsedPage pp);
+    /**
+     * Takes a Template and does whatever is required for handling this Template. It is possible to
+     * delete this template, to parse it to e.g. a Link or to return MediaWiki code which can be
+     * parsed by a MediaWiki parser.<br>
+     * If you are interested in how this works, you should read the documentation of ResolvedTemplate.
+     */
+    ResolvedTemplate parseTemplate(Template t, ParsedPage pp);
 
-  /**
-   * Returns some information about what the TemplateParser does am how
-   * it is configurated.
-   */
-  String configurationInfo();
+    /**
+     * Returns some information about what the TemplateParser does and how it is configured.
+     */
+    String configurationInfo();
 }
diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ModularParser.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ModularParser.java
index e8bce845..19337bb5 100644
--- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ModularParser.java
+++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ModularParser.java
@@ -17,7 +17,6 @@
  */
 package org.dkpro.jwpl.parser.mediawiki;
 
-
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.LinkedList;
@@ -44,1807 +43,1848 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-
 /**
  * This is a parser for MediaWiki Source.
  * <p>
 * It exist a {@link MediaWikiParserFactory}, to get an instance of this Parser.<br>
  */
-public class ModularParser implements MediaWikiParser,
-        MediaWikiContentElementParser {
-
-  private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
-
-  // Options, set by the ParserFactory
-  private String lineSeparator;
-  private List<String> categoryIdentifers;
-  private List<String> languageIdentifers;
-  private List<String> imageIdentifers;
-  private MediaWikiTemplateParser templateParser;
-  private boolean showImageText = false;
-  private boolean deleteTags = true;
-  private boolean showMathTagContent = true;
-  private boolean calculateSrcSpans = true;
-
-  /**
-   * Creates a un-configured {@link ModularParser}...
-   */
-  public ModularParser() {
-  }
-
-  /**
-   * Creates a fully configured {@link ModularParser}...
- */ - public ModularParser(String lineSeparator, List<String> languageIdentifers, - List<String> categoryIdentifers, List<String> imageIdentifers, - boolean showImageText, boolean deleteTags, - boolean showMathTagContent, boolean calculateSrcSpans, - MediaWikiTemplateParser templateParser) { - - setLineSeparator(lineSeparator); - setLanguageIdentifers(languageIdentifers); - setCategoryIdentifers(categoryIdentifers); - setImageIdentifers(imageIdentifers); - setShowImageText(showImageText); - setDeleteTags(deleteTags); - setShowMathTagContent(showMathTagContent); - setCalculateSrcSpans(calculateSrcSpans); - setTemplateParser(templateParser); - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - @Override - public String getLineSeparator() { - return lineSeparator; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setLineSeparator(String lineSeparator) { - this.lineSeparator = lineSeparator; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public List<String> getLanguageIdentifers() { - return languageIdentifers; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setLanguageIdentifers(List<String> languageIdentifers) { - this.languageIdentifers = listToLowerCase(languageIdentifers); - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public List<String> getCategoryIdentifers() { - return categoryIdentifers; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setCategoryIdentifers(List<String> categoryIdentifers) { - this.categoryIdentifers = listToLowerCase(categoryIdentifers); - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public List<String> getImageIdentifers() { - return imageIdentifers; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setImageIdentifers(List<String> imageIdentifers) { - this.imageIdentifers = listToLowerCase(imageIdentifers); - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public MediaWikiTemplateParser getTemplateParser() { - return templateParser; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setTemplateParser(MediaWikiTemplateParser templateParser) { - this.templateParser = templateParser; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public boolean showImageText() { - return showImageText; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setShowImageText(boolean showImageText) { - this.showImageText = showImageText; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public boolean deleteTags() { - return deleteTags; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setDeleteTags(boolean deleteTags) { - this.deleteTags = deleteTags; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public boolean showMathTagContent() { - return showMathTagContent; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setShowMathTagContent(boolean showMathTagContent) { - this.showMathTagContent = showMathTagContent; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... 
- */ - public boolean calculateSrcSpans() { - return calculateSrcSpans; - } - - /** - * Look at {@link MediaWikiParserFactory} for a description... - */ - public void setCalculateSrcSpans(boolean calculateSrcSpans) { - this.calculateSrcSpans = calculateSrcSpans; - } - - /** - * Converts a List of Strings to lower case Strings. - */ - private List<String> listToLowerCase(List<String> l) { - List<String> result = new ArrayList<>(); - for (String s : l) { - result.add(s.toLowerCase()); - } - return result; - } - - /** - * Look at the MediaWikiParser interface for a description... - */ - @Override - public String configurationInfo() { - StringBuilder result = new StringBuilder(); - - result.append("MediaWikiParser configuration:\n"); - result.append("ParserClass: " + this.getClass() + "\n"); - result.append("ShowImageText: " + showImageText + "\n"); - result.append("DeleteTags: " + deleteTags + "\n"); - result.append("ShowMathTagContent: " + showMathTagContent + "\n"); - result.append("CalculateSrcSpans: " + calculateSrcSpans + "\n"); - - result.append("LanguageIdentifers: "); - for (String s : languageIdentifers) { - result.append(s + " "); - } - result.append("\n"); +public class ModularParser + implements MediaWikiParser, MediaWikiContentElementParser +{ + + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); + + // Options, set by the ParserFactory + private String lineSeparator; + private List<String> categoryIdentifers; + private List<String> languageIdentifers; + private List<String> imageIdentifers; + private MediaWikiTemplateParser templateParser; + private boolean showImageText = false; + private boolean deleteTags = true; + private boolean showMathTagContent = true; + private boolean calculateSrcSpans = true; + + /** + * Creates a un-configured {@link ModularParser}... + */ + public ModularParser() + { + } + + /** + * Creates a fully configured {@link ModularParser}... + */ + public ModularParser(String lineSeparator, List<String> languageIdentifers, + List<String> categoryIdentifers, List<String> imageIdentifers, boolean showImageText, + boolean deleteTags, boolean showMathTagContent, boolean calculateSrcSpans, + MediaWikiTemplateParser templateParser) + { + + setLineSeparator(lineSeparator); + setLanguageIdentifers(languageIdentifers); + setCategoryIdentifers(categoryIdentifers); + setImageIdentifers(imageIdentifers); + setShowImageText(showImageText); + setDeleteTags(deleteTags); + setShowMathTagContent(showMathTagContent); + setCalculateSrcSpans(calculateSrcSpans); + setTemplateParser(templateParser); + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + @Override + public String getLineSeparator() + { + return lineSeparator; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setLineSeparator(String lineSeparator) + { + this.lineSeparator = lineSeparator; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public List<String> getLanguageIdentifers() + { + return languageIdentifers; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setLanguageIdentifers(List<String> languageIdentifers) + { + this.languageIdentifers = listToLowerCase(languageIdentifers); + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... 
+ */ + public List<String> getCategoryIdentifers() + { + return categoryIdentifers; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setCategoryIdentifers(List<String> categoryIdentifers) + { + this.categoryIdentifers = listToLowerCase(categoryIdentifers); + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public List<String> getImageIdentifers() + { + return imageIdentifers; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setImageIdentifers(List<String> imageIdentifers) + { + this.imageIdentifers = listToLowerCase(imageIdentifers); + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public MediaWikiTemplateParser getTemplateParser() + { + return templateParser; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setTemplateParser(MediaWikiTemplateParser templateParser) + { + this.templateParser = templateParser; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public boolean showImageText() + { + return showImageText; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setShowImageText(boolean showImageText) + { + this.showImageText = showImageText; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public boolean deleteTags() + { + return deleteTags; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setDeleteTags(boolean deleteTags) + { + this.deleteTags = deleteTags; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public boolean showMathTagContent() + { + return showMathTagContent; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setShowMathTagContent(boolean showMathTagContent) + { + this.showMathTagContent = showMathTagContent; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public boolean calculateSrcSpans() + { + return calculateSrcSpans; + } + + /** + * Look at {@link MediaWikiParserFactory} for a description... + */ + public void setCalculateSrcSpans(boolean calculateSrcSpans) + { + this.calculateSrcSpans = calculateSrcSpans; + } + + /** + * Converts a List of Strings to lower case Strings. + */ + private List<String> listToLowerCase(List<String> l) + { + List<String> result = new ArrayList<>(); + for (String s : l) { + result.add(s.toLowerCase()); + } + return result; + } + + /** + * Look at the MediaWikiParser interface for a description... 
+ */ + @Override + public String configurationInfo() + { + StringBuilder result = new StringBuilder(); + + result.append("MediaWikiParser configuration:\n"); + result.append("ParserClass: " + this.getClass() + "\n"); + result.append("ShowImageText: " + showImageText + "\n"); + result.append("DeleteTags: " + deleteTags + "\n"); + result.append("ShowMathTagContent: " + showMathTagContent + "\n"); + result.append("CalculateSrcSpans: " + calculateSrcSpans + "\n"); + + result.append("LanguageIdentifers: "); + for (String s : languageIdentifers) { + result.append(s + " "); + } + result.append("\n"); - result.append("CategoryIdentifers: "); - for (String s : categoryIdentifers) { - result.append(s + " "); - } - result.append("\n"); + result.append("CategoryIdentifers: "); + for (String s : categoryIdentifers) { + result.append(s + " "); + } + result.append("\n"); + + result.append("ImageIdentifers: "); + for (String s : imageIdentifers) { + result.append(s + " "); + } + result.append("\n"); - result.append("ImageIdentifers: "); - for (String s : imageIdentifers) { - result.append(s + " "); + result.append("TemplateParser: " + templateParser.getClass() + "\n"); + result.append(templateParser.configurationInfo()); + + return result.toString(); } - result.append("\n"); - result.append("TemplateParser: " + templateParser.getClass() + "\n"); - result.append(templateParser.configurationInfo()); + /** + * Checks if the configuration is runnable. + */ + private boolean runConfig() + { + if (lineSeparator == null) { + logger.debug("Set lineSeparator"); + return false; + } + if (categoryIdentifers == null) { + logger.warn("Set categoryIdentifers"); + return false; + } + if (languageIdentifers == null) { + logger.warn("Set languageIdentifers"); + return false; + } + if (imageIdentifers == null) { + logger.warn("Set imageIdentifers"); + return false; + } + if (templateParser == null) { + logger.warn("Set templateParser"); + return false; + } + return true; + } + + /** + * Look at the {@link MediaWikiParser} for a description... + */ + @Override + public ParsedPage parse(String src) + { + // check if the configuration is runnable. + if (!runConfig()) { + return null; + } - return result.toString(); - } + // check if the is something to parse. sometimes there is an empty string + // due to an error of other classes... + if (src == null || src.length() == 0) { + return null; + } - /** - * Checks if the configuration is runnable. - */ - private boolean runConfig() { - if (lineSeparator == null) { - logger.debug("Set lineSeparator"); - return false; - } - if (categoryIdentifers == null) { - logger.warn("Set categoryIdentifers"); - return false; - } - if (languageIdentifers == null) { - logger.warn("Set languageIdentifers"); - return false; - } - if (imageIdentifers == null) { - logger.warn("Set imageIdentifers"); - return false; - } - if (templateParser == null) { - logger.warn("Set templateParser"); - return false; - } - return true; - } - - /** - * Look at the {@link MediaWikiParser} for a description... - */ - @Override - public ParsedPage parse(String src) { - // check if the configuration is runnable. - if (!runConfig()) { - return null; - } + // creates a new span manager with the given source, appending a newline + // to avoid errors. + SpanManager sm = new SpanManager(src.replace('\t', ' ') + lineSeparator); + if (calculateSrcSpans) { + sm.enableSrcPosCalculation(); + } - // check if the is something to parse. sometimes there is an empty string - // due to an error of other classes... 
- if (src == null || src.length() == 0) { - return null; - } + // Creating a new ParsePage, which will be filled with information in + // the parseing process. + ParsedPage ppResult = new ParsedPage(); - // creates a new span manager with the given source, appending a newline - // to avoid errors. - SpanManager sm = new SpanManager(src.replace('\t', ' ') + lineSeparator); - if (calculateSrcSpans) { - sm.enableSrcPosCalculation(); - } + // Creating a new Parameter Container + ContentElementParsingParameters cepp = new ContentElementParsingParameters(); - // Creating a new ParsePage, which will be filled with information in - // the parseing process. - ParsedPage ppResult = new ParsedPage(); + // Deletes comments out of the Source + deleteComments(sm); - // Creating a new Parameter Container - ContentElementParsingParameters cepp = new ContentElementParsingParameters(); + // Deletes any TOC Tags, these are not usesd in this parser. + deleteTOCTag(sm); - // Deletes comments out of the Source - deleteComments(sm); + // Removing the Content which should not parsed but integrated later in + // the resulting text + sm.manageList(cepp.noWikiSpans); + parseSpecifiedTag(sm, cepp.noWikiSpans, cepp.noWikiStrings, "PRE", " "); + parseSpecifiedTag(sm, cepp.noWikiSpans, cepp.noWikiStrings, "NOWIKI"); + if (cepp.noWikiSpans.size() == 0) { + sm.removeManagedList(cepp.noWikiSpans); + } - // Deletes any TOC Tags, these are not usesd in this parser. - deleteTOCTag(sm); + // Parseing the Math Tags... + sm.manageList(cepp.mathSpans); + parseSpecifiedTag(sm, cepp.mathSpans, cepp.mathStrings, "MATH"); + if (cepp.mathSpans.size() == 0) { + sm.removeManagedList(cepp.mathSpans); + } - // Removing the Content which should not parsed but integrated later in - // the resulting text - sm.manageList(cepp.noWikiSpans); - parseSpecifiedTag(sm, cepp.noWikiSpans, cepp.noWikiStrings, "PRE", " "); - parseSpecifiedTag(sm, cepp.noWikiSpans, cepp.noWikiStrings, "NOWIKI"); - if (cepp.noWikiSpans.size() == 0) { - sm.removeManagedList(cepp.noWikiSpans); - } + // Parseing the Templates (the Span List will be added to the managed + // lists by the function) + parseTemplates(sm, cepp.templateSpans, cepp.templates, ppResult); - // Parseing the Math Tags... - sm.manageList(cepp.mathSpans); - parseSpecifiedTag(sm, cepp.mathSpans, cepp.mathStrings, "MATH"); - if (cepp.mathSpans.size() == 0) { - sm.removeManagedList(cepp.mathSpans); - } + // Parsing all other Tags + parseTags(sm, cepp.tagSpans); - // Parseing the Templates (the Span List will be added to the managed - // lists by the function) - parseTemplates(sm, cepp.templateSpans, cepp.templates, ppResult); + // Converting <gallery>s to normal Images, this is not beautiful, but + // a simple solution.. + convertGalleriesToImages(sm, cepp.tagSpans); - // Parsing all other Tags - parseTags(sm, cepp.tagSpans); + // Parsing Links and Images. + parseImagesAndInternalLinks(sm, cepp.linkSpans, cepp.links); - // Converting <gallery>s to normal Images, this is not beautiful, but - // a simple solution.. - convertGalleriesToImages(sm, cepp.tagSpans); + // Creating a list of Line Spans to work with lines in the following + // functions + LinkedList<Span> lineSpans = new LinkedList<>(); + getLineSpans(sm, lineSpans); - // Parsing Links and Images. - parseImagesAndInternalLinks(sm, cepp.linkSpans, cepp.links); + // Removing the Category Links from the Links list, and crating an + // ContentElement for these links... 
+ ppResult.setCategoryElement( + getSpecialLinks(sm, cepp.linkSpans, cepp.links, " - ", categoryIdentifers)); - // Creating a list of Line Spans to work with lines in the following - // functions - LinkedList<Span> lineSpans = new LinkedList<>(); - getLineSpans(sm, lineSpans); + // Removing the Language Links from the Links list, and crating an + // ContentElement for these links... + ppResult.setLanguagesElement( + getSpecialLinks(sm, cepp.linkSpans, cepp.links, " - ", languageIdentifers)); - // Removing the Category Links from the Links list, and crating an - // ContentElement for these links... - ppResult.setCategoryElement(getSpecialLinks(sm, cepp.linkSpans, - cepp.links, " - ", categoryIdentifers)); + // Parsing and Setting the Sections... the main work is done in parse + // sections! + ppResult.setSections( + EmptyStructureRemover.eliminateEmptyStructures(parseSections(sm, cepp, lineSpans))); - // Removing the Language Links from the Links list, and crating an - // ContentElement for these links... - ppResult.setLanguagesElement(getSpecialLinks(sm, cepp.linkSpans, - cepp.links, " - ", languageIdentifers)); + // Finding and Setting the paragraph which is concidered as the "First" + setFirstParagraph(ppResult); - // Parsing and Setting the Sections... the main work is done in parse - // sections! - ppResult.setSections(EmptyStructureRemover - .eliminateEmptyStructures(parseSections(sm, cepp, lineSpans))); + // check the calculated source positions, and reset them if necessary. + if (calculateSrcSpans) { + SrcPosRangeChecker.checkRange(ppResult); + } - // Finding and Setting the paragraph which is concidered as the "First" - setFirstParagraph(ppResult); + // So it is done... + return ppResult; + } + + /** + * Deleting all comments out of the SpanManager...<br> + * <!-- COMMENT --> + */ + private void deleteComments(SpanManager sm) + { + int start = 0; + while ((start = sm.indexOf("<!--", start)) != -1) { + int end = sm.indexOf("-->", start + 4) + 3; + if (end == -1 + 3) { + end = sm.length(); + } - // check the calculated source positions, and reset them if necessary. - if (calculateSrcSpans) { - SrcPosRangeChecker.checkRange(ppResult); - } + // Remove the one lineSeparator too, if the whole line is a comment! + try { + if (lineSeparator.equals(sm.substring(start - lineSeparator.length(), start)) + && lineSeparator.equals(sm.substring(end, end + lineSeparator.length()))) { + end += lineSeparator.length(); + } + } + catch (IndexOutOfBoundsException e) { + } - // So it is done... - return ppResult; - } - - - /** - * Deleting all comments out of the SpanManager...<br> - * <!-- COMMENT --> - */ - private void deleteComments(SpanManager sm) { - int start = 0; - while ((start = sm.indexOf("<!--", start)) != -1) { - int end = sm.indexOf("-->", start + 4) + 3; - if (end == -1 + 3) { - end = sm.length(); - } - - // Remove the one lineSeparator too, if the whole line is a comment! 
- try { - if (lineSeparator.equals(sm.substring(start - - lineSeparator.length(), start)) - && lineSeparator.equals(sm.substring(end, end - + lineSeparator.length()))) { - end += lineSeparator.length(); - } - } catch (IndexOutOfBoundsException e) { - } - - sm.delete(start, end); - } - } - - /** - * Deleteing ALL TOC Tags - */ - private void deleteTOCTag(SpanManager sm) { - // delete all __TOC__ from SRC - int temp = 0; - while ((temp = sm.indexOf("__TOC__", temp)) != -1) { - sm.delete(temp, temp + 2 + 3 + 2); + sm.delete(start, end); + } } - // delete all __NOTOC__ from SRC - temp = 0; - while ((temp = sm.indexOf("__NOTOC__", temp)) != -1) { - sm.delete(temp, temp + 2 + 5 + 2); - } - } - - private ContentElement getSpecialLinks(SpanManager sm, - List<Span> linkSpans, List<Link> links, String linkSpacer, - List<String> identifers) { - ContentElement result = new ContentElement(); - StringBuilder text = new StringBuilder(); - List<Link> localLinks = new ArrayList<>(); - - for (int i = links.size() - 1; i >= 0; i--) { - String identifer = getLinkNameSpace(links.get(i).getTarget()); - - if (identifer != null && identifers.indexOf(identifer) != -1) { - Link l = links.remove(i); - Span s = linkSpans.remove(i); - String linkText = sm.substring(s); - sm.delete(s); - l.setHomeElement(result); - s.adjust(-s.getStart() + text.length()); - text.append(linkText + linkSpacer); - localLinks.add(l); - //TODO add type? - } - } + /** + * Deleteing ALL TOC Tags + */ + private void deleteTOCTag(SpanManager sm) + { + // delete all __TOC__ from SRC + int temp = 0; + while ((temp = sm.indexOf("__TOC__", temp)) != -1) { + sm.delete(temp, temp + 2 + 3 + 2); + } - int len = text.length(); - if (len != 0) { - text.delete(len - linkSpacer.length(), len); + // delete all __NOTOC__ from SRC + temp = 0; + while ((temp = sm.indexOf("__NOTOC__", temp)) != -1) { + sm.delete(temp, temp + 2 + 5 + 2); + } } - result.setText(text.toString()); - result.setLinks(localLinks); + private ContentElement getSpecialLinks(SpanManager sm, List<Span> linkSpans, List<Link> links, + String linkSpacer, List<String> identifers) + { + ContentElement result = new ContentElement(); + StringBuilder text = new StringBuilder(); + List<Link> localLinks = new ArrayList<>(); + + for (int i = links.size() - 1; i >= 0; i--) { + String identifer = getLinkNameSpace(links.get(i).getTarget()); + + if (identifer != null && identifers.indexOf(identifer) != -1) { + Link l = links.remove(i); + Span s = linkSpans.remove(i); + String linkText = sm.substring(s); + sm.delete(s); + l.setHomeElement(result); + s.adjust(-s.getStart() + text.length()); + text.append(linkText + linkSpacer); + localLinks.add(l); + // TODO add type? 
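+ // (at this point the link text has been cut out of the page source, the link has been
+ // re-homed on the special ContentElement, and its span re-based onto that element's text)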
+ } + } + + int len = text.length(); + if (len != 0) { + text.delete(len - linkSpacer.length(), len); + } - if (result.empty()) { - return null; - } else { - return result; + result.setText(text.toString()); + result.setLinks(localLinks); + + if (result.empty()) { + return null; + } + else { + return result; + } } - } - private void getLineSpans(SpanManager sm, LinkedList<Span> lineSpans) { - sm.manageList(lineSpans); + private void getLineSpans(SpanManager sm, LinkedList<Span> lineSpans) + { + sm.manageList(lineSpans); - int start = 0; - int end; + int start = 0; + int end; - while ((end = sm.indexOf(lineSeparator, start)) != -1) { - lineSpans.add(new Span(start, end).trimTrail(sm)); - start = end + lineSeparator.length(); - } - lineSpans.add(new Span(start, sm.length()).trimTrail(sm)); + while ((end = sm.indexOf(lineSeparator, start)) != -1) { + lineSpans.add(new Span(start, end).trimTrail(sm)); + start = end + lineSeparator.length(); + } + lineSpans.add(new Span(start, sm.length()).trimTrail(sm)); - while (!lineSpans.isEmpty() && lineSpans.getFirst().length() == 0) { - lineSpans.removeFirst(); - } - while (!lineSpans.isEmpty() && lineSpans.getLast().length() == 0) { - lineSpans.removeLast(); + while (!lineSpans.isEmpty() && lineSpans.getFirst().length() == 0) { + lineSpans.removeFirst(); + } + while (!lineSpans.isEmpty() && lineSpans.getLast().length() == 0) { + lineSpans.removeLast(); + } } - } - private SectionContainer parseSections(SpanManager sm, - ContentElementParsingParameters cepp, LinkedList<Span> lineSpans) { + private SectionContainer parseSections(SpanManager sm, ContentElementParsingParameters cepp, + LinkedList<Span> lineSpans) + { - List<SectionContent> contentSections = new ArrayList<>(); + List<SectionContent> contentSections = new ArrayList<>(); - SectionContent sc = new SectionContent(1); + SectionContent sc = new SectionContent(1); - if (calculateSrcSpans) { - sc.setSrcSpan(new SrcSpan(sm.getSrcPos(lineSpans.getFirst() - .getStart()), -1)); + if (calculateSrcSpans) { + sc.setSrcSpan(new SrcSpan(sm.getSrcPos(lineSpans.getFirst().getStart()), -1)); + } + + // Identify the Line Type and call the necessary Function for the + // further handling... + while (!lineSpans.isEmpty()) { + + Span s = lineSpans.getFirst(); + + lineType t = getLineType(sm, s); + switch (t) { + case SECTION: + contentSections.add(sc); + int level = getSectionLevel(sm, s); + sc = new SectionContent( + parseContentElement(sm, cepp, + new Span(s.getStart() + level, s.getEnd() - level).trim(sm)), + level); + lineSpans.removeFirst(); + + if (calculateSrcSpans) { + sc.setSrcSpan(new SrcSpan(sm.getSrcPos(s.getStart()), -1)); + } + + break; + + case HR: + // remove the HR (----) and handle the rest as a parapraph line + removeHr(sm, s); + t = lineType.PARAGRAPH; + case PARAGRAPH: + case PARAGRAPH_BOXED: + case PARAGRAPH_INDENTED: + sc.addParagraph(buildParagraph(sm, cepp, lineSpans, t)); + break; + + case NESTEDLIST: + case NESTEDLIST_NR: + sc.addNestedList(buildNestedList(sm, cepp, lineSpans, t)); + break; + + case DEFINITIONLIST: + sc.addDefinitionList(buildDefinitionList(sm, cepp, lineSpans)); + break; + + case TABLE: + sc.addTable(buildTable(sm, cepp, lineSpans)); + break; + + case EMPTYLINE: + lineSpans.removeFirst(); + break; + + default: + logger.error("unknown lineStart!: \"" + sm.substring(s) + "\""); + lineSpans.removeFirst(); + } + } + + // add the remaining Section to the list. 
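+ // (the flat list of SectionContent objects is then nested into a SectionContainer
+ // hierarchy by buildSectionStructure, based on the level of each section)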
+ contentSections.add(sc); + + return buildSectionStructure(contentSections); } - // Identify the Line Type and call the necessary Function for the - // further handling... - while (!lineSpans.isEmpty()) { + private Span removeHr(SpanManager sm, Span s) + { + int start = s.getStart(); + final int end = s.getEnd(); + while (sm.charAt(start) == '-' && start < end) { + start++; + } + return s.setStart(start).trim(sm); + } - Span s = lineSpans.getFirst(); + /** + * The Line Types wich are possible... + */ + private enum lineType + { + SECTION, TABLE, NESTEDLIST, NESTEDLIST_NR, DEFINITIONLIST, HR, PARAGRAPH, + PARAGRAPH_INDENTED, PARAGRAPH_BOXED, EMPTYLINE + } - lineType t = getLineType(sm, s); - switch (t) { - case SECTION: - contentSections.add(sc); - int level = getSectionLevel(sm, s); - sc = new SectionContent(parseContentElement(sm, cepp, new Span( - s.getStart() + level, s.getEnd() - level).trim(sm)), - level); - lineSpans.removeFirst(); + /** + * Retunrns the Type of a line, this is mainly done by the First Char of the Line... + */ + private lineType getLineType(SpanManager sm, Span lineSpan) + { - if (calculateSrcSpans) { - sc.setSrcSpan(new SrcSpan(sm.getSrcPos(s.getStart()), -1)); - } + switch (lineSpan.charAt(0, sm)) { - break; + case '{': + if (lineSpan.charAt(1, sm) == '|') { + return lineType.TABLE; + } + else { + return lineType.PARAGRAPH; + } - case HR: - // remove the HR (----) and handle the rest as a parapraph line - removeHr(sm, s); - t = lineType.PARAGRAPH; - case PARAGRAPH: - case PARAGRAPH_BOXED: - case PARAGRAPH_INDENTED: - sc.addParagraph(buildParagraph(sm, cepp, lineSpans, t)); - break; + case '=': + if (lineSpan.length() > 2 && sm.charAt(lineSpan.getEnd() - 1) == '=') { + return lineType.SECTION; + } + else { + return lineType.PARAGRAPH; + } - case NESTEDLIST: - case NESTEDLIST_NR: - sc.addNestedList(buildNestedList(sm, cepp, lineSpans, t)); - break; + case '-': + if (lineSpan.charAt(1, sm) == '-' && lineSpan.charAt(2, sm) == '-' + && lineSpan.charAt(3, sm) == '-') { + return lineType.HR; + } + else { + return lineType.PARAGRAPH; + } - case DEFINITIONLIST: - sc.addDefinitionList(buildDefinitionList(sm, cepp, lineSpans)); - break; + case '*': + return lineType.NESTEDLIST; - case TABLE: - sc.addTable(buildTable(sm, cepp, lineSpans)); - break; + case '#': + return lineType.NESTEDLIST_NR; - case EMPTYLINE: - lineSpans.removeFirst(); - break; + case ';': + return lineType.DEFINITIONLIST; - default: - logger.error("unknown lineStart!: \"" + sm.substring(s) + "\""); - lineSpans.removeFirst(); - } - } + case ':': + if (lineSpan.length() > 1) { + if (lineSpan.length() > 2 && lineSpan.charAt(1, sm) == '{' + && lineSpan.charAt(2, sm) == '|') { + return lineType.TABLE; + } + else { + return lineType.PARAGRAPH_INDENTED; + } + } + else { + return lineType.PARAGRAPH; + } - // add the remaining Section to the list. 
- contentSections.add(sc); + case ' ': + int nonWSPos = lineSpan.nonWSCharPos(sm); + switch (lineSpan.charAt(nonWSPos, sm)) { + case Span.ERRORCHAR: + return lineType.EMPTYLINE; + case '{': + if (lineSpan.charAt(nonWSPos + 1, sm) == '|') { + return lineType.TABLE; + } + default: + return lineType.PARAGRAPH_BOXED; + } - return buildSectionStructure(contentSections); - } + case Span.ERRORCHAR: + return lineType.EMPTYLINE; - private Span removeHr(SpanManager sm, Span s) { - int start = s.getStart(); - final int end = s.getEnd(); - while (sm.charAt(start) == '-' && start < end) { - start++; + default: + return lineType.PARAGRAPH; + } } - return s.setStart(start).trim(sm); - } - - /** - * The Line Types wich are possible... - */ - private enum lineType { - SECTION, TABLE, NESTEDLIST, NESTEDLIST_NR, DEFINITIONLIST, HR, PARAGRAPH, PARAGRAPH_INDENTED, PARAGRAPH_BOXED, EMPTYLINE - } - - /** - * Retunrns the Type of a line, this is mainly done by the First Char of the - * Line... - */ - private lineType getLineType(SpanManager sm, Span lineSpan) { - - switch (lineSpan.charAt(0, sm)) { - - case '{': - if (lineSpan.charAt(1, sm) == '|') { - return lineType.TABLE; - } else { - return lineType.PARAGRAPH; - } - - case '=': - if (lineSpan.length() > 2 - && sm.charAt(lineSpan.getEnd() - 1) == '=') { - return lineType.SECTION; - } else { - return lineType.PARAGRAPH; - } - - case '-': - if (lineSpan.charAt(1, sm) == '-' && lineSpan.charAt(2, sm) == '-' - && lineSpan.charAt(3, sm) == '-') { - return lineType.HR; - } else { - return lineType.PARAGRAPH; - } - - case '*': - return lineType.NESTEDLIST; - - case '#': - return lineType.NESTEDLIST_NR; - - case ';': - return lineType.DEFINITIONLIST; - - case ':': - if (lineSpan.length() > 1) { - if (lineSpan.length() > 2 && lineSpan.charAt(1, sm) == '{' - && lineSpan.charAt(2, sm) == '|') { - return lineType.TABLE; - } else { - return lineType.PARAGRAPH_INDENTED; - } - } else { - return lineType.PARAGRAPH; - } - - case ' ': - int nonWSPos = lineSpan.nonWSCharPos(sm); - switch (lineSpan.charAt(nonWSPos, sm)) { - case Span.ERRORCHAR: - return lineType.EMPTYLINE; - case '{': - if (lineSpan.charAt(nonWSPos + 1, sm) == '|') { - return lineType.TABLE; + + /** + * Returns the number of Equality Chars which are used to specify the level of the Section. + */ + private int getSectionLevel(SpanManager sm, Span sectionNameSpan) + { + int begin = sectionNameSpan.getStart(); + int end = sectionNameSpan.getEnd(); + int level = 0; + + try { + while ((sm.charAt(begin + level) == '=') && (sm.charAt(end - 1 - level) == '=')) { + level++; } - default: - return lineType.PARAGRAPH_BOXED; + } + catch (StringIndexOutOfBoundsException e) { + // there is no need to do anything! + logger.debug("EXCEPTION IS OK: {}", e.getLocalizedMessage()); } - case Span.ERRORCHAR: - return lineType.EMPTYLINE; + if (begin + level == end) { + level = (level - 1) / 2; + } - default: - return lineType.PARAGRAPH; - } - } - - /** - * Returns the number of Equality Chars which are used to specify the level - * of the Section. - */ - private int getSectionLevel(SpanManager sm, Span sectionNameSpan) { - int begin = sectionNameSpan.getStart(); - int end = sectionNameSpan.getEnd(); - int level = 0; - - try { - while ((sm.charAt(begin + level) == '=') - && (sm.charAt(end - 1 - level) == '=')) { - level++; - } - } catch (StringIndexOutOfBoundsException e) { - // there is no need to do anything! 
- logger.debug("EXCEPTION IS OK: {}", e.getLocalizedMessage()); - } + return level; + } + + /** + * Takes a list of SectionContent and returns a SectionContainer with the given SectionContent s + * in the right structure. + */ + private SectionContainer buildSectionStructure(List<SectionContent> scl) + { + SectionContainer result = new SectionContainer(0); + + for (SectionContent sContent : scl) { + int contentLevel = sContent.getLevel(); + SectionContainer sContainer = result; + + // get the right SectionContainer or create it + for (int containerLevel = result.getLevel() + + 1; containerLevel < contentLevel; containerLevel++) { + int containerSubSections = sContainer.nrOfSubSections(); + if (containerSubSections != 0) { + Section temp = sContainer.getSubSection(containerSubSections - 1); + if (temp.getClass() == SectionContainer.class) { + sContainer = (SectionContainer) temp; + } + else { + SectionContainer sct = new SectionContainer(temp.getTitleElement(), + containerLevel); + sct.addSection(temp); + if (calculateSrcSpans) { + sct.setSrcSpan(temp.getSrcSpan()); + } + temp.setTitleElement(null); + temp.setLevel(containerLevel + 1); + sContainer.removeSection(temp); + sContainer.addSection(sct); + sContainer = sct; + } + } + else { + sContainer = new SectionContainer(null, containerLevel); + } + } - if (begin + level == end) { - level = (level - 1) / 2; - } + sContainer.addSection(sContent); + } - return level; - } - - /** - * Takes a list of SectionContent and returns a SectionContainer with the - * given SectionContent s in the right structure. - */ - private SectionContainer buildSectionStructure(List<SectionContent> scl) { - SectionContainer result = new SectionContainer(0); - - for (SectionContent sContent : scl) { - int contentLevel = sContent.getLevel(); - SectionContainer sContainer = result; - - // get the right SectionContainer or create it - for (int containerLevel = result.getLevel() + 1; containerLevel < contentLevel; containerLevel++) { - int containerSubSections = sContainer.nrOfSubSections(); - if (containerSubSections != 0) { - Section temp = sContainer - .getSubSection(containerSubSections - 1); - if (temp.getClass() == SectionContainer.class) { - sContainer = (SectionContainer) temp; - } else { - SectionContainer sct = new SectionContainer(temp - .getTitleElement(), containerLevel); - sct.addSection(temp); - if (calculateSrcSpans) { - sct.setSrcSpan(temp.getSrcSpan()); - } - temp.setTitleElement(null); - temp.setLevel(containerLevel + 1); - sContainer.removeSection(temp); - sContainer.addSection(sct); - sContainer = sct; - } - } else { - sContainer = new SectionContainer(null, containerLevel); + if (calculateSrcSpans) { + result.setSrcSpan(new SrcSpan(0, -1)); } - } - sContainer.addSection(sContent); + return result; } - if (calculateSrcSpans) { - result.setSrcSpan(new SrcSpan(0, -1)); + private boolean startsWithIgnoreCase(String s1, String s2) + { + final int s2len = s2.length(); + if (s1.length() < s2len) { + return false; + } + return s1.substring(0, s2len).equalsIgnoreCase(s2); } - return result; - } + private Span getTag(SpanManager sm, int offset) + { + int start = sm.indexOf("<", offset); + if (start == -1) { + return null; + } + int end = sm.indexOf(">", start); + if (end == -1) { + return null; + } - private boolean startsWithIgnoreCase(String s1, String s2) { - final int s2len = s2.length(); - if (s1.length() < s2len) { - return false; + Span s = new Span(start, end + 1); + if (calculateSrcSpans) { + s.setSrcSpan(new SrcSpan(sm.getSrcPos(start), 
sm.getSrcPos(end) + 1)); + } + return s; + } + + private String getTagText(SpanManager sm, Span tag) + { + return sm.substring(new Span(tag.getStart() + 1, tag.getEnd() - 1).trim(sm)); + } + + private void parseSpecifiedTag(SpanManager sm, List<Span> spans, List<String> strings, + String specifier) + { + parseSpecifiedTag(sm, spans, strings, specifier, ""); + } + + private void parseSpecifiedTag(SpanManager sm, List<Span> spans, List<String> strings, + String specifier, String prefix) + { + int offset = 0; + + Span s; + while ((s = getTag(sm, offset)) != null) { + offset = s.getEnd(); + String tagText = getTagText(sm, s); + if (startsWithIgnoreCase(tagText, specifier)) { + + Span e; + while ((e = getTag(sm, offset)) != null) { + offset = e.getEnd(); + tagText = getTagText(sm, e); + if (startsWithIgnoreCase(tagText, "/" + specifier)) { + break; + } + } + + if (e == null) { + /* + * OF: Setting e to sm.length()results in ArrayIndexOutOfBoundsExeption if + * calculateSrcSpans=true + */ + // e = new Span(sm.length(), sm.length()); + e = new Span(Math.max(0, sm.length() - 1), Math.max(0, sm.length() - 1)); + } + + strings.add(sm.substring(s.getEnd(), e.getStart())); + + Span tSpan = new Span(s.getStart(), e.getEnd()); + if (calculateSrcSpans) { + tSpan.setSrcSpan( + new SrcSpan(sm.getSrcPos(s.getStart()), sm.getSrcPos(e.getEnd()))); + } + + spans.add(tSpan); + sm.replace(tSpan, prefix + "(" + specifier + ")"); + tSpan.adjustStart(prefix.length()); + + offset = tSpan.getEnd(); + } + } } - return s1.substring(0, s2len).equalsIgnoreCase(s2); - } - private Span getTag(SpanManager sm, int offset) { - int start = sm.indexOf("<", offset); - if (start == -1) { - return null; - } - int end = sm.indexOf(">", start); - if (end == -1) { - return null; - } + private void parseTags(SpanManager sm, List<Span> spans) + { + sm.manageList(spans); - Span s = new Span(start, end + 1); - if (calculateSrcSpans) { - s - .setSrcSpan(new SrcSpan(sm.getSrcPos(start), sm - .getSrcPos(end) + 1)); - } - return s; - } - - private String getTagText(SpanManager sm, Span tag) { - return sm.substring(new Span(tag.getStart() + 1, tag.getEnd() - 1) - .trim(sm)); - } - - private void parseSpecifiedTag(SpanManager sm, List<Span> spans, - List<String> strings, String specifier) { - parseSpecifiedTag(sm, spans, strings, specifier, ""); - } - - private void parseSpecifiedTag(SpanManager sm, List<Span> spans, - List<String> strings, String specifier, String prefix) { - int offset = 0; - - Span s; - while ((s = getTag(sm, offset)) != null) { - offset = s.getEnd(); - String tagText = getTagText(sm, s); - if (startsWithIgnoreCase(tagText, specifier)) { - - Span e; - while ((e = getTag(sm, offset)) != null) { - offset = e.getEnd(); - tagText = getTagText(sm, e); - if (startsWithIgnoreCase(tagText, "/" + specifier)) { - break; - } + Span s = new Span(0, 0); + while ((s = getTag(sm, s.getEnd())) != null) { + spans.add(s); } - if (e == null) { - /* - * OF: Setting e to sm.length()results in ArrayIndexOutOfBoundsExeption if calculateSrcSpans=true - */ - //e = new Span(sm.length(), sm.length()); - e = new Span(Math.max(0, sm.length() - 1), Math.max(0, sm.length() - 1)); + if (spans.size() == 0) { + sm.removeManagedList(spans); } + } - strings.add(sm.substring(s.getEnd(), e.getStart())); + private void parseTemplates(SpanManager sm, List<Span> resolvedTemplateSpans, + List<ResolvedTemplate> resolvedTemplates, ParsedPage pp) + { - Span tSpan = new Span(s.getStart(), e.getEnd()); - if (calculateSrcSpans) { - tSpan.setSrcSpan(new 
SrcSpan(sm.getSrcPos(s.getStart()), sm - .getSrcPos(e.getEnd()))); + sm.manageList(resolvedTemplateSpans); + + int pos = -2; + Stack<Integer> templateOpenTags = new Stack<>(); + while ((pos = sm.indexOf("{{", pos + 2)) != -1) { + if (sm.length() > pos + 3 && sm.charAt(pos + 2) == '{' && sm.charAt(pos + 3) != '{') { + pos++; + } + templateOpenTags.push(pos); } - spans.add(tSpan); - sm.replace(tSpan, prefix + "(" + specifier + ")"); - tSpan.adjustStart(prefix.length()); + while (!templateOpenTags.empty()) { + int templateOpenTag = templateOpenTags.pop(); + int templateCloseTag = sm.indexOf("}}", templateOpenTag); + if (templateCloseTag == -1) { + continue; + } - offset = tSpan.getEnd(); - } - } - } + int templateOptionTag = sm.indexOf("|", templateOpenTag, templateCloseTag); + int templateNameEnd; + List<String> templateOptions; + + if (templateOptionTag != -1) { + templateNameEnd = templateOptionTag; + templateOptions = tokenize(sm, templateOptionTag + 1, templateCloseTag, "|"); + } + else { + templateNameEnd = templateCloseTag; + templateOptions = new ArrayList<>(); + } - private void parseTags(SpanManager sm, List<Span> spans) { - sm.manageList(spans); + Span ts = new Span(templateOpenTag, templateCloseTag + 2); - Span s = new Span(0, 0); - while ((s = getTag(sm, s.getEnd())) != null) { - spans.add(s); - } + Template t = new Template(ts, + encodeWikistyle(sm.substring(templateOpenTag + 2, templateNameEnd).trim()), + templateOptions); + + if (calculateSrcSpans) { + t.setSrcSpan(new SrcSpan(sm.getSrcPos(templateOpenTag), + sm.getSrcPos(templateCloseTag + 2))); + } - if (spans.size() == 0) { - sm.removeManagedList(spans); + t.setPos(ts); + + ResolvedTemplate rt = templateParser.parseTemplate(t, pp); + + resolvedTemplateSpans.add(ts); + resolvedTemplates.add(rt); + + sm.replace(ts, rt.getPreParseReplacement()); + } + + if (resolvedTemplateSpans.isEmpty()) { + sm.removeManagedList(resolvedTemplateSpans); + } } - } - - private void parseTemplates(SpanManager sm, - List<Span> resolvedTemplateSpans, - List<ResolvedTemplate> resolvedTemplates, ParsedPage pp) { - - sm.manageList(resolvedTemplateSpans); - - int pos = -2; - Stack<Integer> templateOpenTags = new Stack<>(); - while ((pos = sm.indexOf("{{", pos + 2)) != -1) { - if (sm.length() > pos + 3 && sm.charAt(pos + 2) == '{' - && sm.charAt(pos + 3) != '{') { - pos++; - } - templateOpenTags.push(pos); + + private void convertGalleriesToImages(SpanManager sm, List<Span> tagSpans) + { + // Quick Hack, not very efficent, should be improved, wont work with + // calculateSrcSpans == true ! + + for (int i = 0; i < tagSpans.size() - 1; i++) { + String openText = getTagText(sm, tagSpans.get(i)); + if (startsWithIgnoreCase(openText, "GALLERY")) { + + if (startsWithIgnoreCase(getTagText(sm, tagSpans.get(i + 1)), "/GALLERY")) { + + // gallery range is tag(i).end() .. 
tag(i+1).start() + Span startSpan = tagSpans.remove(i); + Span endSpan = tagSpans.remove(i); + i--; + + StringBuilder sb = new StringBuilder(); + + // caption (any option will be treated as caption) + int eqPos = openText.indexOf('='); + if (eqPos != -1) { + int captionStart = eqPos + 1; + int captionEnd = openText.length(); + + if (captionStart < captionEnd && openText.charAt(captionStart) == '"' + && openText.charAt(captionEnd - 1) == '"') { + captionStart++; + captionEnd--; + } + + if (captionStart < captionEnd) { + sb.append(openText.substring(captionStart, captionEnd) + lineSeparator); + } + } + + // images + for (String s : tokenize(sm, startSpan.getEnd(), endSpan.getStart(), + lineSeparator)) { + sb.append("[[" + s + "]]" + lineSeparator); + } + + // replace the source and remove the tags + sm.replace(startSpan.getStart(), endSpan.getEnd(), sb.toString()); + } + else { + continue; + } + } + } } - while (!templateOpenTags.empty()) { - int templateOpenTag = templateOpenTags.pop(); - int templateCloseTag = sm.indexOf("}}", templateOpenTag); - if (templateCloseTag == -1) { - continue; - } + private Table buildTable(SpanManager sm, ContentElementParsingParameters cepp, + LinkedList<Span> lineSpans) + { - int templateOptionTag = sm.indexOf("|", templateOpenTag, - templateCloseTag); - int templateNameEnd; - List<String> templateOptions; + Table result = new Table(); + int col = -1; + int row = 0; + int subTables = 0; + LinkedList<Span> tableDataSpans = new LinkedList<>(); + sm.manageList(tableDataSpans); - if (templateOptionTag != -1) { - templateNameEnd = templateOptionTag; - templateOptions = tokenize(sm, templateOptionTag + 1, - templateCloseTag, "|"); - } else { - templateNameEnd = templateCloseTag; - templateOptions = new ArrayList<>(); - } + if (calculateSrcSpans) { + result.setSrcSpan(new SrcSpan(sm.getSrcPos(lineSpans.getFirst().getStart()), -1)); + } + + lineSpans.removeFirst(); + + while (!lineSpans.isEmpty()) { + Span s = lineSpans.removeFirst(); + + int pos = s.nonWSCharPos(sm); + char c0 = s.charAt(pos, sm); + char c1 = s.charAt(pos + 1, sm); + + if (subTables == 0 && (c0 == '!' 
|| c0 == '|')) { + if (!tableDataSpans.isEmpty()) { + lineSpans.addFirst(s); + + SrcSpan ei = null; + if (calculateSrcSpans) { + ei = new SrcSpan(sm.getSrcPos(tableDataSpans.getFirst().getStart() - 1) + 1, + -1); + } + + TableElement te = new TableElement(parseSections(sm, cepp, tableDataSpans), row, + col); + te.setSrcSpan(ei); + result.addTableElement(te); + lineSpans.removeFirst(); + } + + col++; + if (c1 == '-') { + row++; + col = -1; + continue; + } + else if (c0 == '|' && c1 == '}') { + sm.removeManagedList(tableDataSpans); + + if (calculateSrcSpans) { + result.getSrcSpan().setEnd(sm.getSrcPos(s.getEnd())); + } + + return result; + } + else if (c0 == '|' && c1 == '+') { + result.setTitleElement(parseContentElement(sm, cepp, + new Span(s.getStart() + pos + 2, s.getEnd()).trim(sm))); + continue; + } + else { + int multipleCols; + if ((multipleCols = sm.indexOf("||", s.getStart() + pos + 1, + s.getEnd())) != -1) { + lineSpans.addFirst(new Span(multipleCols + 1, s.getEnd())); + s.setEnd(multipleCols); + } + + int optionTagPos = sm.indexOf("|", s.getStart() + pos + 1, s.getEnd()); + + if (optionTagPos != -1) { + s.setStart(optionTagPos + 1).trim(sm); + } + else { + s.adjustStart(pos + 1).trim(sm); + } + } + } + else if (c0 == '|' && c1 == '}') { + subTables--; + } + else if (c0 == '{' && c1 == '|') { + subTables++; + } + + tableDataSpans.addLast(s); + } - Span ts = new Span(templateOpenTag, templateCloseTag + 2); + if (tableDataSpans.size() != 0) { - Template t = new Template(ts, encodeWikistyle(sm.substring( - templateOpenTag + 2, templateNameEnd).trim()), - templateOptions); + SrcSpan ei = null; + if (calculateSrcSpans) { + ei = new SrcSpan(sm.getSrcPos(tableDataSpans.getFirst().getStart() - 1) + 1, -1); + } - if (calculateSrcSpans) { - t.setSrcSpan(new SrcSpan(sm.getSrcPos(templateOpenTag), sm - .getSrcPos(templateCloseTag + 2))); - } + TableElement te = new TableElement(parseSections(sm, cepp, tableDataSpans), row, col); + te.setSrcSpan(ei); - t.setPos(ts); + result.addTableElement(te); + } - ResolvedTemplate rt = templateParser.parseTemplate(t, pp); + sm.removeManagedList(tableDataSpans); - resolvedTemplateSpans.add(ts); - resolvedTemplates.add(rt); + if (calculateSrcSpans) { + result.getSrcSpan().setEnd(-1); + } - sm.replace(ts, rt.getPreParseReplacement()); + return result; } - if (resolvedTemplateSpans.isEmpty()) { - sm.removeManagedList(resolvedTemplateSpans); + private NestedListContainer buildNestedList(SpanManager sm, + ContentElementParsingParameters cepp, LinkedList<Span> lineSpans, lineType listType) + { + + boolean numbered = listType == lineType.NESTEDLIST_NR; + NestedListContainer result = new NestedListContainer(numbered); + + if (calculateSrcSpans) { + result.setSrcSpan(new SrcSpan(sm.getSrcPos(lineSpans.getFirst().getStart()), -1)); + } + + LinkedList<Span> nestedListSpans = new LinkedList<>(); + while (!lineSpans.isEmpty()) { + Span s = lineSpans.getFirst(); + if (listType != getLineType(sm, s)) { + break; + } + nestedListSpans.add(new Span(s.getStart() + 1, s.getEnd()).trim(sm)); + lineSpans.removeFirst(); + } + sm.manageList(nestedListSpans); + + if (calculateSrcSpans) { + result.getSrcSpan().setEnd(sm.getSrcPos(nestedListSpans.getLast().getEnd())); + } + + while (!nestedListSpans.isEmpty()) { + Span s = nestedListSpans.getFirst(); + lineType t = getLineType(sm, s); + if (t == lineType.NESTEDLIST || t == lineType.NESTEDLIST_NR) { + result.add(buildNestedList(sm, cepp, nestedListSpans, t)); + } + else { + nestedListSpans.removeFirst(); + 
result.add((NestedListElement) parseContentElement(sm, cepp, s, + new NestedListElement())); + } + } + + sm.removeManagedList(nestedListSpans); + + return result; } - } - private void convertGalleriesToImages(SpanManager sm, List<Span> tagSpans) { - // Quick Hack, not very efficent, should be improved, wont work with - // calculateSrcSpans == true ! + private DefinitionList buildDefinitionList(SpanManager sm, ContentElementParsingParameters cepp, + LinkedList<Span> lineSpans) + { + List<ContentElement> content = new ArrayList<>(); + + Span s = lineSpans.removeFirst(); + + int temp = sm.indexOf(":", s); + if (temp == -1) { + content.add(parseContentElement(sm, cepp, new Span(s.getStart() + 1, s.getEnd()))); + } + else { + content.add(parseContentElement(sm, cepp, new Span(temp + 1, s.getEnd()))); + content.add(0, parseContentElement(sm, cepp, new Span(s.getStart() + 1, temp))); + } + + while (!lineSpans.isEmpty()) { + Span ns = lineSpans.getFirst(); + if (sm.charAt(ns.getStart()) != ':') { + break; + } + lineSpans.removeFirst(); + content.add(parseContentElement(sm, cepp, new Span(ns.getStart() + 1, ns.getEnd()))); + } - for (int i = 0; i < tagSpans.size() - 1; i++) { - String openText = getTagText(sm, tagSpans.get(i)); - if (startsWithIgnoreCase(openText, "GALLERY")) { + DefinitionList result = new DefinitionList(content); - if (startsWithIgnoreCase(getTagText(sm, tagSpans.get(i + 1)), - "/GALLERY")) { + if (calculateSrcSpans) { + result.setSrcSpan(new SrcSpan(sm.getSrcPos(s.getStart()), + content.get(content.size() - 1).getSrcSpan().getEnd())); + } - // gallery range is tag(i).end() .. tag(i+1).start() - Span startSpan = tagSpans.remove(i); - Span endSpan = tagSpans.remove(i); - i--; + return result; + } - StringBuilder sb = new StringBuilder(); + private Paragraph buildParagraph(SpanManager sm, ContentElementParsingParameters cepp, + LinkedList<Span> lineSpans, lineType paragraphType) + { - // caption (any option will be treated as caption) - int eqPos = openText.indexOf('='); - if (eqPos != -1) { - int captionStart = eqPos + 1; - int captionEnd = openText.length(); + LinkedList<Span> paragraphSpans = new LinkedList<>(); + Paragraph result = new Paragraph(); + Span s = lineSpans.removeFirst(); + paragraphSpans.add(s); - if (captionStart < captionEnd - && openText.charAt(captionStart) == '"' - && openText.charAt(captionEnd - 1) == '"') { - captionStart++; - captionEnd--; + switch (paragraphType) { + case PARAGRAPH: + result.setType(Paragraph.type.NORMAL); + while (!lineSpans.isEmpty()) { + if (paragraphType != getLineType(sm, lineSpans.getFirst())) { + break; + } + paragraphSpans.add(lineSpans.removeFirst()); } + break; - if (captionStart < captionEnd) { - sb.append(openText.substring(captionStart, - captionEnd) - + lineSeparator); + case PARAGRAPH_BOXED: + result.setType(Paragraph.type.BOXED); + while (!lineSpans.isEmpty()) { + lineType lt = getLineType(sm, lineSpans.getFirst()); + if (paragraphType != lt && lineType.EMPTYLINE != lt) { + break; + } + paragraphSpans.add(lineSpans.removeFirst()); } - } + break; - // images - for (String s : tokenize(sm, startSpan.getEnd(), endSpan - .getStart(), lineSeparator)) { - sb.append("[[" + s + "]]" + lineSeparator); - } + case PARAGRAPH_INDENTED: + result.setType(Paragraph.type.INDENTED); + s.trim(sm.setCharAt(s.getStart(), ' ')); + break; - // replace the source and remove the tags - sm.replace(startSpan.getStart(), endSpan.getEnd(), sb - .toString()); - } else { - continue; + default: + return null; } - } + + parseContentElement(sm, cepp, 
paragraphSpans, result); + + return result; } - } - private Table buildTable(SpanManager sm, - ContentElementParsingParameters cepp, LinkedList<Span> lineSpans) { + private List<String> tokenize(SpanManager sm, int start, int end, String delim) + { + List<String> result = new ArrayList<>(); - Table result = new Table(); - int col = -1; - int row = 0; - int subTables = 0; - LinkedList<Span> tableDataSpans = new LinkedList<>(); - sm.manageList(tableDataSpans); + if (start > end) { + logger.debug("tokenize({},{}) doesn't make sense", start, end); + return result; + } - if (calculateSrcSpans) { - result.setSrcSpan(new SrcSpan(sm.getSrcPos(lineSpans.getFirst() - .getStart()), -1)); - } + int s = start; + int e; + String token; + // Span rs; + while ((e = sm.indexOf(delim, s, end)) != -1) { + // rs = new Span(s, e).trim( sm ); + // if( rs.length()>0 ) result.add( sm.substring( rs ) ); + token = sm.substring(s, e).trim(); + if (token.length() > 0) { + result.add(token); + } + s = e + delim.length(); + } + // rs = new Span(s, end).trim( sm ); + // if( rs.length()>0 ) result.add( sm.substring( rs ) ); + token = sm.substring(s, end).trim(); + if (token.length() > 0) { + result.add(token); + } - lineSpans.removeFirst(); - - while (!lineSpans.isEmpty()) { - Span s = lineSpans.removeFirst(); - - int pos = s.nonWSCharPos(sm); - char c0 = s.charAt(pos, sm); - char c1 = s.charAt(pos + 1, sm); - - if (subTables == 0 && (c0 == '!' || c0 == '|')) { - if (!tableDataSpans.isEmpty()) { - lineSpans.addFirst(s); - - SrcSpan ei = null; - if (calculateSrcSpans) { - ei = new SrcSpan(sm.getSrcPos(tableDataSpans.getFirst() - .getStart() - 1) + 1, -1); - } - - TableElement te = new TableElement(parseSections(sm, cepp, - tableDataSpans), row, col); - te.setSrcSpan(ei); - result.addTableElement(te); - lineSpans.removeFirst(); - } - - col++; - if (c1 == '-') { - row++; - col = -1; - continue; - } else if (c0 == '|' && c1 == '}') { - sm.removeManagedList(tableDataSpans); - - if (calculateSrcSpans) { - result.getSrcSpan().setEnd(sm.getSrcPos(s.getEnd())); - } - - return result; - } else if (c0 == '|' && c1 == '+') { - result.setTitleElement(parseContentElement(sm, cepp, - new Span(s.getStart() + pos + 2, s.getEnd()) - .trim(sm))); - continue; - } else { - int multipleCols; - if ((multipleCols = sm.indexOf("||", - s.getStart() + pos + 1, s.getEnd())) != -1) { - lineSpans.addFirst(new Span(multipleCols + 1, s - .getEnd())); - s.setEnd(multipleCols); - } - - int optionTagPos = sm.indexOf("|", s.getStart() + pos + 1, - s.getEnd()); - - if (optionTagPos != -1) { - s.setStart(optionTagPos + 1).trim(sm); - } else { - s.adjustStart(pos + 1).trim(sm); - } - } - } else if (c0 == '|' && c1 == '}') { - subTables--; - } else if (c0 == '{' && c1 == '|') { - subTables++; - } - - tableDataSpans.addLast(s); + return result; } - if (tableDataSpans.size() != 0) { + private void parseExternalLinks(SpanManager sm, Span s, String protocol, List<Span> managedList, + List<Link> links, Content home_cc) + { + int extLinkTargetStart; + Span extLinkSpan = new Span(0, s.getStart()); - SrcSpan ei = null; - if (calculateSrcSpans) { - ei = new SrcSpan(sm.getSrcPos(tableDataSpans.getFirst() - .getStart() - 1) + 1, -1); - } + while ((extLinkTargetStart = sm.indexOf(protocol, extLinkSpan.getEnd(), + s.getEnd())) != -1) { - TableElement te = new TableElement(parseSections(sm, cepp, - tableDataSpans), row, col); - te.setSrcSpan(ei); + // Allowed char before the protocol identifer ? 
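+ // (the match only starts an external link if it is at the start of the range or
+ // preceded by a space or '['; otherwise the search continues behind this occurrence)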
+ if (extLinkTargetStart > s.getStart() + && (" [").indexOf(sm.charAt(extLinkTargetStart - 1)) == -1) { + extLinkSpan = new Span(0, extLinkTargetStart + 1); + continue; + } - result.addTableElement(te); - } + // Target + int extLinkTargetEnd = extLinkTargetStart; + while ((lineSeparator + " ]").indexOf(sm.charAt(extLinkTargetEnd)) == -1) { + extLinkTargetEnd++; + } - sm.removeManagedList(tableDataSpans); + // Open/Close Tags + int extLinkOpenTag = extLinkTargetStart - 1; + int extLinkCloseTag; + int extLinkTextStart = extLinkTargetStart; + int extLinkTextEnd = extLinkTargetEnd; - if (calculateSrcSpans) { - result.getSrcSpan().setEnd(-1); - } + while (extLinkOpenTag >= s.getStart() && sm.charAt(extLinkOpenTag) == ' ') { + extLinkOpenTag--; + } - return result; - } + if (extLinkOpenTag >= s.getStart() && sm.charAt(extLinkOpenTag) == '[') { + extLinkCloseTag = sm.indexOf("]", extLinkTargetEnd, s.getEnd()); + + if (extLinkCloseTag != -1) { + extLinkTextStart = extLinkTargetEnd; + // nicht wie bei "normalen" links durhc | getrennt sondenr + // durhc leerzeichen !!! schei�e !!! + while (sm.charAt(extLinkTextStart) == ' ') { + extLinkTextStart++; + } + extLinkTextEnd = extLinkCloseTag; + extLinkCloseTag++; + + if (extLinkTextStart == extLinkTextEnd) { + sm.insert(extLinkTextStart, "[ ]"); + extLinkTextEnd += 3; + extLinkCloseTag += 3; + } + } + else { + extLinkOpenTag = extLinkTargetStart; + extLinkCloseTag = extLinkTargetEnd; + } + } + else { + extLinkOpenTag = extLinkTargetStart; + extLinkCloseTag = extLinkTargetEnd; + } - private NestedListContainer buildNestedList(SpanManager sm, - ContentElementParsingParameters cepp, LinkedList<Span> lineSpans, - lineType listType) { + extLinkSpan = new Span(extLinkOpenTag, extLinkCloseTag); + managedList.add(extLinkSpan); - boolean numbered = listType == lineType.NESTEDLIST_NR; - NestedListContainer result = new NestedListContainer(numbered); + Link l = new Link(home_cc, extLinkSpan, + sm.substring(extLinkTargetStart, extLinkTargetEnd), Link.type.EXTERNAL, null); + links.add(l); - if (calculateSrcSpans) { - result.setSrcSpan(new SrcSpan(sm.getSrcPos(lineSpans.getFirst() - .getStart()), -1)); - } + if (calculateSrcSpans) { + l.setSrcSpan(new SrcSpan(sm.getSrcPos(extLinkOpenTag), + sm.getSrcPos(extLinkCloseTag - 1) + 1)); + } - LinkedList<Span> nestedListSpans = new LinkedList<>(); - while (!lineSpans.isEmpty()) { - Span s = lineSpans.getFirst(); - if (listType != getLineType(sm, s)) { - break; - } - nestedListSpans - .add(new Span(s.getStart() + 1, s.getEnd()).trim(sm)); - lineSpans.removeFirst(); + sm.delete(extLinkTextEnd, extLinkCloseTag); + sm.delete(extLinkOpenTag, extLinkTextStart); + } } - sm.manageList(nestedListSpans); - if (calculateSrcSpans) { - result.getSrcSpan().setEnd( - sm.getSrcPos(nestedListSpans.getLast().getEnd())); + /** + * Returns the LOWERCASE NameSpace of the link target + */ + private static String getLinkNameSpace(String target) + { + int pos = target.indexOf(':'); + if (pos == -1) { + return null; + } + else { + return target.substring(0, pos).replace('_', ' ').trim().toLowerCase(); + } } - while (!nestedListSpans.isEmpty()) { - Span s = nestedListSpans.getFirst(); - lineType t = getLineType(sm, s); - if (t == lineType.NESTEDLIST || t == lineType.NESTEDLIST_NR) { - result.add(buildNestedList(sm, cepp, nestedListSpans, t)); - } else { - nestedListSpans.removeFirst(); - result.add((NestedListElement) parseContentElement(sm, cepp, s, - new NestedListElement())); - } - } + /** + * There is not much differences between links an 
images, so they are parsed in a single step + */ + private void parseImagesAndInternalLinks(SpanManager sm, List<Span> linkSpans, List<Link> links) + { - sm.removeManagedList(nestedListSpans); + sm.manageList(linkSpans); - return result; - } + int pos = -1; + Stack<Integer> linkOpenTags = new Stack<>(); + while ((pos = sm.indexOf("[[", pos + 1)) != -1) { + linkOpenTags.push(pos); + } - private DefinitionList buildDefinitionList(SpanManager sm, - ContentElementParsingParameters cepp, LinkedList<Span> lineSpans) { - List<ContentElement> content = new ArrayList<>(); + Span lastLinkSpan = new Span(sm.length() + 1, sm.length() + 1); + Link.type linkType = Link.type.INTERNAL; - Span s = lineSpans.removeFirst(); + while (!linkOpenTags.empty()) { + int linkStartTag = linkOpenTags.pop(); + int linkEndTag = sm.indexOf("]]", linkStartTag); + if (linkEndTag == -1) { + continue; + } - int temp = sm.indexOf(":", s); - if (temp == -1) { - content.add(parseContentElement(sm, cepp, new Span( - s.getStart() + 1, s.getEnd()))); - } else { - content.add(parseContentElement(sm, cepp, new Span(temp + 1, s - .getEnd()))); - content.add(0, parseContentElement(sm, cepp, new Span( - s.getStart() + 1, temp))); - } + int linkOptionTag = sm.indexOf("|", linkStartTag, linkEndTag); - while (!lineSpans.isEmpty()) { - Span ns = lineSpans.getFirst(); - if (sm.charAt(ns.getStart()) != ':') { - break; - } - lineSpans.removeFirst(); - content.add(parseContentElement(sm, cepp, new Span( - ns.getStart() + 1, ns.getEnd()))); - } + int linkTextStart; + String linkTarget; - DefinitionList result = new DefinitionList(content); + if (linkOptionTag != -1) { + linkTextStart = linkOptionTag + 1; + linkTarget = sm.substring(new Span(linkStartTag + 2, linkOptionTag).trim(sm)); + } + else { + linkTextStart = linkStartTag + 2; + linkTarget = sm.substring(new Span(linkStartTag + 2, linkEndTag).trim(sm)); + } - if (calculateSrcSpans) { - result.setSrcSpan(new SrcSpan(sm.getSrcPos(s.getStart()), content - .get(content.size() - 1).getSrcSpan().getEnd())); - } + // is is a regular link ? + if (linkTarget.contains(lineSeparator)) { + continue; + } + linkTarget = encodeWikistyle(linkTarget); + + // so it is a Link or image!!! + List<String> parameters; + + String namespace = getLinkNameSpace(linkTarget); + if (namespace != null) { + if (imageIdentifers.indexOf(namespace) != -1) { + if (linkOptionTag != -1) { + int temp; + while ((temp = sm.indexOf("|", linkTextStart, linkEndTag)) != -1) { + linkTextStart = temp + 1; + } + + parameters = tokenize(sm, linkOptionTag + 1, linkEndTag, "|"); + + // maybe there is an external link at the end of the + // image description... 
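+ // (then the ']]' found above overlaps the external link's closing bracket, so the
+ // image link actually ends one character further to the right)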
+ if (sm.charAt(linkEndTag + 2) == ']' + && sm.indexOf("[", linkTextStart, linkEndTag) != -1) { + linkEndTag++; + } + } + else { + parameters = null; + } + linkType = Link.type.IMAGE; + } + else { + // Link has namespace but is not image + linkType = Link.type.UNKNOWN; + parameters = null; + } + } + else { + if (linkType == Link.type.INTERNAL + && lastLinkSpan.hits(new Span(linkStartTag, linkEndTag + 2))) { + continue; + } + parameters = null; + linkType = Link.type.INTERNAL; + } - return result; - } + Span posSpan = new Span(linkTextStart, linkEndTag).trim(sm); + linkSpans.add(posSpan); - private Paragraph buildParagraph(SpanManager sm, - ContentElementParsingParameters cepp, LinkedList<Span> lineSpans, - lineType paragraphType) { + Link l = new Link(null, posSpan, linkTarget, linkType, parameters); + links.add(l); - LinkedList<Span> paragraphSpans = new LinkedList<>(); - Paragraph result = new Paragraph(); - Span s = lineSpans.removeFirst(); - paragraphSpans.add(s); + if (calculateSrcSpans) { + l.setSrcSpan(new SrcSpan(sm.getSrcPos(linkStartTag), sm.getSrcPos(linkEndTag + 2))); + } - switch (paragraphType) { - case PARAGRAPH: - result.setType(Paragraph.type.NORMAL); - while (!lineSpans.isEmpty()) { - if (paragraphType != getLineType(sm, lineSpans.getFirst())) { - break; - } - paragraphSpans.add(lineSpans.removeFirst()); - } - break; + sm.delete(posSpan.getEnd(), linkEndTag + 2); + sm.delete(linkStartTag, posSpan.getStart()); - case PARAGRAPH_BOXED: - result.setType(Paragraph.type.BOXED); - while (!lineSpans.isEmpty()) { - lineType lt = getLineType(sm, lineSpans.getFirst()); - if (paragraphType != lt && lineType.EMPTYLINE != lt) { - break; - } - paragraphSpans.add(lineSpans.removeFirst()); + // removing line separators in link text + int lsinlink; + while ((lsinlink = sm.indexOf(lineSeparator, posSpan)) != -1) { + sm.replace(lsinlink, lsinlink + lineSeparator.length(), " "); + } + + lastLinkSpan = posSpan; } - break; + } + + /** + * Searches the Range given by the Span s for the double occurence of "quotation" and puts the + * results in the List quotedSpans. The Quotation tags will be deleted. + * + * @param sm + * , the Source in which will be searched + * @param s + * , the range in which will be searched + * @param quotedSpans + * , the List where the Spans will be placed, should be managed by the SpanManager sm + * @param quotation + * , the start and end tag as String + */ + private void parseQuotedSpans(SpanManager sm, Span s, List<Span> quotedSpans, String quotation) + { + + final int qlen = quotation.length(); + + // get the start position + int start = sm.indexOf(quotation, s.getStart(), s.getEnd()); + + while (start != -1) { + + // get the end position + int end = sm.indexOf(quotation, start + qlen, s.getEnd()); + if (end == -1) { + break; + } - case PARAGRAPH_INDENTED: - result.setType(Paragraph.type.INDENTED); - s.trim(sm.setCharAt(s.getStart(), ' ')); - break; + // build a new span from start and end position. + Span qs = new Span(start, end); + quotedSpans.add(qs); - default: - return null; + // calculate the original src positions. + if (calculateSrcSpans) { + qs.setSrcSpan(new SrcSpan(sm.getSrcPos(start), sm.getSrcPos(end + qlen - 1) + 1)); + } + + // delete the tags. + sm.delete(end, end + qlen); + sm.delete(start, start + qlen); + + // get the next start position + start = sm.indexOf(quotation, qs.getEnd(), s.getEnd()); + } } - parseContentElement(sm, cepp, paragraphSpans, result); + /** + * Searches a line for Bold and Italic quotations, this has to be done linewhise. 
+ */ + private void parseBoldAndItalicSpans(SpanManager sm, Span line, List<Span> boldSpans, + List<Span> italicSpans) + { + // Das suchen nach BOLD und ITALIC muss in den Jeweiligen + // Zeilen geschenhen, da ein LineSeparator immer BOLD und + // Italic Tags schliesst. - return result; - } + // Bold Spans + parseQuotedSpans(sm, line, boldSpans, "'''"); - private List<String> tokenize(SpanManager sm, int start, int end, - String delim) { - List<String> result = new ArrayList<>(); + // Italic Spans + parseQuotedSpans(sm, line, italicSpans, "''"); - if (start > end) { - logger.debug("tokenize({},{}) doesn't make sense", start, end); - return result; - } + // Maybe there is ONE SINGLE OPEN TAG left... handel these... + int openTag = sm.indexOf("''", line); + if (openTag != -1) { + // build a Span from this Tag. + Span qs = new Span(openTag, line.getEnd()); - int s = start; - int e; - String token; - // Span rs; - while ((e = sm.indexOf(delim, s, end)) != -1) { - // rs = new Span(s, e).trim( sm ); - // if( rs.length()>0 ) result.add( sm.substring( rs ) ); - token = sm.substring(s, e).trim(); - if (token.length() > 0) { - result.add(token); - } - s = e + delim.length(); - } - // rs = new Span(s, end).trim( sm ); - // if( rs.length()>0 ) result.add( sm.substring( rs ) ); - token = sm.substring(s, end).trim(); - if (token.length() > 0) { - result.add(token); - } + // calculate the original src positions. + if (calculateSrcSpans) { + qs.setSrcSpan(new SrcSpan(sm.getSrcPos(openTag), sm.getSrcPos(line.getEnd()))); + } - return result; - } - - private void parseExternalLinks(SpanManager sm, Span s, String protocol, - List<Span> managedList, List<Link> links, Content home_cc) { - int extLinkTargetStart; - Span extLinkSpan = new Span(0, s.getStart()); - - while ((extLinkTargetStart = sm.indexOf(protocol, extLinkSpan.getEnd(), - s.getEnd())) != -1) { - - // Allowed char before the protocol identifer ? - if (extLinkTargetStart > s.getStart() - && (" [").indexOf(sm.charAt(extLinkTargetStart - 1)) == -1) { - extLinkSpan = new Span(0, extLinkTargetStart + 1); - continue; - } - - // Target - int extLinkTargetEnd = extLinkTargetStart; - while ((lineSeparator + " ]").indexOf(sm.charAt(extLinkTargetEnd)) == -1) { - extLinkTargetEnd++; - } - - // Open/Close Tags - int extLinkOpenTag = extLinkTargetStart - 1; - int extLinkCloseTag; - int extLinkTextStart = extLinkTargetStart; - int extLinkTextEnd = extLinkTargetEnd; - - while (extLinkOpenTag >= s.getStart() - && sm.charAt(extLinkOpenTag) == ' ') { - extLinkOpenTag--; - } - - if (extLinkOpenTag >= s.getStart() - && sm.charAt(extLinkOpenTag) == '[') { - extLinkCloseTag = sm.indexOf("]", extLinkTargetEnd, s.getEnd()); - - if (extLinkCloseTag != -1) { - extLinkTextStart = extLinkTargetEnd; - // nicht wie bei "normalen" links durhc | getrennt sondenr - // durhc leerzeichen !!! schei�e !!! 
- while (sm.charAt(extLinkTextStart) == ' ') { - extLinkTextStart++; - } - extLinkTextEnd = extLinkCloseTag; - extLinkCloseTag++; - - if (extLinkTextStart == extLinkTextEnd) { - sm.insert(extLinkTextStart, "[ ]"); - extLinkTextEnd += 3; - extLinkCloseTag += 3; - } - } else { - extLinkOpenTag = extLinkTargetStart; - extLinkCloseTag = extLinkTargetEnd; - } - } else { - extLinkOpenTag = extLinkTargetStart; - extLinkCloseTag = extLinkTargetEnd; - } - - extLinkSpan = new Span(extLinkOpenTag, extLinkCloseTag); - managedList.add(extLinkSpan); - - Link l = new Link(home_cc, extLinkSpan, sm.substring( - extLinkTargetStart, extLinkTargetEnd), Link.type.EXTERNAL, - null); - links.add(l); - - if (calculateSrcSpans) { - l.setSrcSpan(new SrcSpan(sm.getSrcPos(extLinkOpenTag), sm - .getSrcPos(extLinkCloseTag - 1) + 1)); - } - - sm.delete(extLinkTextEnd, extLinkCloseTag); - sm.delete(extLinkOpenTag, extLinkTextStart); + // is it a Bold or an Italic tag ? + if (sm.indexOf("'''", openTag, openTag + 3) != -1) { + // --> BOLD + boldSpans.add(qs); + sm.delete(openTag, openTag + 3); + } + else { + // --> ITALIC + italicSpans.add(qs); + sm.delete(openTag, openTag + 2); + } + } } - } - - /** - * Returns the LOWERCASE NameSpace of the link target - */ - private static String getLinkNameSpace(String target) { - int pos = target.indexOf(':'); - if (pos == -1) { - return null; - } else { - return target.substring(0, pos).replace('_', ' ').trim() - .toLowerCase(); + + private static String encodeWikistyle(String str) + { + return str.replace(' ', '_'); } - } - /** - * There is not much differences between links an images, so they are parsed - * in a single step - */ - private void parseImagesAndInternalLinks(SpanManager sm, - List<Span> linkSpans, List<Link> links) { + /** + * Building a ContentElement from a String + */ + @Override + public ContentElement parseContentElement(String src) + { + SpanManager sm = new SpanManager(src); + ContentElementParsingParameters cepp = new ContentElementParsingParameters(); - sm.manageList(linkSpans); + parseImagesAndInternalLinks(sm, cepp.linkSpans, cepp.links); - int pos = -1; - Stack<Integer> linkOpenTags = new Stack<>(); - while ((pos = sm.indexOf("[[", pos + 1)) != -1) { - linkOpenTags.push(pos); + LinkedList<Span> lineSpans = new LinkedList<>(); + getLineSpans(sm, lineSpans); + sm.removeManagedList(lineSpans); + return (parseContentElement(sm, cepp, lineSpans, new ContentElement())); } - Span lastLinkSpan = new Span(sm.length() + 1, sm.length() + 1); - Link.type linkType = Link.type.INTERNAL; - - while (!linkOpenTags.empty()) { - int linkStartTag = linkOpenTags.pop(); - int linkEndTag = sm.indexOf("]]", linkStartTag); - if (linkEndTag == -1) { - continue; - } - - int linkOptionTag = sm.indexOf("|", linkStartTag, linkEndTag); - - int linkTextStart; - String linkTarget; - - if (linkOptionTag != -1) { - linkTextStart = linkOptionTag + 1; - linkTarget = sm.substring(new Span(linkStartTag + 2, - linkOptionTag).trim(sm)); - } else { - linkTextStart = linkStartTag + 2; - linkTarget = sm - .substring(new Span(linkStartTag + 2, linkEndTag) - .trim(sm)); - } - - // is is a regular link ? - if (linkTarget.contains(lineSeparator)) { - continue; - } - linkTarget = encodeWikistyle(linkTarget); - - // so it is a Link or image!!! 
- List<String> parameters; - - String namespace = getLinkNameSpace(linkTarget); - if (namespace != null) { - if (imageIdentifers.indexOf(namespace) != -1) { - if (linkOptionTag != -1) { - int temp; - while ((temp = sm.indexOf("|", linkTextStart, - linkEndTag)) != -1) { - linkTextStart = temp + 1; - } - - parameters = tokenize(sm, linkOptionTag + 1, - linkEndTag, "|"); - - // maybe there is an external link at the end of the - // image description... - if (sm.charAt(linkEndTag + 2) == ']' - && sm.indexOf("[", linkTextStart, linkEndTag) != -1) { - linkEndTag++; - } - } else { - parameters = null; - } - linkType = Link.type.IMAGE; - } else { - //Link has namespace but is not image - linkType = Link.type.UNKNOWN; - parameters = null; - } - } else { - if (linkType == Link.type.INTERNAL - && lastLinkSpan.hits(new Span(linkStartTag, - linkEndTag + 2))) { - continue; - } - parameters = null; - linkType = Link.type.INTERNAL; - } - - Span posSpan = new Span(linkTextStart, linkEndTag).trim(sm); - linkSpans.add(posSpan); - - Link l = new Link(null, posSpan, linkTarget, linkType, parameters); - links.add(l); - - if (calculateSrcSpans) { - l.setSrcSpan(new SrcSpan(sm.getSrcPos(linkStartTag), sm - .getSrcPos(linkEndTag + 2))); - } - - sm.delete(posSpan.getEnd(), linkEndTag + 2); - sm.delete(linkStartTag, posSpan.getStart()); - - // removing line separators in link text - int lsinlink; - while ((lsinlink = sm.indexOf(lineSeparator, posSpan)) != -1) { - sm.replace(lsinlink, lsinlink + lineSeparator.length(), " "); - } - - lastLinkSpan = posSpan; - } - } - - /** - * Searches the Range given by the Span s for the double occurence of - * "quotation" and puts the results in the List quotedSpans. The Quotation - * tags will be deleted. - * - * @param sm , the Source in which will be searched - * @param s , the range in which will be searched - * @param quotedSpans , the List where the Spans will be placed, should be managed - * by the SpanManager sm - * @param quotation , the start and end tag as String - */ - private void parseQuotedSpans(SpanManager sm, Span s, - List<Span> quotedSpans, String quotation) { - - final int qlen = quotation.length(); - - // get the start position - int start = sm.indexOf(quotation, s.getStart(), s.getEnd()); - - while (start != -1) { - - // get the end position - int end = sm.indexOf(quotation, start + qlen, s.getEnd()); - if (end == -1) { - break; - } - - // build a new span from start and end position. - Span qs = new Span(start, end); - quotedSpans.add(qs); - - // calculate the original src positions. - if (calculateSrcSpans) { - qs.setSrcSpan(new SrcSpan(sm.getSrcPos(start), sm.getSrcPos(end - + qlen - 1) + 1)); - } - - // delete the tags. - sm.delete(end, end + qlen); - sm.delete(start, start + qlen); - - // get the next start position - start = sm.indexOf(quotation, qs.getEnd(), s.getEnd()); + /** + * Building a ContentElement from a single line. + */ + private ContentElement parseContentElement(SpanManager sm, ContentElementParsingParameters cepp, + Span lineSpan) + { + LinkedList<Span> lineSpans = new LinkedList<>(); + lineSpans.add(lineSpan); + return parseContentElement(sm, cepp, lineSpans, new ContentElement()); } - } - - /** - * Searches a line for Bold and Italic quotations, this has to be done - * linewhise. 
- */ - private void parseBoldAndItalicSpans(SpanManager sm, Span line, - List<Span> boldSpans, List<Span> italicSpans) { - // Das suchen nach BOLD und ITALIC muss in den Jeweiligen - // Zeilen geschenhen, da ein LineSeparator immer BOLD und - // Italic Tags schliesst. - - // Bold Spans - parseQuotedSpans(sm, line, boldSpans, "'''"); - - // Italic Spans - parseQuotedSpans(sm, line, italicSpans, "''"); - - // Maybe there is ONE SINGLE OPEN TAG left... handel these... - int openTag = sm.indexOf("''", line); - if (openTag != -1) { - // build a Span from this Tag. - Span qs = new Span(openTag, line.getEnd()); - - // calculate the original src positions. - if (calculateSrcSpans) { - qs.setSrcSpan(new SrcSpan(sm.getSrcPos(openTag), sm - .getSrcPos(line.getEnd()))); - } - - // is it a Bold or an Italic tag ? - if (sm.indexOf("'''", openTag, openTag + 3) != -1) { - // --> BOLD - boldSpans.add(qs); - sm.delete(openTag, openTag + 3); - } else { - // --> ITALIC - italicSpans.add(qs); - sm.delete(openTag, openTag + 2); - } - } - } - - private static String encodeWikistyle(String str) { - return str.replace(' ', '_'); - } - - /** - * Building a ContentElement from a String - */ - @Override - public ContentElement parseContentElement(String src) { - SpanManager sm = new SpanManager(src); - ContentElementParsingParameters cepp = new ContentElementParsingParameters(); - - parseImagesAndInternalLinks(sm, cepp.linkSpans, cepp.links); - - LinkedList<Span> lineSpans = new LinkedList<>(); - getLineSpans(sm, lineSpans); - sm.removeManagedList(lineSpans); - return (parseContentElement(sm, cepp, lineSpans, new ContentElement())); - } - - /** - * Building a ContentElement from a single line. - */ - private ContentElement parseContentElement(SpanManager sm, - ContentElementParsingParameters cepp, Span lineSpan) { - LinkedList<Span> lineSpans = new LinkedList<>(); - lineSpans.add(lineSpan); - return parseContentElement(sm, cepp, lineSpans, new ContentElement()); - } - - /** - * Building a ContentElement from a single line. But the result is given, so - * e.g. a NestedListElement can be filled with information... - */ - private ContentElement parseContentElement(SpanManager sm, - ContentElementParsingParameters cepp, Span lineSpan, - ContentElement result) { - LinkedList<Span> lineSpans = new LinkedList<>(); - lineSpans.add(lineSpan); - return parseContentElement(sm, cepp, lineSpans, result); - } - - /** - * Building a ContentElement, this funciton is calles by all the other - * parseContentElement(..) functions - */ - private ContentElement parseContentElement(SpanManager sm, - ContentElementParsingParameters cepp, LinkedList<Span> lineSpans, - ContentElement result) { - - List<Link> localLinks = new ArrayList<>(); - List<Template> localTemplates = new ArrayList<>(); - - List<Span> boldSpans = new ArrayList<>(); - List<Span> italicSpans = new ArrayList<>(); - sm.manageList(boldSpans); - sm.manageList(italicSpans); - - List<Span> managedSpans = new ArrayList<>(); - sm.manageList(managedSpans); - - Span contentElementRange = new Span(lineSpans.getFirst().getStart(), - lineSpans.getLast().getEnd()).trim(sm); - managedSpans.add(contentElementRange); - - // set the SrcSpan - if (calculateSrcSpans) { - result.setSrcSpan(new SrcSpan(sm.getSrcPos(contentElementRange - .getStart()), sm.getSrcPos(contentElementRange.getEnd()))); + + /** + * Building a ContentElement from a single line. But the result is given, so e.g. a + * NestedListElement can be filled with information... 
+ */ + private ContentElement parseContentElement(SpanManager sm, ContentElementParsingParameters cepp, + Span lineSpan, ContentElement result) + { + LinkedList<Span> lineSpans = new LinkedList<>(); + lineSpans.add(lineSpan); + return parseContentElement(sm, cepp, lineSpans, result); } - sm.manageList(lineSpans); - while (!lineSpans.isEmpty()) { - Span line = lineSpans.getFirst(); + /** + * Building a ContentElement, this funciton is calles by all the other parseContentElement(..) + * functions + */ + private ContentElement parseContentElement(SpanManager sm, ContentElementParsingParameters cepp, + LinkedList<Span> lineSpans, ContentElement result) + { - parseBoldAndItalicSpans(sm, line, boldSpans, italicSpans); + List<Link> localLinks = new ArrayList<>(); + List<Template> localTemplates = new ArrayList<>(); - // External links - parseExternalLinks(sm, line, "http://", managedSpans, localLinks, - result); - parseExternalLinks(sm, line, "https://", managedSpans, localLinks, - result); - parseExternalLinks(sm, line, "ftp://", managedSpans, localLinks, - result); - parseExternalLinks(sm, line, "mailto:", managedSpans, localLinks, - result); + List<Span> boldSpans = new ArrayList<>(); + List<Span> italicSpans = new ArrayList<>(); + sm.manageList(boldSpans); + sm.manageList(italicSpans); - // end of linewhise opperations - lineSpans.removeFirst(); - } - sm.removeManagedList(lineSpans); - - // Links - int i; - i = 0; - while (i < cepp.linkSpans.size()) { - if (contentElementRange.hits(cepp.linkSpans.get(i))) { - Span linkSpan = cepp.linkSpans.remove(i); - managedSpans.add(linkSpan); - Link l = cepp.links.remove(i).setHomeElement(result); - localLinks.add(l); - if (!showImageText && l.getType() == Link.type.IMAGE) { - // deletes the Image Text from the ContentElement Text. 
- sm.delete(linkSpan); - } - } else { - i++; - } - } + List<Span> managedSpans = new ArrayList<>(); + sm.manageList(managedSpans); - // Templates - i = 0; - while (i < cepp.templateSpans.size()) { - Span ts = cepp.templateSpans.get(i); - if (contentElementRange.hits(ts)) { - ResolvedTemplate rt = cepp.templates.remove(i); - - if (rt.getPostParseReplacement() != null) { - sm.replace(ts, rt.getPostParseReplacement()); - } - cepp.templateSpans.remove(i); - - Object parsedObject = rt.getParsedObject(); - if (parsedObject != null) { - managedSpans.add(ts); - - Class<?> parsedObjectClass = parsedObject.getClass(); - if (parsedObjectClass == Template.class) { - localTemplates.add((Template) parsedObject); - } else if (parsedObjectClass == Link.class) { - localLinks.add(((Link) parsedObject) - .setHomeElement(result)); - } else { - localTemplates.add(rt.getTemplate()); - } - } - } else { - i++; - } - } + Span contentElementRange = new Span(lineSpans.getFirst().getStart(), + lineSpans.getLast().getEnd()).trim(sm); + managedSpans.add(contentElementRange); - // HTML/XML Tags - i = 0; - List<Span> tags = new ArrayList<>(); - while (i < cepp.tagSpans.size()) { - Span s = cepp.tagSpans.get(i); - if (contentElementRange.hits(s)) { - cepp.tagSpans.remove(i); - if (deleteTags) { - sm.delete(s); - } else { - tags.add(s); - managedSpans.add(s); - } - } else { - i++; - } - } + // set the SrcSpan + if (calculateSrcSpans) { + result.setSrcSpan(new SrcSpan(sm.getSrcPos(contentElementRange.getStart()), + sm.getSrcPos(contentElementRange.getEnd()))); + } - // noWiki - i = 0; - List<Span> localNoWikiSpans = new ArrayList<>(); - while (i < cepp.noWikiSpans.size()) { - Span s = cepp.noWikiSpans.get(i); - if (contentElementRange.hits(s)) { - cepp.noWikiSpans.remove(i); - sm.replace(s, cepp.noWikiStrings.remove(i)); - localNoWikiSpans.add(s); - managedSpans.add(s); - } else { - i++; - } - } + sm.manageList(lineSpans); + while (!lineSpans.isEmpty()) { + Span line = lineSpans.getFirst(); - // MATH Tags - i = 0; - List<Span> mathSpans = new ArrayList<>(); - while (i < cepp.mathSpans.size()) { - Span s = cepp.mathSpans.get(i); - if (contentElementRange.hits(s)) { - cepp.mathSpans.remove(i); - - if (showMathTagContent) { - mathSpans.add(s); - managedSpans.add(s); - sm.replace(s, cepp.mathStrings.remove(i)); - } else { - sm.delete(s); - } - } else { - i++; - } - } + parseBoldAndItalicSpans(sm, line, boldSpans, italicSpans); - result.setText(sm.substring(contentElementRange)); + // External links + parseExternalLinks(sm, line, "http://", managedSpans, localLinks, result); + parseExternalLinks(sm, line, "https://", managedSpans, localLinks, result); + parseExternalLinks(sm, line, "ftp://", managedSpans, localLinks, result); + parseExternalLinks(sm, line, "mailto:", managedSpans, localLinks, result); - // managed spans must be removed here and not earlier, because every - // change in the SpanManager affects the Spans! - sm.removeManagedList(boldSpans); - sm.removeManagedList(italicSpans); - sm.removeManagedList(managedSpans); + // end of linewhise opperations + lineSpans.removeFirst(); + } + sm.removeManagedList(lineSpans); + + // Links + int i; + i = 0; + while (i < cepp.linkSpans.size()) { + if (contentElementRange.hits(cepp.linkSpans.get(i))) { + Span linkSpan = cepp.linkSpans.remove(i); + managedSpans.add(linkSpan); + Link l = cepp.links.remove(i).setHomeElement(result); + localLinks.add(l); + if (!showImageText && l.getType() == Link.type.IMAGE) { + // deletes the Image Text from the ContentElement Text. 
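+                        // (illustrative example: for "[[Image:Example.png|thumb|Some caption]]"
+                        // the caption "Some caption" is the link text span removed here)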
+ sm.delete(linkSpan); + } + } + else { + i++; + } + } - // contentElementRange ist auch noch in managedSpans !!! deswegen: - final int adjust = -contentElementRange.getStart(); - for (Span s : boldSpans) { - s.adjust(adjust); - } - for (Span s : italicSpans) { - s.adjust(adjust); - } - for (Span s : managedSpans) { - s.adjust(adjust); - } + // Templates + i = 0; + while (i < cepp.templateSpans.size()) { + Span ts = cepp.templateSpans.get(i); + if (contentElementRange.hits(ts)) { + ResolvedTemplate rt = cepp.templates.remove(i); + + if (rt.getPostParseReplacement() != null) { + sm.replace(ts, rt.getPostParseReplacement()); + } + cepp.templateSpans.remove(i); + + Object parsedObject = rt.getParsedObject(); + if (parsedObject != null) { + managedSpans.add(ts); + + Class<?> parsedObjectClass = parsedObject.getClass(); + if (parsedObjectClass == Template.class) { + localTemplates.add((Template) parsedObject); + } + else if (parsedObjectClass == Link.class) { + localLinks.add(((Link) parsedObject).setHomeElement(result)); + } + else { + localTemplates.add(rt.getTemplate()); + } + } + } + else { + i++; + } + } - result.setFormatSpans(FormatType.BOLD, boldSpans); - result.setFormatSpans(FormatType.ITALIC, italicSpans); - result.setFormatSpans(FormatType.TAG, tags); - result.setFormatSpans(FormatType.MATH, mathSpans); - result.setFormatSpans(FormatType.NOWIKI, localNoWikiSpans); - - result.setLinks(sortLinks(localLinks)); - result.setTemplates(sortTemplates(localTemplates)); - - return result; - } - - /** - * Sorts the Links... - */ - private static List<Link> sortLinks(List<Link> links) { - List<Link> result = new ArrayList<>(); - for (Link l : links) { - int pos = 0; - while (pos < result.size() - && l.getPos().getStart() > result.get(pos).getPos() - .getStart()) { - pos++; - } - result.add(pos, l); - } - return result; - } - - /** - * Sorts the Templates... - */ - private static List<Template> sortTemplates(List<Template> templates) { - List<Template> result = new ArrayList<>(); - for (Template t : templates) { - int pos = 0; - while (pos < result.size() - && t.getPos().getStart() > result.get(pos).getPos() - .getStart()) { - pos++; - } - result.add(pos, t); - } - return result; - } - - /** - * Algorithm to identify the first paragraph of a ParsedPage - */ - private void setFirstParagraph(ParsedPage pp) { - int nr = pp.nrOfParagraphs(); - - // the paragraph with the lowest number, must not be the first, maybe it - // is only an Image... 
- for (int i = 0; i < nr; i++) { - Paragraph p = pp.getParagraph(i); - - // get the Text from the paragraph - SpanManager ptext = new SpanManager(p.getText()); - List<Span> delete = new ArrayList<>(); - ptext.manageList(delete); - - // getting the spans to remove from the text, for templates - List<Template> tl = p.getTemplates(); - for (int j = tl.size() - 1; j >= 0; j--) { - delete.add(tl.get(j).getPos()); - } - - // getting the spans to remove from the text, for Tags - List<Span> sl = p.getFormatSpans(FormatType.TAG); - for (int j = sl.size() - 1; j >= 0; j--) { - delete.add(sl.get(j)); - } - - // getting the spans to remove from the text, for image text - if (showImageText) { - List<Link> ll = p.getLinks(Link.type.IMAGE); - for (int j = ll.size() - 1; j >= 0; j--) { - delete.add(ll.get(j).getPos()); - } - } - - // delete the spans in reverse order, the spans are managed, so - // there is no need to sort them - for (int j = delete.size() - 1; j >= 0; j--) { - ptext.delete(delete.remove(j)); - } - - // removing line separators if exist, so the result can be trimmed - // in the next step - int pos = ptext.indexOf(lineSeparator); - while (pos != -1) { - ptext.delete(pos, pos + lineSeparator.length()); - pos = ptext.indexOf(lineSeparator); - } - - // if the result is not an empty string, we got the number of the - // first paragraph - if (!ptext.toString().trim().equals("")) { - pp.setFirstParagraphNr(i); - return; - } + // HTML/XML Tags + i = 0; + List<Span> tags = new ArrayList<>(); + while (i < cepp.tagSpans.size()) { + Span s = cepp.tagSpans.get(i); + if (contentElementRange.hits(s)) { + cepp.tagSpans.remove(i); + if (deleteTags) { + sm.delete(s); + } + else { + tags.add(s); + managedSpans.add(s); + } + } + else { + i++; + } + } + + // noWiki + i = 0; + List<Span> localNoWikiSpans = new ArrayList<>(); + while (i < cepp.noWikiSpans.size()) { + Span s = cepp.noWikiSpans.get(i); + if (contentElementRange.hits(s)) { + cepp.noWikiSpans.remove(i); + sm.replace(s, cepp.noWikiStrings.remove(i)); + localNoWikiSpans.add(s); + managedSpans.add(s); + } + else { + i++; + } + } + + // MATH Tags + i = 0; + List<Span> mathSpans = new ArrayList<>(); + while (i < cepp.mathSpans.size()) { + Span s = cepp.mathSpans.get(i); + if (contentElementRange.hits(s)) { + cepp.mathSpans.remove(i); + + if (showMathTagContent) { + mathSpans.add(s); + managedSpans.add(s); + sm.replace(s, cepp.mathStrings.remove(i)); + } + else { + sm.delete(s); + } + } + else { + i++; + } + } + + result.setText(sm.substring(contentElementRange)); + + // managed spans must be removed here and not earlier, because every + // change in the SpanManager affects the Spans! + sm.removeManagedList(boldSpans); + sm.removeManagedList(italicSpans); + sm.removeManagedList(managedSpans); + + // contentElementRange ist auch noch in managedSpans !!! deswegen: + final int adjust = -contentElementRange.getStart(); + for (Span s : boldSpans) { + s.adjust(adjust); + } + for (Span s : italicSpans) { + s.adjust(adjust); + } + for (Span s : managedSpans) { + s.adjust(adjust); + } + + result.setFormatSpans(FormatType.BOLD, boldSpans); + result.setFormatSpans(FormatType.ITALIC, italicSpans); + result.setFormatSpans(FormatType.TAG, tags); + result.setFormatSpans(FormatType.MATH, mathSpans); + result.setFormatSpans(FormatType.NOWIKI, localNoWikiSpans); + + result.setLinks(sortLinks(localLinks)); + result.setTemplates(sortTemplates(localTemplates)); + + return result; + } + + /** + * Sorts the Links... 
+ */ + private static List<Link> sortLinks(List<Link> links) + { + List<Link> result = new ArrayList<>(); + for (Link l : links) { + int pos = 0; + while (pos < result.size() + && l.getPos().getStart() > result.get(pos).getPos().getStart()) { + pos++; + } + result.add(pos, l); + } + return result; + } + + /** + * Sorts the Templates... + */ + private static List<Template> sortTemplates(List<Template> templates) + { + List<Template> result = new ArrayList<>(); + for (Template t : templates) { + int pos = 0; + while (pos < result.size() + && t.getPos().getStart() > result.get(pos).getPos().getStart()) { + pos++; + } + result.add(pos, t); + } + return result; + } + + /** + * Algorithm to identify the first paragraph of a ParsedPage + */ + private void setFirstParagraph(ParsedPage pp) + { + int nr = pp.nrOfParagraphs(); + + // the paragraph with the lowest number, must not be the first, maybe it + // is only an Image... + for (int i = 0; i < nr; i++) { + Paragraph p = pp.getParagraph(i); + + // get the Text from the paragraph + SpanManager ptext = new SpanManager(p.getText()); + List<Span> delete = new ArrayList<>(); + ptext.manageList(delete); + + // getting the spans to remove from the text, for templates + List<Template> tl = p.getTemplates(); + for (int j = tl.size() - 1; j >= 0; j--) { + delete.add(tl.get(j).getPos()); + } + + // getting the spans to remove from the text, for Tags + List<Span> sl = p.getFormatSpans(FormatType.TAG); + for (int j = sl.size() - 1; j >= 0; j--) { + delete.add(sl.get(j)); + } + + // getting the spans to remove from the text, for image text + if (showImageText) { + List<Link> ll = p.getLinks(Link.type.IMAGE); + for (int j = ll.size() - 1; j >= 0; j--) { + delete.add(ll.get(j).getPos()); + } + } + + // delete the spans in reverse order, the spans are managed, so + // there is no need to sort them + for (int j = delete.size() - 1; j >= 0; j--) { + ptext.delete(delete.remove(j)); + } + + // removing line separators if exist, so the result can be trimmed + // in the next step + int pos = ptext.indexOf(lineSeparator); + while (pos != -1) { + ptext.delete(pos, pos + lineSeparator.length()); + pos = ptext.indexOf(lineSeparator); + } + + // if the result is not an empty string, we got the number of the + // first paragraph + if (!ptext.toString().trim().equals("")) { + pp.setFirstParagraphNr(i); + return; + } + } } - } - - /** - * Container for all the Parameters needed in the parseing process - */ - class ContentElementParsingParameters { - final List<Span> noWikiSpans; - final List<String> noWikiStrings; - final List<Span> linkSpans; - final List<Link> links; - final List<Span> templateSpans; - final List<ResolvedTemplate> templates; - final List<Span> tagSpans; - final List<Span> mathSpans; - final List<String> mathStrings; - - ContentElementParsingParameters() { - noWikiSpans = new ArrayList<>(); - noWikiStrings = new ArrayList<>(); - linkSpans = new ArrayList<>(); - links = new ArrayList<>(); - templateSpans = new ArrayList<>(); - templates = new ArrayList<>(); - tagSpans = new ArrayList<>(); - mathSpans = new ArrayList<>(); - mathStrings = new ArrayList<>(); + + /** + * Container for all the Parameters needed in the parseing process + */ + class ContentElementParsingParameters + { + final List<Span> noWikiSpans; + final List<String> noWikiStrings; + final List<Span> linkSpans; + final List<Link> links; + final List<Span> templateSpans; + final List<ResolvedTemplate> templates; + final List<Span> tagSpans; + final List<Span> mathSpans; + final List<String> 
mathStrings; + + ContentElementParsingParameters() + { + noWikiSpans = new ArrayList<>(); + noWikiStrings = new ArrayList<>(); + linkSpans = new ArrayList<>(); + links = new ArrayList<>(); + templateSpans = new ArrayList<>(); + templates = new ArrayList<>(); + tagSpans = new ArrayList<>(); + mathSpans = new ArrayList<>(); + mathStrings = new ArrayList<>(); + } } - } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ParserConstants.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ParserConstants.java index 9c7f0f57..29bd2313 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ParserConstants.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ParserConstants.java @@ -17,11 +17,12 @@ */ package org.dkpro.jwpl.parser.mediawiki; -public interface ParserConstants { +public interface ParserConstants +{ - /** - * Shortcut for System.getProperty("line.separator"). - */ - String LF = System.getProperty("line.separator"); + /** + * Shortcut for System.getProperty("line.separator"). + */ + String LF = System.getProperty("line.separator"); } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ResolvedTemplate.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ResolvedTemplate.java index 9ebabd6b..83fda87a 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ResolvedTemplate.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ResolvedTemplate.java @@ -19,95 +19,105 @@ import org.dkpro.jwpl.parser.Template; -public class ResolvedTemplate { +public class ResolvedTemplate +{ - public final static String TEMPLATESPACER = "(TEMPLATE)"; + public final static String TEMPLATESPACER = "(TEMPLATE)"; - private final Template template; - private String preParseReplacement; - private String postParseReplacement; + private final Template template; + private String preParseReplacement; + private String postParseReplacement; - /** - * is the Object which the Template Parser has been parsed, and will be - * integrated by the ContentElementParseing process. <br> - * If parsedObject == null, the template will be discarded... - */ - private Object parsedObject; + /** + * is the Object which the Template Parser has been parsed, and will be integrated by the + * ContentElementParseing process. <br> + * If parsedObject == null, the template will be discarded... + */ + private Object parsedObject; - /** - * Creates a new ResolvedTemplate linked to the original template. - * - * @param template the original template - */ - public ResolvedTemplate(Template template) { - this.template = template; - this.postParseReplacement = ""; - checkPreParseReplacement(); - } + /** + * Creates a new ResolvedTemplate linked to the original template. + * + * @param template + * the original template + */ + public ResolvedTemplate(Template template) + { + this.template = template; + this.postParseReplacement = ""; + checkPreParseReplacement(); + } - private void checkPreParseReplacement() { - if (preParseReplacement == null || preParseReplacement.length() == 0) { - preParseReplacement = TEMPLATESPACER; + private void checkPreParseReplacement() + { + if (preParseReplacement == null || preParseReplacement.length() == 0) { + preParseReplacement = TEMPLATESPACER; + } } - } - /** - * Will be called by the parser after the parsing process and will replace - * the TEXT which is within the bounds of the original template src. 
<br> - * If NULL is returned, the parser won't do anything. - */ - public String getPostParseReplacement() { - return postParseReplacement; - } + /** + * Will be called by the parser after the parsing process and will replace the TEXT which is + * within the bounds of the original template src. <br> + * If NULL is returned, the parser won't do anything. + */ + public String getPostParseReplacement() + { + return postParseReplacement; + } - /** - * Look at getPostParseReplacement... - */ - public void setPostParseReplacement(String postParseReplacement) { - this.postParseReplacement = postParseReplacement; - } + /** + * Look at getPostParseReplacement... + */ + public void setPostParseReplacement(String postParseReplacement) + { + this.postParseReplacement = postParseReplacement; + } - /** - * will be called by the parser before the Parsing process and replaces the original - * template code. MediaWiki code which is returned here, will be parsed.<br> - * length() > 0 ! empty stings would not be accepted. - */ - public String getPreParseReplacement() { - return preParseReplacement; - } + /** + * will be called by the parser before the Parsing process and replaces the original template + * code. MediaWiki code which is returned here, will be parsed.<br> + * length() > 0 ! empty stings would not be accepted. + */ + public String getPreParseReplacement() + { + return preParseReplacement; + } - /** - * Look at getPreParseReplacement... - */ - public void setPreParseReplacement(String preParseReplacement) { - this.preParseReplacement = preParseReplacement; - checkPreParseReplacement(); - } + /** + * Look at getPreParseReplacement... + */ + public void setPreParseReplacement(String preParseReplacement) + { + this.preParseReplacement = preParseReplacement; + checkPreParseReplacement(); + } - /** - * In case of an Error the Parser will use the Original Template - * as parsed object. - */ - public Template getTemplate() { - return template; - } + /** + * In case of an Error the Parser will use the Original Template as parsed object. + */ + public Template getTemplate() + { + return template; + } - /** - * Returns the Object which is representative for the Template Code. - * It can be a Template or any object the parser knows.<br> - * If the Template is e.g. a Link the Link will be returned here. - */ - public Object getParsedObject() { - return parsedObject; - } + /** + * Returns the Object which is representative for the Template Code. It can be a Template or any + * object the parser knows.<br> + * If the Template is e.g. a Link the Link will be returned here. + */ + public Object getParsedObject() + { + return parsedObject; + } - /** - * Look at getParsedObject for Details. - * - * @param parsedObject - */ - public void setParsedObject(Object parsedObject) { - this.parsedObject = parsedObject; - } + /** + * Look at getParsedObject for Details. 
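+     * Typically set by a MediaWikiTemplateParser implementation, e.g. to a Template or to a Link.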
+ * + * @param parsedObject + */ + public void setParsedObject(Object parsedObject) + { + this.parsedObject = parsedObject; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ShowTemplateNamesAndParameters.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ShowTemplateNamesAndParameters.java index 2be25ce0..045b252e 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ShowTemplateNamesAndParameters.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/ShowTemplateNamesAndParameters.java @@ -21,34 +21,38 @@ import org.dkpro.jwpl.parser.Template; /** - * This TemplateParser simply shows the name of the Template with all - * parameters, without any exception. + * This TemplateParser simply shows the name of the Template with all parameters, without any + * exception. */ -public class ShowTemplateNamesAndParameters implements MediaWikiTemplateParser { +public class ShowTemplateNamesAndParameters + implements MediaWikiTemplateParser +{ - private final String templatePrefix = "TEMPLATE["; - private final String templatePostfix = "]"; - private final String parameterDivisor = ", "; + private final String templatePrefix = "TEMPLATE["; + private final String templatePostfix = "]"; + private final String parameterDivisor = ", "; - public ResolvedTemplate parseTemplate(Template t, ParsedPage pp) { - ResolvedTemplate result = new ResolvedTemplate(t); - result.setPreParseReplacement(ResolvedTemplate.TEMPLATESPACER); + public ResolvedTemplate parseTemplate(Template t, ParsedPage pp) + { + ResolvedTemplate result = new ResolvedTemplate(t); + result.setPreParseReplacement(ResolvedTemplate.TEMPLATESPACER); - StringBuilder sb = new StringBuilder(); - sb.append(templatePrefix); - sb.append(t.getName() + parameterDivisor); - for (String s : t.getParameters()) { - sb.append(s + parameterDivisor); - } - sb.delete(sb.length() - parameterDivisor.length(), sb.length()); - sb.append(templatePostfix); - result.setPostParseReplacement(sb.toString()); + StringBuilder sb = new StringBuilder(); + sb.append(templatePrefix); + sb.append(t.getName() + parameterDivisor); + for (String s : t.getParameters()) { + sb.append(s + parameterDivisor); + } + sb.delete(sb.length() - parameterDivisor.length(), sb.length()); + sb.append(templatePostfix); + result.setPostParseReplacement(sb.toString()); - result.setParsedObject(t); - return result; - } + result.setParsedObject(t); + return result; + } - public String configurationInfo() { - return "shows the Template names and all parameters"; - } + public String configurationInfo() + { + return "shows the Template names and all parameters"; + } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SpanManager.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SpanManager.java index 49b11c95..dfa4550d 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SpanManager.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SpanManager.java @@ -22,240 +22,277 @@ import org.dkpro.jwpl.parser.Span; - /** - * A Class which manages Spans which are related to a StringBuilder. - * With the SpanManager it is possible to work on a String (delete, insert, replace) - * with no need to adjust the Spans related to the StringBuilder manually. + * A Class which manages Spans which are related to a StringBuilder. 
With the SpanManager it is
+ * possible to work on a String (delete, insert, replace) with no need to adjust the Spans related
+ * to the StringBuilder manually.
  */
-public class SpanManager implements CharSequence {
-
-  private final StringBuilder sb;
-  private final List<List<Span>> managedLists;
-
-  private List<Integer> ib;
-  private boolean calculateSrcPositions;
-
-  /**
-   * Creates a new SpanManager with src as base.
-   *
-   * @param src
-   */
-  public SpanManager(String src) {
-    sb = new StringBuilder(src);
-    managedLists = new ArrayList<>();
-    calculateSrcPositions = false;
-  }
-
-  /**
-   * Enables the Calculation of Src Position. The base for these position
-   * will be the aktual, not the initial, String wich is uses as Base for
-   * the SpanManager.
-   */
-  public void enableSrcPosCalculation() {
-    calculateSrcPositions = true;
-    final int len = sb.length();
-    ib = new ArrayList<>(len);
-    for (int i = 0; i < len; i++) ib.add(i);
-  }
-
-  /**
-   * Retruns a SrcPos for the index of the aktual SpanManager base.
-   *
-   * @return the Position the index has, when enableSrcPosCaulation() has been called,
-   * or -1 if it is not possible.
-   */
-  public int getSrcPos(int index) {
-    if (calculateSrcPositions) {
-      return ib.get(index);
-    } else {
-      System.err.println("SrcSpanCalculation not enabled!");
-      return -1;
+public class SpanManager
+    implements CharSequence
+{
+
+    private final StringBuilder sb;
+    private final List<List<Span>> managedLists;
+
+    private List<Integer> ib;
+    private boolean calculateSrcPositions;
+
+    /**
+     * Creates a new SpanManager with src as base.
+     *
+     * @param src
+     */
+    public SpanManager(String src)
+    {
+        sb = new StringBuilder(src);
+        managedLists = new ArrayList<>();
+        calculateSrcPositions = false;
+    }
+
+    /**
+     * Enables the calculation of source positions. The base for these positions will be the
+     * actual, not the initial, String which is used as the base for the SpanManager.
+     */
+    public void enableSrcPosCalculation()
+    {
+        calculateSrcPositions = true;
+        final int len = sb.length();
+        ib = new ArrayList<>(len);
+        for (int i = 0; i < len; i++)
+            ib.add(i);
+    }
+
+    /**
+     * Returns a SrcPos for the index of the actual SpanManager base.
+     *
+     * @return the position the index had when enableSrcPosCalculation() was called, or -1 if it
+     *         is not possible.
+     */
+    public int getSrcPos(int index)
+    {
+        if (calculateSrcPositions) {
+            return ib.get(index);
+        }
+        else {
+            System.err.println("SrcSpanCalculation not enabled!");
+            return -1;
+        }
+    }
+
+    /**
+     * Adds a List of Spans, which should be managed.
+     */
+    public void manageList(List<Span> spans)
+    {
+        managedLists.add(spans);
     }
-  }
-
-  /**
-   * Adds a List of Spans, which should be managed.
-   */
-  public void manageList(List<Span> spans) {
-    managedLists.add(spans);
-  }
-
-  /**
-   * Removes a List of Spans (not the Spans in the List), which shouldn�t be managed anymore.
-   *
-   * @param spans
-   */
-  public void removeManagedList(List<Span> spans) {
-    final Span listIdentifer = new Span(Integer.MAX_VALUE, Integer.MIN_VALUE);
-    spans.add(listIdentifer);
-    managedLists.remove(spans);
-    spans.remove(listIdentifer);
-  }
-
-  private void adjustLists(int offset, int n) {
-    for (List<Span> list : managedLists)
-      for (Span s : list) s.adjust(offset, n);
-  }
-
-  /**
-   * Deletes the content between s.getStart() (included) and s.getEnd() (excluded).
-   */
-  public SpanManager delete(Span s) {
-    return delete(s.getStart(), s.getEnd());
-  }
-
-  /**
-   * Deletes the content between start (included) and end (excluded).
-   */
-  public SpanManager delete(int start, int end) {
-    sb.delete(start, end);
-    adjustLists(start, start - end);
-
-    if (calculateSrcPositions) for (int i = 0; i < end - start; i++) ib.remove(start);
-
-    return this;
-  }
-
-  /**
-   * Insterts a String at the position offset.
-   */
-  public SpanManager insert(int offset, String str) {
-    sb.insert(offset, str);
-    adjustLists(offset, str.length());
-
-    if (calculateSrcPositions) for (int i = 0; i < str.length(); i++) ib.add(offset, -1);
-
-    return this;
-  }
-
-  /**
-   * Replaces the content between s.getStart() (included) and s.getEnd() (excluded) with
-   * a String
-   */
-  public SpanManager replace(Span s, String str) {
-    return replace(s.getStart(), s.getEnd(), str);
-  }
-
-  /**
-   * Replaces the content between start (included) and end (excluded) with a String
-   */
-  public SpanManager replace(int start, int end, String str) {
-    sb.replace(start, end, str);
-
-    if (calculateSrcPositions) {
-      for (int i = 0; i < end - start; i++) ib.remove(start);
-      for (int i = 0; i < str.length(); i++) ib.add(start, -1);
+
+    /**
+     * Removes a List of Spans (not the Spans in the List), which shouldn't be managed anymore.
+     *
+     * @param spans
+     */
+    public void removeManagedList(List<Span> spans)
+    {
+        final Span listIdentifer = new Span(Integer.MAX_VALUE, Integer.MIN_VALUE);
+        spans.add(listIdentifer);
+        managedLists.remove(spans);
+        spans.remove(listIdentifer);
+    }
+
+    private void adjustLists(int offset, int n)
+    {
+        for (List<Span> list : managedLists)
+            for (Span s : list)
+                s.adjust(offset, n);
     }
 
-    adjustLists(start, str.length() - (end - start));
-    return this;
-  }
+
+    /**
+     * Deletes the content between s.getStart() (included) and s.getEnd() (excluded).
+     */
+    public SpanManager delete(Span s)
+    {
+        return delete(s.getStart(), s.getEnd());
+    }
+
+    /**
+     * Deletes the content between start (included) and end (excluded).
+     */
+    public SpanManager delete(int start, int end)
+    {
+        sb.delete(start, end);
+        adjustLists(start, start - end);
+
+        if (calculateSrcPositions)
+            for (int i = 0; i < end - start; i++)
+                ib.remove(start);
+
+        return this;
+    }
+
+    /**
+     * Inserts a String at the position offset.
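+     * All managed Spans are adjusted automatically so that they keep referring to the same
+     * characters of the underlying text.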
+ */ + public SpanManager insert(int offset, String str) + { + sb.insert(offset, str); + adjustLists(offset, str.length()); + + if (calculateSrcPositions) + for (int i = 0; i < str.length(); i++) + ib.add(offset, -1); + + return this; + } + + /** + * Replaces the content between s.getStart() (included) and s.getEnd() (excluded) with a String + */ + public SpanManager replace(Span s, String str) + { + return replace(s.getStart(), s.getEnd(), str); + } - public int indexOf(String str) { - return this.indexOf(str, 0); - } + /** + * Replaces the content between start (included) and end (excluded) with a String + */ + public SpanManager replace(int start, int end, String str) + { + sb.replace(start, end, str); + + if (calculateSrcPositions) { + for (int i = 0; i < end - start; i++) + ib.remove(start); + for (int i = 0; i < str.length(); i++) + ib.add(start, -1); + } - public int indexOf(String str, int fromIndex) { - return sb.indexOf(str, fromIndex); - } + adjustLists(start, str.length() - (end - start)); + return this; + } - public int indexOf(String str, Span s) { - return indexOf(str, s.getStart(), s.getEnd()); - } + public int indexOf(String str) + { + return this.indexOf(str, 0); + } - public int indexOf(String str, int fromIndex, int toIndex) { - int result = sb.indexOf(str, fromIndex); - if (result >= toIndex) return -1; - return result; - } + public int indexOf(String str, int fromIndex) + { + return sb.indexOf(str, fromIndex); + } - public String substring(int start) { - if (start < 0) { - start = 0; + public int indexOf(String str, Span s) + { + return indexOf(str, s.getStart(), s.getEnd()); } - return this.sb.substring(start); - } - public String substring(int start, int end) { - if (start < 0) { - start = 0; + public int indexOf(String str, int fromIndex, int toIndex) + { + int result = sb.indexOf(str, fromIndex); + if (result >= toIndex) + return -1; + return result; } - if (start > end) { - return ""; + + public String substring(int start) + { + if (start < 0) { + start = 0; + } + return this.sb.substring(start); } - return sb.substring(start, end); - } + public String substring(int start, int end) + { + if (start < 0) { + start = 0; + } + if (start > end) { + return ""; + } - public String substring(Span s) { - if (s.getStart() < s.getEnd()) { - return sb.substring(s.getStart(), s.getEnd()); - } else { - return ""; + return sb.substring(start, end); } - } - - /** - * <font color="#ff0000">This function is not implemented !!!</font> - */ - public CharSequence subSequence(int start, int end) { - //TODO Implementieren - System.err.println("CharSequence subSequence(int start, int end)\nSorry, not Implemented"); - sb.charAt(-1); //causes an error - return null; - } - - public int length() { - return sb.length(); - } - - public SpanManager setCharAt(int index, char c) { - sb.setCharAt(index, c); - if (calculateSrcPositions) ib.set(index, -1); - return this; - } - - public char charAt(int index) { - return sb.charAt(index); - } - - @Override - public String toString() { - return sb.toString(); - } - - /** - * Returnes some information about the content of the SpanManager an it�s manages - * Spans - */ - public String info() { - StringBuilder result = new StringBuilder(); - - result.append("\n-=SPANMANAGER=----------------------------------------------------------------\n"); - - result.append("TEXT:"); - result.append("\"" + sb + "\""); - result.append("\n"); - - result.append("\nMANAGED SPAN LISTS:"); - if (managedLists.isEmpty()) - result.append(" NONE\n"); - else { - 
result.append("\n");
-      for (int k = 0; k < managedLists.size(); k++) {
-        List<Span> sl = managedLists.get(k);
-        result.append("{");
-        if (sl.size() != 0) {
-          for (int i = 1; i < sl.size() - 1; i++) result.append(sl.get(i) + ", ");
-          result.append(sl.get(sl.size() - 1));
-        }
-        result.append("}\n");
-      }
-    }
-
-    result.append("------------------------------------------------------------------------------");
-
-    return result.toString();
-  }
+    public String substring(Span s)
+    {
+        if (s.getStart() < s.getEnd()) {
+            return sb.substring(s.getStart(), s.getEnd());
+        }
+        else {
+            return "";
+        }
+    }
+
+    /**
+     * <font color="#ff0000">This function is not implemented !!!</font>
+     */
+    public CharSequence subSequence(int start, int end)
+    {
+        // TODO Implement
+        System.err.println("CharSequence subSequence(int start, int end)\nSorry, not Implemented");
+        sb.charAt(-1); // causes an error
+        return null;
+    }
+
+    public int length()
+    {
+        return sb.length();
+    }
+
+    public SpanManager setCharAt(int index, char c)
+    {
+        sb.setCharAt(index, c);
+        if (calculateSrcPositions)
+            ib.set(index, -1);
+        return this;
+    }
+
+    public char charAt(int index)
+    {
+        return sb.charAt(index);
+    }
+
+    @Override
+    public String toString()
+    {
+        return sb.toString();
+    }
+
+    /**
+     * Returns some information about the content of the SpanManager and its managed Spans.
+     */
+    public String info()
+    {
+        StringBuilder result = new StringBuilder();
+
+        result.append(
+                "\n-=SPANMANAGER=----------------------------------------------------------------\n");
+
+        result.append("TEXT:");
+        result.append("\"" + sb + "\"");
+        result.append("\n");
+
+        result.append("\nMANAGED SPAN LISTS:");
+        if (managedLists.isEmpty())
+            result.append(" NONE\n");
+        else {
+            result.append("\n");
+            for (int k = 0; k < managedLists.size(); k++) {
+                List<Span> sl = managedLists.get(k);
+                result.append("{");
+                if (sl.size() != 0) {
+                    for (int i = 1; i < sl.size() - 1; i++)
+                        result.append(sl.get(i) + ", ");
+                    result.append(sl.get(sl.size() - 1));
+                }
+                result.append("}\n");
+            }
+        }
+
+        result.append(
+                "------------------------------------------------------------------------------");
+
+        return result.toString();
+    }
 }
diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SrcPosRangeChecker.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SrcPosRangeChecker.java
index 327f91fc..328d38ea 100644
--- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SrcPosRangeChecker.java
+++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/mediawiki/SrcPosRangeChecker.java
@@ -38,124 +38,142 @@
 import org.dkpro.jwpl.parser.Template;
 
 /**
- * Checks the Range of the SrcSpans of a ParsedPage, so it isn't possible
- * that e.g. a ContentElement conatins a Link which isn't in the Range of
- * this ContentElement. This must be done because some positons will be
- * jammed by the parsing process, e.g. if a Link is the start of a Paragrah.
+ * Checks the Range of the SrcSpans of a ParsedPage, so it isn't possible that e.g. a ContentElement
+ * contains a Link which isn't in the Range of this ContentElement. This must be done because some
+ * positions will be jammed by the parsing process, e.g. if a Link is the start of a Paragraph.
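+ * In such a case the parent's SrcSpan is widened so that it also covers the SrcSpans of all of
+ * its child elements.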
*/ -public class SrcPosRangeChecker { - - public static void checkRange(ParsedPage pp) { - for (Section s : pp.getSections()) { - if (s.getClass() == SectionContent.class) - checkRange((SectionContent) s); - else - checkRange((SectionContainer) s); +public class SrcPosRangeChecker +{ + + public static void checkRange(ParsedPage pp) + { + for (Section s : pp.getSections()) { + if (s.getClass() == SectionContent.class) + checkRange((SectionContent) s); + else + checkRange((SectionContainer) s); + } } - } - private static void checkRange(SectionContainer sc) { - if (sc.getTitleElement() != null) - checkRange(sc.getTitleElement()); - - for (Section s : sc.getSubSections()) { - if (s.getClass() == SectionContent.class) - checkRange((SectionContent) s); - else - checkRange((SectionContainer) s); + private static void checkRange(SectionContainer sc) + { + if (sc.getTitleElement() != null) + checkRange(sc.getTitleElement()); + + for (Section s : sc.getSubSections()) { + if (s.getClass() == SectionContent.class) + checkRange((SectionContent) s); + else + checkRange((SectionContainer) s); + } } - } - private static void checkRange(SectionContent s) { - List<SrcSpan> eil = new ArrayList<>(); + private static void checkRange(SectionContent s) + { + List<SrcSpan> eil = new ArrayList<>(); - if (s.getTitleElement() != null) { - checkRange(s.getTitleElement()); - eil.add(s.getTitleElement().getSrcSpan()); - } + if (s.getTitleElement() != null) { + checkRange(s.getTitleElement()); + eil.add(s.getTitleElement().getSrcSpan()); + } - for (Paragraph p : s.getParagraphs()) { - checkRange(p); - eil.add(p.getSrcSpan()); - } + for (Paragraph p : s.getParagraphs()) { + checkRange(p); + eil.add(p.getSrcSpan()); + } - for (DefinitionList dl : s.getDefinitionLists()) { - checkRange(dl); - eil.add(dl.getSrcSpan()); - } + for (DefinitionList dl : s.getDefinitionLists()) { + checkRange(dl); + eil.add(dl.getSrcSpan()); + } - for (NestedListContainer nl : s.getNestedLists()) { - checkRange(nl); - eil.add(nl.getSrcSpan()); - } + for (NestedListContainer nl : s.getNestedLists()) { + checkRange(nl); + eil.add(nl.getSrcSpan()); + } - for (Table t : s.getTables()) { - checkRange(t); - eil.add(t.getSrcSpan()); - } + for (Table t : s.getTables()) { + checkRange(t); + eil.add(t.getSrcSpan()); + } - s.setSrcSpan(getEvalInfo(s.getSrcSpan(), eil)); - } + s.setSrcSpan(getEvalInfo(s.getSrcSpan(), eil)); + } - private static void checkRange(DefinitionList dl) { + private static void checkRange(DefinitionList dl) + { - } + } - private static void checkRange(NestedListContainer nlc) { - for (NestedList nl : nlc.getNestedLists()) { - if (nl.getClass() == NestedListContainer.class) - checkRange((NestedListContainer) nl); - else - checkRange((ContentElement) nl); + private static void checkRange(NestedListContainer nlc) + { + for (NestedList nl : nlc.getNestedLists()) { + if (nl.getClass() == NestedListContainer.class) + checkRange((NestedListContainer) nl); + else + checkRange((ContentElement) nl); + } } - } - private static void checkRange(Table t) { - List<SrcSpan> eil = new ArrayList<>(); + private static void checkRange(Table t) + { + List<SrcSpan> eil = new ArrayList<>(); + + for (int i = 0; i < t.nrOfTableElements(); i++) { + TableElement te = t.getTableElement(i); + checkRange(te); + eil.add(te.getSrcSpan()); + } - for (int i = 0; i < t.nrOfTableElements(); i++) { - TableElement te = t.getTableElement(i); - checkRange(te); - eil.add(te.getSrcSpan()); + t.setSrcSpan(getEvalInfo(t.getSrcSpan(), eil)); } - 
t.setSrcSpan(getEvalInfo(t.getSrcSpan(), eil)); - } + private static void checkRange(TableElement te) + { + List<SrcSpan> eil = new ArrayList<>(); - private static void checkRange(TableElement te) { - List<SrcSpan> eil = new ArrayList<>(); + for (Section s : te.getSubSections()) { + if (s.getClass() == SectionContent.class) + checkRange((SectionContent) s); + else + checkRange((SectionContainer) s); + } + + te.setSrcSpan(getEvalInfo(te.getSrcSpan(), eil)); + } - for (Section s : te.getSubSections()) { - if (s.getClass() == SectionContent.class) - checkRange((SectionContent) s); - else - checkRange((SectionContainer) s); + private static void checkRange(ContentElement ce) + { + List<SrcSpan> eil = new ArrayList<>(); + for (Span s : ce.getFormatSpans(FormatType.BOLD)) + eil.add(s.getSrcSpan()); + for (Span s : ce.getFormatSpans(FormatType.ITALIC)) + eil.add(s.getSrcSpan()); + for (Span s : ce.getFormatSpans(FormatType.MATH)) + eil.add(s.getSrcSpan()); + for (Span s : ce.getFormatSpans(FormatType.TAG)) + eil.add(s.getSrcSpan()); + for (Span s : ce.getFormatSpans(FormatType.NOWIKI)) + eil.add(s.getSrcSpan()); + for (Link l : ce.getLinks()) + eil.add(l.getSrcSpan()); + for (Template t : ce.getTemplates()) + eil.add(t.getSrcSpan()); + + ce.setSrcSpan(getEvalInfo(ce.getSrcSpan(), eil)); } - te.setSrcSpan(getEvalInfo(te.getSrcSpan(), eil)); - } - - private static void checkRange(ContentElement ce) { - List<SrcSpan> eil = new ArrayList<>(); - for (Span s : ce.getFormatSpans(FormatType.BOLD)) eil.add(s.getSrcSpan()); - for (Span s : ce.getFormatSpans(FormatType.ITALIC)) eil.add(s.getSrcSpan()); - for (Span s : ce.getFormatSpans(FormatType.MATH)) eil.add(s.getSrcSpan()); - for (Span s : ce.getFormatSpans(FormatType.TAG)) eil.add(s.getSrcSpan()); - for (Span s : ce.getFormatSpans(FormatType.NOWIKI)) eil.add(s.getSrcSpan()); - for (Link l : ce.getLinks()) eil.add(l.getSrcSpan()); - for (Template t : ce.getTemplates()) eil.add(t.getSrcSpan()); - - ce.setSrcSpan(getEvalInfo(ce.getSrcSpan(), eil)); - } - - private static SrcSpan getEvalInfo(SrcSpan e, List<SrcSpan> eil) { - int start = e.getStart(); - int end = e.getEnd(); - - for (SrcSpan ei : eil) { - if (start == -1 || (start > ei.getStart() && ei.getStart() != -1)) start = ei.getStart(); - if (end < ei.getEnd()) end = ei.getEnd(); + private static SrcSpan getEvalInfo(SrcSpan e, List<SrcSpan> eil) + { + int start = e.getStart(); + int end = e.getEnd(); + + for (SrcSpan ei : eil) { + if (start == -1 || (start > ei.getStart() && ei.getStart() != -1)) + start = ei.getStart(); + if (end < ei.getEnd()) + end = ei.getEnd(); + } + return new SrcSpan(start, end); } - return new SrcSpan(start, end); - } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/ConfigLoader.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/ConfigLoader.java index 939abe0f..e8915cb7 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/ConfigLoader.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/ConfigLoader.java @@ -25,92 +25,120 @@ import org.xml.sax.Attributes; import org.xml.sax.helpers.DefaultHandler; -class ConfigLoader extends DefaultHandler { - final SelectiveAccessHandler sah; +class ConfigLoader + extends DefaultHandler +{ + final SelectiveAccessHandler sah; - private EnumMap<CIT, Boolean> citm; - private EnumMap<SIT, EnumMap<CIT, Boolean>> sitm; - private Attributes secatt; + private EnumMap<CIT, Boolean> citm; + private EnumMap<SIT, EnumMap<CIT, 
Boolean>> sitm; + private Attributes secatt; - private Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>>> sectionHandling; + private Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>>> sectionHandling; - public ConfigLoader(SelectiveAccessHandler sah) { - this.sah = sah; - } + public ConfigLoader(SelectiveAccessHandler sah) + { + this.sah = sah; + } - public void startElement(String uri, String localName, String qName, Attributes att) { - if (localName.equalsIgnoreCase("cit")) { - citm = SelectiveAccessHandler.buildCITMap( - "true".equalsIgnoreCase(att.getValue("text")), - "true".equalsIgnoreCase(att.getValue("bold")), - "true".equalsIgnoreCase(att.getValue("italic")), - "true".equalsIgnoreCase(att.getValue("link")) - ); - } else if (localName.equalsIgnoreCase("section")) { - sitm = new EnumMap<>(SIT.class); - secatt = att; - } else if (localName.equalsIgnoreCase(SIT.SUBS.toString())) { - citm = null; - } else if (localName.equalsIgnoreCase(SIT.TITLE.toString())) { - citm = null; - } else if (localName.equalsIgnoreCase(SIT.DEFLIST.toString())) { - citm = null; - } else if (localName.equalsIgnoreCase(SIT.TABLE.toString())) { - citm = null; - } else if (localName.equalsIgnoreCase(SIT.NESTLIST.toString())) { - citm = null; - } else if (localName.equalsIgnoreCase(SIT.PARA.toString())) { - citm = null; - } else if (localName.equalsIgnoreCase("page")) { - citm = null; - } else if (localName.equalsIgnoreCase("firstParagraph")) { - citm = null; - } else if (localName.equalsIgnoreCase("SelectiveAccessHandlerConfig")) { - sah.setPageHandling(null); - sah.setFirstParagraphHandling(null); - sectionHandling = sah.getSectionHandling(); - sectionHandling.clear(); - } else { - System.err.println("UnhandledElement: " + localName); + public void startElement(String uri, String localName, String qName, Attributes att) + { + if (localName.equalsIgnoreCase("cit")) { + citm = SelectiveAccessHandler.buildCITMap("true".equalsIgnoreCase(att.getValue("text")), + "true".equalsIgnoreCase(att.getValue("bold")), + "true".equalsIgnoreCase(att.getValue("italic")), + "true".equalsIgnoreCase(att.getValue("link"))); + } + else if (localName.equalsIgnoreCase("section")) { + sitm = new EnumMap<>(SIT.class); + secatt = att; + } + else if (localName.equalsIgnoreCase(SIT.SUBS.toString())) { + citm = null; + } + else if (localName.equalsIgnoreCase(SIT.TITLE.toString())) { + citm = null; + } + else if (localName.equalsIgnoreCase(SIT.DEFLIST.toString())) { + citm = null; + } + else if (localName.equalsIgnoreCase(SIT.TABLE.toString())) { + citm = null; + } + else if (localName.equalsIgnoreCase(SIT.NESTLIST.toString())) { + citm = null; + } + else if (localName.equalsIgnoreCase(SIT.PARA.toString())) { + citm = null; + } + else if (localName.equalsIgnoreCase("page")) { + citm = null; + } + else if (localName.equalsIgnoreCase("firstParagraph")) { + citm = null; + } + else if (localName.equalsIgnoreCase("SelectiveAccessHandlerConfig")) { + sah.setPageHandling(null); + sah.setFirstParagraphHandling(null); + sectionHandling = sah.getSectionHandling(); + sectionHandling.clear(); + } + else { + System.err.println("UnhandledElement: " + localName); + } } - } - public void endElement(String uri, String localName, String qName) { - if (localName.equalsIgnoreCase("cit")) { - // do nothing... - } else if (localName.equalsIgnoreCase("section")) { - String name = secatt.getValue("name"); + public void endElement(String uri, String localName, String qName) + { + if (localName.equalsIgnoreCase("cit")) { + // do nothing... 
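+            // the CIT map built in startElement() is picked up when the enclosing element
+            // (e.g. title, para, page or firstParagraph) is closed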
+ } + else if (localName.equalsIgnoreCase("section")) { + String name = secatt.getValue("name"); - if (name != null) - if (name.startsWith(SelectiveAccessHandler.SectionType.DEFAULT_SECTION.toString()) || - name.startsWith(SelectiveAccessHandler.SectionType.SECTION_LEVEL.toString()) || - name.startsWith(SelectiveAccessHandler.SectionType.USER_SECTION.toString())) - sectionHandling.put(name, sitm); - else - sectionHandling.put(SelectiveAccessHandler.SectionType.USER_SECTION + name, sitm); - else - sah.setDefaultSectionHandling(sitm); + if (name != null) + if (name.startsWith(SelectiveAccessHandler.SectionType.DEFAULT_SECTION.toString()) + || name.startsWith( + SelectiveAccessHandler.SectionType.SECTION_LEVEL.toString()) + || name.startsWith( + SelectiveAccessHandler.SectionType.USER_SECTION.toString())) + sectionHandling.put(name, sitm); + else + sectionHandling.put(SelectiveAccessHandler.SectionType.USER_SECTION + name, + sitm); + else + sah.setDefaultSectionHandling(sitm); - } else if (localName.equalsIgnoreCase(SIT.SUBS.toString())) { - sitm.put(SIT.SUBS, citm); - } else if (localName.equalsIgnoreCase(SIT.TITLE.toString())) { - sitm.put(SIT.TITLE, citm); - } else if (localName.equalsIgnoreCase(SIT.TABLE.toString())) { - sitm.put(SIT.TABLE, citm); - } else if (localName.equalsIgnoreCase(SIT.DEFLIST.toString())) { - sitm.put(SIT.DEFLIST, citm); - } else if (localName.equalsIgnoreCase(SIT.NESTLIST.toString())) { - sitm.put(SIT.NESTLIST, citm); - } else if (localName.equalsIgnoreCase(SIT.PARA.toString())) { - sitm.put(SIT.PARA, citm); - } else if (localName.equalsIgnoreCase("page")) { - sah.setPageHandling(citm); - } else if (localName.equalsIgnoreCase("firstParagraph")) { - sah.setFirstParagraphHandling(citm); - } else if (localName.equalsIgnoreCase("SelectiveAccessHandlerConfig")) { + } + else if (localName.equalsIgnoreCase(SIT.SUBS.toString())) { + sitm.put(SIT.SUBS, citm); + } + else if (localName.equalsIgnoreCase(SIT.TITLE.toString())) { + sitm.put(SIT.TITLE, citm); + } + else if (localName.equalsIgnoreCase(SIT.TABLE.toString())) { + sitm.put(SIT.TABLE, citm); + } + else if (localName.equalsIgnoreCase(SIT.DEFLIST.toString())) { + sitm.put(SIT.DEFLIST, citm); + } + else if (localName.equalsIgnoreCase(SIT.NESTLIST.toString())) { + sitm.put(SIT.NESTLIST, citm); + } + else if (localName.equalsIgnoreCase(SIT.PARA.toString())) { + sitm.put(SIT.PARA, citm); + } + else if (localName.equalsIgnoreCase("page")) { + sah.setPageHandling(citm); + } + else if (localName.equalsIgnoreCase("firstParagraph")) { + sah.setFirstParagraphHandling(citm); + } + else if (localName.equalsIgnoreCase("SelectiveAccessHandlerConfig")) { - } else { - System.err.println("UnhandledElement: " + localName); + } + else { + System.err.println("UnhandledElement: " + localName); + } } - } } diff --git a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/SelectiveAccessHandler.java b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/SelectiveAccessHandler.java index 23f54558..b66635fe 100644 --- a/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/SelectiveAccessHandler.java +++ b/dkpro-jwpl-parser/src/main/java/org/dkpro/jwpl/parser/selectiveaccess/SelectiveAccessHandler.java @@ -45,358 +45,413 @@ /** * Provides access to a ParsedPage at an abstract Level. 
*/ -public class SelectiveAccessHandler { - - enum CIT {TEXT, BOLD, ITALIC, LINK} - - enum SIT {SUBS, TITLE, TABLE, DEFLIST, NESTLIST, PARA} - - protected enum SectionType {DEFAULT_SECTION, SECTION_LEVEL, USER_SECTION} - - private EnumMap<CIT, Boolean> firstParagraphHandling; - private EnumMap<CIT, Boolean> pageHandling; - private Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>>> sectionHandling; - private int levelModifier = 0; - - /** - * Creates an SelectiveAccessHandler... ready to config... - */ - public SelectiveAccessHandler() { - loadConfig(); - } - - /** - * Creates an SelectiveAccessHandler and loads the config from an XMLFile - */ - public SelectiveAccessHandler(String XMLFile) { - loadConfig(XMLFile); - } - - public static EnumMap<CIT, Boolean> buildCITMap(boolean text, boolean bold, boolean italic, boolean link) { - EnumMap<CIT, Boolean> result = new EnumMap<>(CIT.class); - result.put(CIT.TEXT, text); - result.put(CIT.BOLD, bold); - result.put(CIT.ITALIC, italic); - result.put(CIT.LINK, link); - return result; - } - - public static EnumMap<SIT, EnumMap<CIT, Boolean>> buildSITMap(EnumMap<CIT, Boolean> subs, EnumMap<CIT, Boolean> title, EnumMap<CIT, Boolean> table, EnumMap<CIT, Boolean> deflist, EnumMap<CIT, Boolean> nestedlist, EnumMap<CIT, Boolean> paragraph) { - EnumMap<SIT, EnumMap<CIT, Boolean>> result = new EnumMap<>(SIT.class); - result.put(SIT.SUBS, subs); - result.put(SIT.TITLE, title); - result.put(SIT.TABLE, table); - result.put(SIT.DEFLIST, deflist); - result.put(SIT.NESTLIST, nestedlist); - result.put(SIT.PARA, paragraph); - return result; - } - - /** - * if firstParagraphHandling is null, there will be no special handling for the FirstParagraph... - */ - public void setFirstParagraphHandling(EnumMap<CIT, Boolean> firstParagraphHandling) { - this.firstParagraphHandling = firstParagraphHandling; - } - - /** - * if pageHandling is null, there will be no special handling for the WHOLE PAGE, this means, the handling will be sectionwhise... - */ - public void setPageHandling(EnumMap<CIT, Boolean> pageHandling) { - this.pageHandling = pageHandling; - } - - /** - * @return the sectionHandling - */ - public Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>>> getSectionHandling() { - return sectionHandling; - } - - /** - * Be sure to set the Default Section Handling to avoid errors... - */ - public void setSectionHandling(Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>>> sectionHandling) { - this.sectionHandling = sectionHandling; - } - - /** - * adds section handling for a specified relative level... - */ - public void addSectionHandling(int level, EnumMap<SIT, EnumMap<CIT, Boolean>> sh) { - sectionHandling.put(SectionType.SECTION_LEVEL.toString() + level, sh); - } - - /** - * adds section handling for a specila section name... - */ - public void addSectionHandling(String name, EnumMap<SIT, EnumMap<CIT, Boolean>> sh) { - sectionHandling.put(SectionType.USER_SECTION + name.toUpperCase(), sh); - } - - /** - * sets the section handling for all sections which are not set by level or name... 
- */ - public void setDefaultSectionHandling(EnumMap<SIT, EnumMap<CIT, Boolean>> sh) { - sectionHandling.put(SectionType.DEFAULT_SECTION.toString(), sh); - } - - /** - * Returns information which infomations are selected by the actual configuration - */ - public String getSelectionInfo() { - StringBuilder result = new StringBuilder(); - - result.append("SelectionInfo: " + this.getClass() + "\n"); - result.append("Page:" + CITInfo(pageHandling) + "\n"); - result.append("FirstParagraph:" + CITInfo(firstParagraphHandling) + "\n"); - for (String key : sectionHandling.keySet()) { - final String uss = SectionType.USER_SECTION.toString(); - if (key.startsWith(uss)) - result.append(uss + "[" + key.substring(uss.length()) + "]:\n"); - else - result.append(key + ":\n"); - - result.append(SITInfo(sectionHandling.get(key)) + "\n"); +public class SelectiveAccessHandler +{ + + enum CIT + { + TEXT, BOLD, ITALIC, LINK + } + + enum SIT + { + SUBS, TITLE, TABLE, DEFLIST, NESTLIST, PARA + } + + protected enum SectionType + { + DEFAULT_SECTION, SECTION_LEVEL, USER_SECTION + } + + private EnumMap<CIT, Boolean> firstParagraphHandling; + private EnumMap<CIT, Boolean> pageHandling; + private Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>>> sectionHandling; + private int levelModifier = 0; + + /** + * Creates an SelectiveAccessHandler... ready to config... + */ + public SelectiveAccessHandler() + { + loadConfig(); } - return result.toString(); - } - - /** - * Converts a CITMap into a human readable String - */ - public static String CITInfo(EnumMap<CIT, Boolean> hp) { - StringBuilder result = new StringBuilder(); - result.append("["); - if (hp != null) { - for (CIT key : hp.keySet()) - result.append(key.toString() + ":" + hp.get(key) + ", "); - result.delete(result.length() - 2, result.length()); + /** + * Creates an SelectiveAccessHandler and loads the config from an XMLFile + */ + public SelectiveAccessHandler(String XMLFile) + { + loadConfig(XMLFile); } - result.append("]"); - return result.toString(); - } - - /** - * Converts a SITMap into a human readable String - */ - public static String SITInfo(EnumMap<SIT, EnumMap<CIT, Boolean>> shp) { - StringBuilder result = new StringBuilder(); - for (SIT key : shp.keySet()) { - result.append("\t" + key.toString() + ":" + CITInfo(shp.get(key)) + "\n"); + + public static EnumMap<CIT, Boolean> buildCITMap(boolean text, boolean bold, boolean italic, + boolean link) + { + EnumMap<CIT, Boolean> result = new EnumMap<>(CIT.class); + result.put(CIT.TEXT, text); + result.put(CIT.BOLD, bold); + result.put(CIT.ITALIC, italic); + result.put(CIT.LINK, link); + return result; + } + + public static EnumMap<SIT, EnumMap<CIT, Boolean>> buildSITMap(EnumMap<CIT, Boolean> subs, + EnumMap<CIT, Boolean> title, EnumMap<CIT, Boolean> table, EnumMap<CIT, Boolean> deflist, + EnumMap<CIT, Boolean> nestedlist, EnumMap<CIT, Boolean> paragraph) + { + EnumMap<SIT, EnumMap<CIT, Boolean>> result = new EnumMap<>(SIT.class); + result.put(SIT.SUBS, subs); + result.put(SIT.TITLE, title); + result.put(SIT.TABLE, table); + result.put(SIT.DEFLIST, deflist); + result.put(SIT.NESTLIST, nestedlist); + result.put(SIT.PARA, paragraph); + return result; + } + + /** + * if firstParagraphHandling is null, there will be no special handling for the + * FirstParagraph... 
+ */ + public void setFirstParagraphHandling(EnumMap<CIT, Boolean> firstParagraphHandling) + { + this.firstParagraphHandling = firstParagraphHandling; + } + + /** + * if pageHandling is null, there will be no special handling for the WHOLE PAGE, this means, + * the handling will be sectionwhise... + */ + public void setPageHandling(EnumMap<CIT, Boolean> pageHandling) + { + this.pageHandling = pageHandling; } - return result.toString(); - } - private void deleteParagraph(int nr, List<Section> sections) { - int temp = nr; + /** + * @return the sectionHandling + */ + public Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>>> getSectionHandling() + { + return sectionHandling; + } - for (Section s : sections) { - nr = temp; - temp -= s.nrOfParagraphs(); + /** + * Be sure to set the Default Section Handling to avoid errors... + */ + public void setSectionHandling(Map<String, EnumMap<SIT, EnumMap<CIT, Boolean>>> sectionHandling) + { + this.sectionHandling = sectionHandling; + } - if (temp >= 0) continue; + /** + * adds section handling for a specified relative level... + */ + public void addSectionHandling(int level, EnumMap<SIT, EnumMap<CIT, Boolean>> sh) + { + sectionHandling.put(SectionType.SECTION_LEVEL.toString() + level, sh); + } - if (s.getClass() == SectionContainer.class) - deleteParagraph(nr, ((SectionContainer) s).getSubSections()); - else { - SectionContent sc = (SectionContent) s; - sc.removeParagraph(sc.getParagraph(nr)); - } + /** + * adds section handling for a specila section name... + */ + public void addSectionHandling(String name, EnumMap<SIT, EnumMap<CIT, Boolean>> sh) + { + sectionHandling.put(SectionType.USER_SECTION + name.toUpperCase(), sh); + } - break; + /** + * sets the section handling for all sections which are not set by level or name... 
+ */ + public void setDefaultSectionHandling(EnumMap<SIT, EnumMap<CIT, Boolean>> sh) + { + sectionHandling.put(SectionType.DEFAULT_SECTION.toString(), sh); } - } - - /** - * Returns the Information of a ParsedPage which are selected by the actual configuration - */ - public String getSelectedText(ParsedPage pp) { - if (pp == null) return null; - - StringBuilder sb = new StringBuilder(); - - levelModifier = pp.getSection(0).getLevel() - 1; - - if (pageHandling == null) { - if (firstParagraphHandling != null) { - handleContent(pp.getFirstParagraph(), firstParagraphHandling, sb); - deleteParagraph(pp.getFirstParagraphNr(), pp.getSections()); - } - for (Section s : pp.getSections()) - handleSection(s, sb); - } else { - if (pageHandling.get(CIT.TEXT)) { - sb.append(pp.getText()); - } else { - if (pageHandling.get(CIT.BOLD)) { - handleSpans(pp.getFormatSpans(FormatType.BOLD), pp.getText(), sb); + + /** + * Returns information which infomations are selected by the actual configuration + */ + public String getSelectionInfo() + { + StringBuilder result = new StringBuilder(); + + result.append("SelectionInfo: " + this.getClass() + "\n"); + result.append("Page:" + CITInfo(pageHandling) + "\n"); + result.append("FirstParagraph:" + CITInfo(firstParagraphHandling) + "\n"); + for (String key : sectionHandling.keySet()) { + final String uss = SectionType.USER_SECTION.toString(); + if (key.startsWith(uss)) + result.append(uss + "[" + key.substring(uss.length()) + "]:\n"); + else + result.append(key + ":\n"); + + result.append(SITInfo(sectionHandling.get(key)) + "\n"); } - if (pageHandling.get(CIT.ITALIC)) { - handleSpans(pp.getFormatSpans(FormatType.ITALIC), pp.getText(), sb); + + return result.toString(); + } + + /** + * Converts a CITMap into a human readable String + */ + public static String CITInfo(EnumMap<CIT, Boolean> hp) + { + StringBuilder result = new StringBuilder(); + result.append("["); + if (hp != null) { + for (CIT key : hp.keySet()) + result.append(key.toString() + ":" + hp.get(key) + ", "); + result.delete(result.length() - 2, result.length()); } - } + result.append("]"); + return result.toString(); + } - if (pageHandling.get(CIT.LINK)) - handleLinks(pp.getLinks(), !pageHandling.get(CIT.TEXT), sb); + /** + * Converts a SITMap into a human readable String + */ + public static String SITInfo(EnumMap<SIT, EnumMap<CIT, Boolean>> shp) + { + StringBuilder result = new StringBuilder(); + for (SIT key : shp.keySet()) { + result.append("\t" + key.toString() + ":" + CITInfo(shp.get(key)) + "\n"); + } + return result.toString(); } - return sb.toString().trim(); - } - - private static void handleContent(Content c, EnumMap<CIT, Boolean> hp, StringBuilder sb) { - if (hp != null) { - if (hp.get(CIT.TEXT)) - sb.append(c.getText() + " "); - else { - if (hp.get(CIT.BOLD)) - handleSpans(c.getFormatSpans(FormatType.BOLD), c.getText(), sb); - if (hp.get(CIT.ITALIC)) - handleSpans(c.getFormatSpans(FormatType.ITALIC), c.getText(), sb); - } - if (hp.get(CIT.LINK)) - handleLinks(c.getLinks(), !hp.get(CIT.TEXT), sb); + private void deleteParagraph(int nr, List<Section> sections) + { + int temp = nr; + + for (Section s : sections) { + nr = temp; + temp -= s.nrOfParagraphs(); + + if (temp >= 0) + continue; + + if (s.getClass() == SectionContainer.class) + deleteParagraph(nr, ((SectionContainer) s).getSubSections()); + else { + SectionContent sc = (SectionContent) s; + sc.removeParagraph(sc.getParagraph(nr)); + } + + break; + } } - } - private void handleSection(Section s, StringBuilder sb) { - EnumMap<SIT, 
EnumMap<CIT, Boolean>> hp = null; + /** + * Returns the Information of a ParsedPage which are selected by the actual configuration + */ + public String getSelectedText(ParsedPage pp) + { + if (pp == null) + return null; + + StringBuilder sb = new StringBuilder(); + + levelModifier = pp.getSection(0).getLevel() - 1; + + if (pageHandling == null) { + if (firstParagraphHandling != null) { + handleContent(pp.getFirstParagraph(), firstParagraphHandling, sb); + deleteParagraph(pp.getFirstParagraphNr(), pp.getSections()); + } + for (Section s : pp.getSections()) + handleSection(s, sb); + } + else { + if (pageHandling.get(CIT.TEXT)) { + sb.append(pp.getText()); + } + else { + if (pageHandling.get(CIT.BOLD)) { + handleSpans(pp.getFormatSpans(FormatType.BOLD), pp.getText(), sb); + } + if (pageHandling.get(CIT.ITALIC)) { + handleSpans(pp.getFormatSpans(FormatType.ITALIC), pp.getText(), sb); + } + } + + if (pageHandling.get(CIT.LINK)) + handleLinks(pp.getLinks(), !pageHandling.get(CIT.TEXT), sb); + } - if (s.getTitle() != null) hp = sectionHandling.get(SectionType.USER_SECTION + s.getTitle().toUpperCase()); - if (hp == null) hp = sectionHandling.get(SectionType.SECTION_LEVEL.toString() + (s.getLevel() - levelModifier)); - if (hp == null) hp = sectionHandling.get(SectionType.DEFAULT_SECTION.toString()); - if (hp == null) { - System.err.println("Cannot get Handling Parameters for Section:\"" + s.getTitle() + "\" Level:" + s.getLevel()); - return; + return sb.toString().trim(); } - handleContent(s.getTitleElement(), hp.get(SIT.TITLE), sb); - - if (s.getClass() == SectionContainer.class) { - if (hp.get(SIT.SUBS) != null) - handleContent(s, hp.get(SIT.SUBS), sb); - else - for (Section ss : ((SectionContainer) s).getSubSections()) - handleSection(ss, sb); - } else { - EnumMap<CIT, Boolean> hpx; - - hpx = hp.get(SIT.TABLE); - if (hpx != null) - for (Table t : s.getTables()) - handleContent(t, hpx, sb); - - hpx = hp.get(SIT.NESTLIST); - if (hpx != null) - for (NestedList nl : s.getNestedLists()) - handleContent(nl, hpx, sb); - - hpx = hp.get(SIT.PARA); - if (hpx != null) - for (Paragraph p : s.getParagraphs()) - handleContent(p, hpx, sb); - - hpx = hp.get(SIT.DEFLIST); - if (hpx != null) - for (DefinitionList dl : s.getDefinitionLists()) - handleContent(dl, hpx, sb); + private static void handleContent(Content c, EnumMap<CIT, Boolean> hp, StringBuilder sb) + { + if (hp != null) { + if (hp.get(CIT.TEXT)) + sb.append(c.getText() + " "); + else { + if (hp.get(CIT.BOLD)) + handleSpans(c.getFormatSpans(FormatType.BOLD), c.getText(), sb); + if (hp.get(CIT.ITALIC)) + handleSpans(c.getFormatSpans(FormatType.ITALIC), c.getText(), sb); + } + if (hp.get(CIT.LINK)) + handleLinks(c.getLinks(), !hp.get(CIT.TEXT), sb); + } } - } - - private static void handleSpans(List<Span> spans, String text, StringBuilder sb) { - for (Span s : spans) - sb.append(text.substring(s.getStart(), s.getEnd()) + " "); - } - - private static void handleLinks(List<Link> links, boolean linktext, StringBuilder sb) { - for (Link l : links) { - switch (l.getType()) { - case INTERNAL: - String lText = l.getText(); - String lTarget = l.getTarget(); - if (linktext) sb.append(lText + " "); - if (!lText.equals(lTarget)) sb.append(lTarget + " "); - break; - case EXTERNAL: - sb.append(l.getText() + " "); - break; - case IMAGE: - case AUDIO: - case VIDEO: - // do nothing ! 
- break; - } + + private void handleSection(Section s, StringBuilder sb) + { + EnumMap<SIT, EnumMap<CIT, Boolean>> hp = null; + + if (s.getTitle() != null) + hp = sectionHandling.get(SectionType.USER_SECTION + s.getTitle().toUpperCase()); + if (hp == null) + hp = sectionHandling + .get(SectionType.SECTION_LEVEL.toString() + (s.getLevel() - levelModifier)); + if (hp == null) + hp = sectionHandling.get(SectionType.DEFAULT_SECTION.toString()); + if (hp == null) { + System.err.println("Cannot get Handling Parameters for Section:\"" + s.getTitle() + + "\" Level:" + s.getLevel()); + return; + } + + handleContent(s.getTitleElement(), hp.get(SIT.TITLE), sb); + + if (s.getClass() == SectionContainer.class) { + if (hp.get(SIT.SUBS) != null) + handleContent(s, hp.get(SIT.SUBS), sb); + else + for (Section ss : ((SectionContainer) s).getSubSections()) + handleSection(ss, sb); + } + else { + EnumMap<CIT, Boolean> hpx; + + hpx = hp.get(SIT.TABLE); + if (hpx != null) + for (Table t : s.getTables()) + handleContent(t, hpx, sb); + + hpx = hp.get(SIT.NESTLIST); + if (hpx != null) + for (NestedList nl : s.getNestedLists()) + handleContent(nl, hpx, sb); + + hpx = hp.get(SIT.PARA); + if (hpx != null) + for (Paragraph p : s.getParagraphs()) + handleContent(p, hpx, sb); + + hpx = hp.get(SIT.DEFLIST); + if (hpx != null) + for (DefinitionList dl : s.getDefinitionLists()) + handleContent(dl, hpx, sb); + } } - } - - /** - * Loads the Default Config... (shows nothing at all, but ready to config...) - */ - private void loadConfig() { - firstParagraphHandling = null; - pageHandling = null; - sectionHandling = new HashMap<>(); - setDefaultSectionHandling(buildSITMap(buildCITMap(false, false, false, false), null, null, null, null, null)); - } - - /** - * Loads a Configuration from an XMLFile... - */ - public void loadConfig(String XMLFile) { - try { - sectionHandling = new HashMap<>(); - SAXParserFactory factory = SAXParserFactory.newInstance(); - factory.setNamespaceAware(true); - SAXParser sp = factory.newSAXParser(); - DefaultHandler handler = new ConfigLoader(this); - sp.parse(XMLFile, handler); - } catch (Exception e) { - System.err.println(e); - loadConfig(); + + private static void handleSpans(List<Span> spans, String text, StringBuilder sb) + { + for (Span s : spans) + sb.append(text.substring(s.getStart(), s.getEnd()) + " "); } - } - - private static String XMLCIT(EnumMap<CIT, Boolean> em) { - StringBuilder result = new StringBuilder(); - result.append("<cit"); - if (em != null) - for (CIT key : em.keySet()) - result.append(" " + key.toString() + "=\"" + em.get(key) + "\""); - result.append("/>"); - return result.toString(); - } - - private static String XMLSIT(EnumMap<SIT, EnumMap<CIT, Boolean>> sem) { - StringBuilder result = new StringBuilder(); - for (SIT key : sem.keySet()) { - result.append("<" + key.toString() + ">"); - result.append(XMLCIT(sem.get(key))); - result.append("</" + key + ">\n"); + + private static void handleLinks(List<Link> links, boolean linktext, StringBuilder sb) + { + for (Link l : links) { + switch (l.getType()) { + case INTERNAL: + String lText = l.getText(); + String lTarget = l.getTarget(); + if (linktext) + sb.append(lText + " "); + if (!lText.equals(lTarget)) + sb.append(lTarget + " "); + break; + case EXTERNAL: + sb.append(l.getText() + " "); + break; + case IMAGE: + case AUDIO: + case VIDEO: + // do nothing ! + break; + } + } } - return result.toString(); - } - - /** - * writes an XML configuration file... 
- */ - public void writeConfig(String XMLFile) { - try { - BufferedWriter bw = new BufferedWriter(new FileWriter(XMLFile)); - - bw.write("<SelectiveAccessHandlerConfig>\n"); - bw.write("<page>" + XMLCIT(pageHandling) + "</page>\n"); - bw.write("<firstparagraph>" + XMLCIT(pageHandling) + "</firstparagraph>\n"); - for (String key : sectionHandling.keySet()) { - bw.write("<section name=\"" + key + "\">\n"); - bw.write(XMLSIT(sectionHandling.get(key))); - bw.write("</section>\n"); - } - bw.write("<SelectiveAccessHandlerConfig>\n"); - - bw.close(); - } catch (IOException e) { - System.err.println(e); + + /** + * Loads the Default Config... (shows nothing at all, but ready to config...) + */ + private void loadConfig() + { + firstParagraphHandling = null; + pageHandling = null; + sectionHandling = new HashMap<>(); + setDefaultSectionHandling( + buildSITMap(buildCITMap(false, false, false, false), null, null, null, null, null)); + } + + /** + * Loads a Configuration from an XMLFile... + */ + public void loadConfig(String XMLFile) + { + try { + sectionHandling = new HashMap<>(); + SAXParserFactory factory = SAXParserFactory.newInstance(); + factory.setNamespaceAware(true); + SAXParser sp = factory.newSAXParser(); + DefaultHandler handler = new ConfigLoader(this); + sp.parse(XMLFile, handler); + } + catch (Exception e) { + System.err.println(e); + loadConfig(); + } + } + + private static String XMLCIT(EnumMap<CIT, Boolean> em) + { + StringBuilder result = new StringBuilder(); + result.append("<cit"); + if (em != null) + for (CIT key : em.keySet()) + result.append(" " + key.toString() + "=\"" + em.get(key) + "\""); + result.append("/>"); + return result.toString(); + } + + private static String XMLSIT(EnumMap<SIT, EnumMap<CIT, Boolean>> sem) + { + StringBuilder result = new StringBuilder(); + for (SIT key : sem.keySet()) { + result.append("<" + key.toString() + ">"); + result.append(XMLCIT(sem.get(key))); + result.append("</" + key + ">\n"); + } + return result.toString(); + } + + /** + * writes an XML configuration file... + */ + public void writeConfig(String XMLFile) + { + try { + BufferedWriter bw = new BufferedWriter(new FileWriter(XMLFile)); + + bw.write("<SelectiveAccessHandlerConfig>\n"); + bw.write("<page>" + XMLCIT(pageHandling) + "</page>\n"); + bw.write("<firstparagraph>" + XMLCIT(pageHandling) + "</firstparagraph>\n"); + for (String key : sectionHandling.keySet()) { + bw.write("<section name=\"" + key + "\">\n"); + bw.write(XMLSIT(sectionHandling.get(key))); + bw.write("</section>\n"); + } + bw.write("<SelectiveAccessHandlerConfig>\n"); + + bw.close(); + } + catch (IOException e) { + System.err.println(e); + } } - } } diff --git a/dkpro-jwpl-parser/src/test/java/org/dkpro/jwpl/parser/BaseJWPLTest.java b/dkpro-jwpl-parser/src/test/java/org/dkpro/jwpl/parser/BaseJWPLTest.java index 92a3291f..73df5305 100644 --- a/dkpro-jwpl-parser/src/test/java/org/dkpro/jwpl/parser/BaseJWPLTest.java +++ b/dkpro-jwpl-parser/src/test/java/org/dkpro/jwpl/parser/BaseJWPLTest.java @@ -22,25 +22,26 @@ import org.dkpro.jwpl.api.Wikipedia; /** - * Simple test base class to inject the same hsqldb test context into every test - * class to avoid duplicated code and efforts. Also shuts down the - * hibernate/hsqldb context properly. + * Simple test base class to inject the same hsqldb test context into every test class to avoid + * duplicated code and efforts. Also shuts down the hibernate/hsqldb context properly. 
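Aside on the SelectiveAccessHandler reformatted above: the class is driven entirely by nested CIT/SIT maps (content item types per section item type) built via its public static helpers. The sketch below shows one way those helpers fit together; the class name SelectiveAccessExample, its placement in the selectiveaccess package, and the chosen flag combination are assumptions made for illustration and are not part of this patch.

package org.dkpro.jwpl.parser.selectiveaccess;

import org.dkpro.jwpl.parser.ParsedPage;

/** Illustrative helper only, not part of this patch series. */
public class SelectiveAccessExample
{
    /**
     * Extracts the plain text of the first paragraph, the section titles, and the
     * link texts (plus diverging link targets) of the remaining paragraphs from an
     * already parsed page.
     */
    public static String linksAndIntro(ParsedPage pp)
    {
        SelectiveAccessHandler sah = new SelectiveAccessHandler();
        // No whole-page handling: fall back to section-wise handling.
        sah.setPageHandling(null);
        // First paragraph: plain text only (TEXT=true, BOLD/ITALIC/LINK=false).
        sah.setFirstParagraphHandling(
                SelectiveAccessHandler.buildCITMap(true, false, false, false));
        // All other sections: recurse into subsections (SUBS == null), titles as
        // plain text, skip tables and lists, keep only links from paragraphs.
        sah.setDefaultSectionHandling(SelectiveAccessHandler.buildSITMap(
                null,
                SelectiveAccessHandler.buildCITMap(true, false, false, false),
                null, null, null,
                SelectiveAccessHandler.buildCITMap(false, false, false, true)));
        return sah.getSelectedText(pp);
    }
}

Keeping the sketch in the selectiveaccess package avoids having to name the package-private CIT/SIT enums from outside. The default section handling must always be present, since handleSection() falls back to it and only logs an error when it is missing; note also that getSelectedText() detaches the separately handled first paragraph from its section before the section pass.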
* * @author mawiesne */ -public abstract class BaseJWPLTest { +public abstract class BaseJWPLTest +{ - protected static Wikipedia wiki; + protected static Wikipedia wiki; - protected static final DatabaseConfiguration obtainHSQLDBConfiguration() { - DatabaseConfiguration db = new DatabaseConfiguration(); - db.setDatabase("wikiapi_test"); - db.setHost("localhost"); - db.setUser("sa"); - db.setPassword(""); - db.setLanguage(Language._test); - db.setJdbcURL("jdbc:hsqldb:file:./src/test/resources/db/wikiapi_test"); - db.setDatabaseDriver("org.hsqldb.jdbcDriver"); - return db; - } + protected static final DatabaseConfiguration obtainHSQLDBConfiguration() + { + DatabaseConfiguration db = new DatabaseConfiguration(); + db.setDatabase("wikiapi_test"); + db.setHost("localhost"); + db.setUser("sa"); + db.setPassword(""); + db.setLanguage(Language._test); + db.setJdbcURL("jdbc:hsqldb:file:./src/test/resources/db/wikiapi_test"); + db.setDatabaseDriver("org.hsqldb.jdbcDriver"); + return db; + } } diff --git a/dkpro-jwpl-parser/src/test/java/org/dkpro/jwpl/parser/ParsedPageTest.java b/dkpro-jwpl-parser/src/test/java/org/dkpro/jwpl/parser/ParsedPageTest.java index 6aa765dc..7720726d 100644 --- a/dkpro-jwpl-parser/src/test/java/org/dkpro/jwpl/parser/ParsedPageTest.java +++ b/dkpro-jwpl-parser/src/test/java/org/dkpro/jwpl/parser/ParsedPageTest.java @@ -31,40 +31,43 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.Test; -public class ParsedPageTest extends BaseJWPLTest{ +public class ParsedPageTest + extends BaseJWPLTest +{ private static final String LF = "\n"; /** - * Made this static so that following tests don't run if assumption fails. - * (With AT_Before, tests also would not be executed but marked as passed) - * This could be changed back as soon as JUnit ignored tests after failed - * assumptions + * Made this static so that following tests don't run if assumption fails. (With AT_Before, + * tests also would not be executed but marked as passed) This could be changed back as soon as + * JUnit ignored tests after failed assumptions */ @BeforeAll - public static void setupWikipedia() { + public static void setupWikipedia() + { DatabaseConfiguration db = obtainHSQLDBConfiguration(); try { wiki = new Wikipedia(db); - } catch (Exception e) { + } + catch (Exception e) { fail("Wikipedia could not be initialized: " + e.getLocalizedMessage(), e); } } @Test - public void testParsedPage(){ + public void testParsedPage() + { String title = "Wikipedia API"; Page p = null; try { p = wiki.getPage(title); - } catch (WikiApiException e) { + } + catch (WikiApiException e) { fail("A WikiApiException occurred while getting the page " + title, e); } - - String text = "Wikipedia API ist die wichtigste Software überhaupt." + LF + - "Wikipedia API. Nicht zu übertreffen. Unglaublich http://www.ukp.tu-darmstadt.de en:Wikipedia API"; - + String text = "Wikipedia API ist die wichtigste Software überhaupt." + LF + + "Wikipedia API. Nicht zu übertreffen. 
Unglaublich http://www.ukp.tu-darmstadt.de en:Wikipedia API"; MediaWikiParserFactory pf = new MediaWikiParserFactory(Language.english); MediaWikiParser parser = pf.createParser(); @@ -72,12 +75,12 @@ public void testParsedPage(){ ParsedPage pp = parser.parse(p.getText()); assertNotNull(pp); - int i=0; + int i = 0; for (Link link : pp.getSection(0).getLinks()) { - if (i==0) { + if (i == 0) { assertEquals("Software", link.getText()); } - else if (i==1) { + else if (i == 1) { assertEquals("Wikipedia API", link.getText()); assertEquals("JWPL", link.getTarget()); } @@ -86,5 +89,5 @@ else if (i==1) { String parsedPageText = pp.getText(); assertNotNull(parsedPageText); assertEquals(text, parsedPageText); - } + } } From 37bff6b4ec903682830833c851312627ee758e3a Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho <richard.eckart@gmail.com> Date: Tue, 31 Oct 2023 14:27:01 +0100 Subject: [PATCH 09/14] #164 - Introduce checkstyle - Auto-format dkpro-jwpl-revisionmachine --- .../api/AbstractRevisionService.java | 107 +- .../api/ChronoRevisionIterator.java | 686 ++-- .../jwpl/revisionmachine/api/Contributor.java | 78 +- .../jwpl/revisionmachine/api/Revision.java | 881 +++-- .../api/RevisionAPIConfiguration.java | 400 +- .../jwpl/revisionmachine/api/RevisionApi.java | 3440 +++++++++-------- .../api/RevisionDataInterface.java | 114 +- .../revisionmachine/api/RevisionIterator.java | 750 ++-- .../api/RevisionIteratorInterface.java | 24 +- .../api/chrono/ChronoFullRevision.java | 604 +-- .../api/chrono/ChronoIterator.java | 554 +-- .../api/chrono/ChronoStorage.java | 545 +-- .../api/chrono/ChronoStorageBlock.java | 397 +- .../archivers/Bzip2Archiver.java | 224 +- .../exceptions/ArticleReaderException.java | 64 +- .../exceptions/ConfigurationException.java | 112 +- .../common/exceptions/DecodingException.java | 64 +- .../common/exceptions/DiffException.java | 64 +- .../common/exceptions/EncodingException.java | 64 +- .../common/exceptions/ErrorFactory.java | 556 +-- .../common/exceptions/ErrorKeys.java | 76 +- .../common/exceptions/LoggingException.java | 61 +- .../exceptions/SQLConsumerException.java | 63 +- .../common/exceptions/TimeoutException.java | 63 +- .../common/logging/Logger.java | 447 ++- .../common/logging/LoggerType.java | 83 +- .../common/logging/LoggingFactory.java | 124 +- .../logging/messages/DiffToolLogMessages.java | 154 +- .../consumer/ArticleConsumerLogMessages.java | 418 +- .../consumer/ConsumerLogMessages.java | 297 +- .../consumer/DiffConsumerLogMessages.java | 235 +- .../consumer/SQLConsumerLogMessages.java | 117 +- .../common/util/LetterNode.java | 235 +- .../common/util/MathUtilities.java | 154 +- .../common/util/MultipleKeywordTree.java | 187 +- .../common/util/SingleKeywordTree.java | 138 +- .../common/util/Surrogates.java | 92 +- .../revisionmachine/common/util/Time.java | 309 +- .../common/util/WikipediaXMLKeys.java | 283 +- .../common/util/WikipediaXMLWriter.java | 480 +-- .../revisionmachine/difftool/DiffTool.java | 84 +- .../difftool/DiffToolThread.java | 476 +-- .../difftool/config/ConfigurationKeys.java | 537 ++- .../difftool/config/ConfigurationManager.java | 176 +- .../difftool/config/ConfigurationReader.java | 1411 +++---- .../difftool/config/OutputTypes.java | 27 +- .../difftool/config/gui/ConfigGUI.java | 89 +- .../difftool/config/gui/ConfigMenuBar.java | 75 +- .../config/gui/control/ArchiveRegistry.java | 253 +- .../config/gui/control/ComponentRegistry.java | 180 +- .../config/gui/control/ConfigController.java | 1131 +++--- 
.../config/gui/control/ConfigSettings.java | 372 +- .../gui/control/ConfigVerification.java | 204 +- .../difftool/config/gui/data/ConfigEnum.java | 19 +- .../config/gui/data/ConfigErrorKeys.java | 51 +- .../difftool/config/gui/data/ConfigItem.java | 107 +- .../config/gui/data/ConfigItemTypes.java | 19 +- .../gui/data/OutputCompressionEnum.java | 27 +- .../difftool/config/gui/data/PanelKeys.java | 75 +- .../config/gui/dialogs/ConfigDialog.java | 252 +- .../config/gui/dialogs/InputDialog.java | 340 +- .../config/gui/dialogs/XMLFileChooser.java | 57 +- .../config/gui/panels/AbstractPanel.java | 114 +- .../config/gui/panels/CachePanel.java | 453 ++- .../config/gui/panels/ConfigPanel.java | 222 +- .../config/gui/panels/DebugPanel.java | 419 +- .../gui/panels/ExternalProgramsPanel.java | 338 +- .../config/gui/panels/FilterPanel.java | 323 +- .../config/gui/panels/InputPanel.java | 715 ++-- .../config/gui/panels/LoggingPanel.java | 259 +- .../difftool/config/gui/panels/ModePanel.java | 369 +- .../config/gui/panels/OutputPanel.java | 673 ++-- .../difftool/config/gui/panels/SQLPanel.java | 547 +-- .../config/simpleconfig/SimpleConfig.java | 32 +- .../article/ArticleReaderInterface.java | 62 +- .../article/reader/ArticleFilter.java | 229 +- .../consumer/article/reader/InputFactory.java | 340 +- .../reader/TimedWikipediaXMLReader.java | 327 +- .../article/reader/WikipediaXMLReader.java | 1149 +++--- .../diff/DiffCalculatorInterface.java | 66 +- .../diff/TaskTransmitterInterface.java | 67 +- .../diff/calculation/BlockManagement.java | 480 +-- .../calculation/BlockManagementInterface.java | 33 +- .../consumer/diff/calculation/DiffBlock.java | 273 +- .../diff/calculation/DiffCalculator.java | 1256 +++--- .../diff/calculation/TimedDiffCalculator.java | 277 +- .../difftool/consumer/dump/SQLEscape.java | 116 +- .../consumer/dump/WriterInterface.java | 46 +- .../consumer/dump/codec/DataFileEncoder.java | 202 +- .../consumer/dump/codec/SQLEncoder.java | 747 ++-- .../dump/codec/SQLEncoderInterface.java | 119 +- .../consumer/dump/codec/SQLEncoding.java | 217 +- .../consumer/dump/codec/TimedSQLEncoder.java | 210 +- .../dump/writer/DataFileArchiveWriter.java | 336 +- .../consumer/dump/writer/DataFileWriter.java | 333 +- .../consumer/dump/writer/OutputFactory.java | 112 +- .../dump/writer/SQLArchiveWriter.java | 386 +- .../dump/writer/SQLDatabaseWriter.java | 281 +- .../consumer/dump/writer/SQLFileWriter.java | 362 +- .../dump/writer/TimedSQLArchiveWriter.java | 213 +- .../dump/writer/TimedSQLDatabaseWriter.java | 209 +- .../dump/writer/TimedSQLFileWriter.java | 206 +- .../difftool/data/OutputType.java | 91 +- .../difftool/data/SurrogateModes.java | 98 +- .../data/archive/ArchiveDescription.java | 136 +- .../difftool/data/archive/ArchiveManager.java | 105 +- .../difftool/data/archive/InputType.java | 81 +- .../difftool/data/codec/BitReader.java | 225 +- .../difftool/data/codec/BitWriter.java | 275 +- .../data/codec/RevisionCodecData.java | 448 +-- .../difftool/data/codec/RevisionDecoder.java | 915 ++--- .../difftool/data/codec/RevisionEncoder.java | 691 ++-- .../data/codec/RevisionEncoderInterface.java | 57 +- .../difftool/data/tasks/ISizeable.java | 15 +- .../difftool/data/tasks/Task.java | 473 +-- .../difftool/data/tasks/TaskTypes.java | 73 +- .../difftool/data/tasks/content/Diff.java | 628 +-- .../data/tasks/content/DiffAction.java | 173 +- .../difftool/data/tasks/content/DiffPart.java | 322 +- .../data/tasks/info/ArticleInformation.java | 894 +++-- .../revisionmachine/index/IndexGenerator.java | 352 +- 
.../revisionmachine/index/IndexIterator.java | 283 +- .../jwpl/revisionmachine/index/Indexer.java | 436 ++- .../index/indices/AbstractIndex.java | 223 +- .../index/indices/ArticleIndex.java | 178 +- .../index/indices/ArticleIndexData.java | 179 +- .../index/indices/ChronoIndex.java | 231 +- .../index/indices/ChronoIndexData.java | 243 +- .../index/indices/RevisionIndex.java | 81 +- .../index/writer/DataFileWriter.java | 154 +- .../index/writer/DatabaseWriter.java | 248 +- .../index/writer/IndexWriterInterface.java | 56 +- .../index/writer/SQLFileWriter.java | 187 +- .../jwpl/revisionmachine/BaseJWPLTest.java | 23 +- .../jwpl/revisionmachine/RevisionApiTest.java | 424 +- .../revisionmachine/RevisionIteratorTest.java | 211 +- 136 files changed, 22021 insertions(+), 20012 deletions(-) diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/AbstractRevisionService.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/AbstractRevisionService.java index ee562855..4210c030 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/AbstractRevisionService.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/AbstractRevisionService.java @@ -28,63 +28,74 @@ /** * A common base class that handles the aspect of database connection handling. */ -public abstract class AbstractRevisionService { +public abstract class AbstractRevisionService +{ - private static final Logger logger = LoggerFactory.getLogger(AbstractRevisionService.class); + private static final Logger logger = LoggerFactory.getLogger(AbstractRevisionService.class); - /** - * Reference to database connection - */ - protected Connection connection; + /** + * Reference to database connection + */ + protected Connection connection; - /** - * Reference to the configuration parameters - */ - protected RevisionAPIConfiguration config; + /** + * Reference to the configuration parameters + */ + protected RevisionAPIConfiguration config; - /** - * Helper method to obtain a connection via the given {@link RevisionAPIConfiguration} parameter. - * - * @param config Must not be {@code null}. - * @return A valid {@link Connection} to the database endpoint. - * @throws WikiApiException Thrown if errors occurred while opening a connection. - */ - protected Connection getConnection(RevisionAPIConfiguration config) throws WikiApiException { - Connection c; - try { + /** + * Helper method to obtain a connection via the given {@link RevisionAPIConfiguration} + * parameter. + * + * @param config + * Must not be {@code null}. + * @return A valid {@link Connection} to the database endpoint. + * @throws WikiApiException + * Thrown if errors occurred while opening a connection. 
+ */ + protected Connection getConnection(RevisionAPIConfiguration config) throws WikiApiException + { + Connection c; + try { - String driverDB = config.getDatabaseDriver(); - Class.forName(driverDB); + String driverDB = config.getDatabaseDriver(); + Class.forName(driverDB); - c = DriverManager.getConnection(config.getJdbcURL(), config.getUser(), config.getPassword()); - if (!c.isValid(5)) { - throw new WikiApiException("Connection could not be established."); - } - } catch (SQLException | ClassNotFoundException e) { - throw new WikiApiException(e); - } + c = DriverManager.getConnection(config.getJdbcURL(), config.getUser(), + config.getPassword()); + if (!c.isValid(5)) { + throw new WikiApiException("Connection could not be established."); + } + } + catch (SQLException | ClassNotFoundException e) { + throw new WikiApiException(e); + } - return c; - } + return c; + } - /** - * This method closes any open {@link Connection connections} to the database. - * - * @throws SQLException if an error occurs while closing the connection - */ - public final void close() throws SQLException { - if (this.connection != null) { - this.connection.close(); + /** + * This method closes any open {@link Connection connections} to the database. + * + * @throws SQLException + * if an error occurs while closing the connection + */ + public final void close() throws SQLException + { + if (this.connection != null) { + this.connection.close(); + } } - } - protected void reconnect() throws SQLException { - close(); - try { - this.connection = getConnection(config); - } catch (WikiApiException e) { - close(); - logger.error("Could not reconnect. Closing connection...", e); + protected void reconnect() throws SQLException + { + close(); + try { + this.connection = getConnection(config); + } + catch (WikiApiException e) { + close(); + logger.error("Could not reconnect. Closing connection...", e); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/ChronoRevisionIterator.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/ChronoRevisionIterator.java index 04f0d271..b4c55470 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/ChronoRevisionIterator.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/ChronoRevisionIterator.java @@ -30,381 +30,401 @@ /** * This class represents the iteration in chronological order. 
*/ -public class ChronoRevisionIterator implements RevisionIteratorInterface { - - /** - * Reference to the configuration parameters - */ - private final RevisionAPIConfiguration config; - - /** - * Reference to the database connection - */ - private final Connection connection; - - /** - * Reference to the currently used result set - */ - private ResultSet resultArticles; - - /** - * Number of revisions of the current read article - */ - private int maxRevision; - - /** - * Reference to the Revision Iterator - */ - private RevisionIterator revisionIterator; - - /** - * Reference to the ChronoIterator - */ - private ChronoIterator chronoIterator; - - /** - * Retrieval mode - */ - private int modus; - - /** - * Retrieval mode id - undefined - */ - private final static int INIT = 0; - - /** - * Retrieval mode id - article is in chronological order - */ - private final static int ITERATE_WITHOUT_MAPPING = 2; - - /** - * Retrieval mode id - article is not in chronological order - */ - private final static int ITERATE_WITH_MAPPING = 1; - - /** - * ID of the current article (Should be 0 to enable an iteration over all - * article) - */ - private int currentArticleID; - - /** - * ID of the last article to retrieve - */ - private int lastArticleID; - - /** - * Parameter - buffer size - */ - private final int MAX_NUMBER_RESULTS; - - /** - * (Constructor) Creates a new ChronoRevisionIterator - * - * @param config Reference to the configuration parameters - * @throws WikiApiException if an error occurs - */ - public ChronoRevisionIterator(final RevisionAPIConfiguration config) - throws WikiApiException { - - this.config = config; - try { - this.MAX_NUMBER_RESULTS = config.getBufferSize(); - - this.resultArticles = null; - this.currentArticleID = 0; - this.lastArticleID = -1; - - reset(); - - String driverDB = "com.mysql.jdbc.Driver"; - Class.forName(driverDB); - - this.connection = DriverManager.getConnection("jdbc:mysql://" - + config.getHost() + "/" + config.getDatabase(), - config.getUser(), config.getPassword()); - - } catch (SQLException | ClassNotFoundException e) { - throw new WikiApiException(e); +public class ChronoRevisionIterator + implements RevisionIteratorInterface +{ + + /** + * Reference to the configuration parameters + */ + private final RevisionAPIConfiguration config; + + /** + * Reference to the database connection + */ + private final Connection connection; + + /** + * Reference to the currently used result set + */ + private ResultSet resultArticles; + + /** + * Number of revisions of the current read article + */ + private int maxRevision; + + /** + * Reference to the Revision Iterator + */ + private RevisionIterator revisionIterator; + + /** + * Reference to the ChronoIterator + */ + private ChronoIterator chronoIterator; + + /** + * Retrieval mode + */ + private int modus; + + /** + * Retrieval mode id - undefined + */ + private final static int INIT = 0; + + /** + * Retrieval mode id - article is in chronological order + */ + private final static int ITERATE_WITHOUT_MAPPING = 2; + + /** + * Retrieval mode id - article is not in chronological order + */ + private final static int ITERATE_WITH_MAPPING = 1; + + /** + * ID of the current article (Should be 0 to enable an iteration over all article) + */ + private int currentArticleID; + + /** + * ID of the last article to retrieve + */ + private int lastArticleID; + + /** + * Parameter - buffer size + */ + private final int MAX_NUMBER_RESULTS; + + /** + * (Constructor) Creates a new ChronoRevisionIterator + * + * @param config 
+ * Reference to the configuration parameters + * @throws WikiApiException + * if an error occurs + */ + public ChronoRevisionIterator(final RevisionAPIConfiguration config) throws WikiApiException + { + + this.config = config; + try { + this.MAX_NUMBER_RESULTS = config.getBufferSize(); + + this.resultArticles = null; + this.currentArticleID = 0; + this.lastArticleID = -1; + + reset(); + + String driverDB = "com.mysql.jdbc.Driver"; + Class.forName(driverDB); + + this.connection = DriverManager.getConnection( + "jdbc:mysql://" + config.getHost() + "/" + config.getDatabase(), + config.getUser(), config.getPassword()); + + } + catch (SQLException | ClassNotFoundException e) { + throw new WikiApiException(e); + } } - } - /** - * (Constructor) Creates a new ChronoRevisionIterator - * - * @param config Reference to the configuration parameters - * @throws WikiApiException if an error occurs - */ - public ChronoRevisionIterator(final RevisionAPIConfiguration config, - final int firstArticleID, final int lastArticleID) throws WikiApiException { + /** + * (Constructor) Creates a new ChronoRevisionIterator + * + * @param config + * Reference to the configuration parameters + * @throws WikiApiException + * if an error occurs + */ + public ChronoRevisionIterator(final RevisionAPIConfiguration config, final int firstArticleID, + final int lastArticleID) + throws WikiApiException + { + + this(config); + + this.currentArticleID = firstArticleID - 1; + this.lastArticleID = lastArticleID; + } - this(config); + /** + * Retrieves the next articles from the article index. + * + * @return whether the query contains results or not + * @throws SQLException + * if an error occurs while executing the query + */ + private boolean queryArticle() throws SQLException + { - this.currentArticleID = firstArticleID - 1; - this.lastArticleID = lastArticleID; - } + Statement statement = this.connection.createStatement(); - /** - * Retrieves the next articles from the article index. - * - * @return whether the query contains results or not - * @throws SQLException if an error occurs while executing the query - */ - private boolean queryArticle() throws SQLException { + String query = "SELECT ArticleID, FullRevisionPKs, RevisionCounter " + + "FROM index_articleID_rc_ts " + "WHERE articleID > " + this.currentArticleID + + " LIMIT " + MAX_NUMBER_RESULTS; - Statement statement = this.connection.createStatement(); + resultArticles = statement.executeQuery(query); - String query = "SELECT ArticleID, FullRevisionPKs, RevisionCounter " - + "FROM index_articleID_rc_ts " + "WHERE articleID > " - + this.currentArticleID + " LIMIT " + MAX_NUMBER_RESULTS; + if (resultArticles.next()) { - resultArticles = statement.executeQuery(query); + this.currentArticleID = resultArticles.getInt(1); + return (this.lastArticleID == -1) || (this.currentArticleID <= this.lastArticleID); + } - if (resultArticles.next()) { + return false; + } - this.currentArticleID = resultArticles.getInt(1); - return (this.lastArticleID == -1) - || (this.currentArticleID <= this.lastArticleID); + /** + * Resets the modus to INIT. + */ + private void reset() + { + this.modus = INIT; } - return false; - } - - /** - * Resets the modus to INIT. - */ - private void reset() { - this.modus = INIT; - } - - /** - * Initiates the iteration over of a new article. 
- * - * @return First Revision - * @throws WikiApiException if an error occurs - */ - private Revision init() throws WikiApiException { - - try { - currentArticleID = resultArticles.getInt(1); - String fullRevisionPKs = resultArticles.getString(2); - String revisionCounters = resultArticles.getString(3); - - int index = revisionCounters.lastIndexOf(' '); - if (index == -1) { - throw new RuntimeException("Invalid revisioncounter content"); - } - - this.maxRevision = Integer.parseInt(revisionCounters.substring( - index + 1, revisionCounters.length())); - - try (Statement statement = this.connection.createStatement(); ResultSet result = statement.executeQuery("SELECT Mapping " - + "FROM index_chronological " + "WHERE ArticleID=" - + currentArticleID + " LIMIT 1")) { - - if (result.next()) { - - this.modus = ITERATE_WITH_MAPPING; - - this.chronoIterator = new ChronoIterator(config, - connection, result.getString(1), fullRevisionPKs, - revisionCounters); - - if (this.chronoIterator.hasNext()) { - return this.chronoIterator.next(); - } else { - throw new RuntimeException("cIt Revision query failed"); - } - - /* - * this.revisionIndex = 1; - * - * revisionEncoder = new RevisionApi(config, connection); - * return revisionEncoder.getRevision(currentArticleID, - * revisionIndex); - */ - - } else { - - this.modus = ITERATE_WITHOUT_MAPPING; - - index = fullRevisionPKs.indexOf(' '); - if (index == -1) { - index = fullRevisionPKs.length(); - } - - int currentPK = Integer.parseInt(fullRevisionPKs.substring( - 0, index)); - - // TODO CHECK! -2 instead of -1 gets rid of the extra - // resivsion from the next article - this.revisionIterator = new RevisionIterator(config, - currentPK, currentPK + maxRevision - 2, - connection); - - if (revisionIterator.hasNext()) { - return revisionIterator.next(); - } else { - throw new RuntimeException("Revision query failed"); - } - } - } + /** + * Initiates the iteration over of a new article. + * + * @return First Revision + * @throws WikiApiException + * if an error occurs + */ + private Revision init() throws WikiApiException + { + + try { + currentArticleID = resultArticles.getInt(1); + String fullRevisionPKs = resultArticles.getString(2); + String revisionCounters = resultArticles.getString(3); + + int index = revisionCounters.lastIndexOf(' '); + if (index == -1) { + throw new RuntimeException("Invalid revisioncounter content"); + } + + this.maxRevision = Integer + .parseInt(revisionCounters.substring(index + 1, revisionCounters.length())); + + try (Statement statement = this.connection.createStatement(); + ResultSet result = statement + .executeQuery("SELECT Mapping " + "FROM index_chronological " + + "WHERE ArticleID=" + currentArticleID + " LIMIT 1")) { + + if (result.next()) { + + this.modus = ITERATE_WITH_MAPPING; + + this.chronoIterator = new ChronoIterator(config, connection, + result.getString(1), fullRevisionPKs, revisionCounters); + + if (this.chronoIterator.hasNext()) { + return this.chronoIterator.next(); + } + else { + throw new RuntimeException("cIt Revision query failed"); + } + + /* + * this.revisionIndex = 1; + * + * revisionEncoder = new RevisionApi(config, connection); return + * revisionEncoder.getRevision(currentArticleID, revisionIndex); + */ + + } + else { + + this.modus = ITERATE_WITHOUT_MAPPING; + + index = fullRevisionPKs.indexOf(' '); + if (index == -1) { + index = fullRevisionPKs.length(); + } + + int currentPK = Integer.parseInt(fullRevisionPKs.substring(0, index)); + + // TODO CHECK! 
-2 instead of -1 gets rid of the extra + // resivsion from the next article + this.revisionIterator = new RevisionIterator(config, currentPK, + currentPK + maxRevision - 2, connection); + + if (revisionIterator.hasNext()) { + return revisionIterator.next(); + } + else { + throw new RuntimeException("Revision query failed"); + } + } + } - } catch (WikiApiException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); + } + catch (WikiApiException e) { + throw e; + } + catch (Exception e) { + throw new WikiApiException(e); + } } - } - - /** - * Returns the next revision. - * - * @return Revision - */ - public Revision next() { - try { - switch (modus) { - case INIT: - return init(); - - case ITERATE_WITH_MAPPING: - return chronoIterator.next(); - - // revisionEncoder.getRevision(currentArticleID, revisionIndex); - - case ITERATE_WITHOUT_MAPPING: - return revisionIterator.next(); - - default: - throw new RuntimeException("Illegal mode"); - } - } catch (Exception e) { - throw new RuntimeException(e); + + /** + * Returns the next revision. + * + * @return Revision + */ + public Revision next() + { + try { + switch (modus) { + case INIT: + return init(); + + case ITERATE_WITH_MAPPING: + return chronoIterator.next(); + + // revisionEncoder.getRevision(currentArticleID, revisionIndex); + + case ITERATE_WITHOUT_MAPPING: + return revisionIterator.next(); + + default: + throw new RuntimeException("Illegal mode"); + } + } + catch (Exception e) { + throw new RuntimeException(e); + } } - } - /** - * Returns whether another revision is available or not. - * - * @return TRUE or FALSE - */ - public boolean hasNext() { + /** + * Returns whether another revision is available or not. + * + * @return TRUE or FALSE + */ + public boolean hasNext() + { - try { - switch (modus) { - case INIT: - return queryArticle(); + try { + switch (modus) { + case INIT: + return queryArticle(); - case ITERATE_WITH_MAPPING: - if (chronoIterator.hasNext()) { - return true; - } + case ITERATE_WITH_MAPPING: + if (chronoIterator.hasNext()) { + return true; + } - reset(); + reset(); - if (resultArticles.next()) { + if (resultArticles.next()) { - this.currentArticleID = resultArticles.getInt(1); - return (this.lastArticleID == -1) - || (this.currentArticleID <= this.lastArticleID); - } + this.currentArticleID = resultArticles.getInt(1); + return (this.lastArticleID == -1) + || (this.currentArticleID <= this.lastArticleID); + } - resultArticles.close(); - return queryArticle(); + resultArticles.close(); + return queryArticle(); - case ITERATE_WITHOUT_MAPPING: + case ITERATE_WITHOUT_MAPPING: - if (revisionIterator.hasNext()) { - return true; - } + if (revisionIterator.hasNext()) { + return true; + } - reset(); + reset(); - if (resultArticles.next()) { + if (resultArticles.next()) { - this.currentArticleID = resultArticles.getInt(1); - return (this.lastArticleID == -1) - || (this.currentArticleID <= this.lastArticleID); - } + this.currentArticleID = resultArticles.getInt(1); + return (this.lastArticleID == -1) + || (this.currentArticleID <= this.lastArticleID); + } - resultArticles.close(); - return queryArticle(); + resultArticles.close(); + return queryArticle(); - default: - throw new RuntimeException("Illegal mode"); - } + default: + throw new RuntimeException("Illegal mode"); + } + + } + catch (SQLException e) { + throw new RuntimeException(e); + } + } - } catch (SQLException e) { - throw new RuntimeException(e); + /** + * This method is unsupported. 
+ * + * @deprecated Do not use as the method will throw an exception at runtime. + */ + @Override + @Deprecated(since = "1.0") + public void remove() + { + throw new UnsupportedOperationException(); } - } - - /** - * This method is unsupported. - * - * @deprecated Do not use as the method will throw an exception at runtime. - */ - @Override - @Deprecated(since = "1.0") - public void remove() { - throw new UnsupportedOperationException(); - } - - /** - * This method closes the connection to the input component. - * - * @throws SQLException if an error occurs while closing the connection to the - * database. - */ - @Override - public void close() throws SQLException { - if (this.connection != null) { - this.connection.close(); + + /** + * This method closes the connection to the input component. + * + * @throws SQLException + * if an error occurs while closing the connection to the database. + */ + @Override + public void close() throws SQLException + { + if (this.connection != null) { + this.connection.close(); + } } - } - public static void main(final String[] args) - throws Exception { + public static void main(final String[] args) throws Exception + { - RevisionAPIConfiguration config = new RevisionAPIConfiguration(); + RevisionAPIConfiguration config = new RevisionAPIConfiguration(); - config.setHost("localhost"); - config.setDatabase("en_wiki"); - config.setUser("root"); - config.setPassword("1234"); + config.setHost("localhost"); + config.setDatabase("en_wiki"); + config.setUser("root"); + config.setPassword("1234"); - config.setCharacterSet("UTF-8"); - config.setBufferSize(10000); - config.setMaxAllowedPacket(1024 * 1023); - config.setChronoStorageSpace(400 * 1024 * 1024); + config.setCharacterSet("UTF-8"); + config.setBufferSize(10000); + config.setMaxAllowedPacket(1024 * 1023); + config.setChronoStorageSpace(400 * 1024 * 1024); - long count = 1; - long last = 0, now, start = System.currentTimeMillis(); + long count = 1; + long last = 0, now, start = System.currentTimeMillis(); - Revision rev; - ChronoRevisionIterator it = new ChronoRevisionIterator(config); + Revision rev; + ChronoRevisionIterator it = new ChronoRevisionIterator(config); - System.out.println(Time.toClock(System.currentTimeMillis() - start)); + System.out.println(Time.toClock(System.currentTimeMillis() - start)); - while (it.hasNext()) { - rev = it.next(); + while (it.hasNext()) { + rev = it.next(); - if (count++ % 1000 == 0) { + if (count++ % 1000 == 0) { - now = System.currentTimeMillis() - start; - if (it.chronoIterator != null) { - System.out.println(it.chronoIterator.getStorageSize()); + now = System.currentTimeMillis() - start; + if (it.chronoIterator != null) { + System.out.println(it.chronoIterator.getStorageSize()); + } + if (rev != null) { + System.out.println(rev); + } + System.out + .println(Time.toClock(now) + "\t" + (now - last) + "\tREBUILDING " + count); + last = now; + } } - if (rev != null) { - System.out.println(rev); - } - System.out.println(Time.toClock(now) + "\t" + (now - last) - + "\tREBUILDING " + count); - last = now; - } - } - System.out.println(Time.toClock(System.currentTimeMillis() - start)); - } + System.out.println(Time.toClock(System.currentTimeMillis() - start)); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Contributor.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Contributor.java index fc74d336..c50fe5ec 100644 --- 
a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Contributor.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Contributor.java @@ -22,48 +22,58 @@ /** * Provides basic user/contributor information in a single object */ -public class Contributor { - private String name; - private Integer id; - private List<String> groups; +public class Contributor +{ + private String name; + private Integer id; + private List<String> groups; - public Contributor(String name) { - this.name = name; - } + public Contributor(String name) + { + this.name = name; + } - public Contributor(String name, Integer id) { - this.name = name; - this.id = id; - } + public Contributor(String name, Integer id) + { + this.name = name; + this.id = id; + } - public Contributor(String name, Integer id, List<String> groups) { - this.name = name; - this.id = id; - this.groups = groups; - } + public Contributor(String name, Integer id, List<String> groups) + { + this.name = name; + this.id = id; + this.groups = groups; + } - public String getName() { - return name; - } + public String getName() + { + return name; + } - public void setName(String aName) { - name = aName; - } + public void setName(String aName) + { + name = aName; + } - public Integer getId() { - return id; - } + public Integer getId() + { + return id; + } - public void setId(Integer aId) { - id = aId; - } + public void setId(Integer aId) + { + id = aId; + } - public List<String> getGroups() { - return groups; - } + public List<String> getGroups() + { + return groups; + } - public void setGroups(List<String> groups) { - this.groups = groups; - } + public void setGroups(List<String> groups) + { + this.groups = groups; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Revision.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Revision.java index cfa5add3..e7d7a495 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Revision.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/Revision.java @@ -30,420 +30,473 @@ /** * This class contains all revision data. * <p> - * The revision text is loaded upon first access (lazy loading). - * When serializing a Revision, the revisionText will be loaded first. + * The revision text is loaded upon first access (lazy loading). When serializing a Revision, the + * revisionText will be loaded first. 
*/ -public class Revision implements ISizeable, Comparable<Revision>, RevisionDataInterface, Serializable { - - private static final long serialVersionUID = 7955292965697731279L; - - /** - * ID of the article - */ - private int articleID; - - /** - * Full Revision ID - */ - private int fullRevisionID; - - /** - * Primary Key - */ - private int primaryKey; - - /** - * Revision counter - */ - private final int revisionCounter; - - /** - * ID of the revision - */ - private int revisionId; - - /** - * Content - */ - private String revisionText; - - /** - * Timestamp - */ - private Timestamp timeStamp; - - /** - * Username of the contributor who created this revision - */ - private String contributorName; - - /** - * Username of the contributor who created this revision - */ - private Integer contributorId; - - /** - * The user comment for this revision - */ - private String comment; - - /** - * Determine whether revision is a minor revision - */ - private boolean isMinor = false; - - /** - * Determine whether the contributor was registered. True: contributorName= - * username False: contributorName= IP - */ - private boolean contributorIsRegistered; - - /** - * Reference to RevisionApi - */ - private transient RevisionApi revisionApi; - - // TODO add fields for the revision flags - - /** - * A collection of DiffParts that make up this revision. This can be used to - * get Information about the actions that have been performed to create this - * revision - */ - private Collection<DiffPart> parts; - - /** - * (Constructor) Creates a new Revision object. - * - * @param revisionCounter revision counter - */ - public Revision(final int revisionCounter) { - this.revisionCounter = revisionCounter; - } - - /** - * (Constructor) Creates a new Revision object. - * - * @param revisionCounter revision counter - * @param revisionApi revision API - */ - public Revision(final int revisionCounter, RevisionApi revisionApi) { - this.revisionCounter = revisionCounter; - this.revisionApi = revisionApi; - } - - /** - * Returns the estimated number of bytes used to encode the contained - * information. - * - * @return estimated size in bytes - */ - @Override - public long byteSize() { - if (this.revisionText == null) { - return 0; - } - return this.revisionText.length(); - } - - /** - * Returns the ID of the article. - * - * @return article ID - */ - @Override - public int getArticleID() { - return articleID; - } - - /** - * Returns the full revision ID. - * - * @return full revision ID - */ - public int getFullRevisionID() { - return this.fullRevisionID; - } - - /** - * Returns the primary key. - * - * @return primary key - */ - public int getPrimaryKey() { - return primaryKey; - } - - /* - * (non-Javadoc) - * - * @see java.lang.Comparable#compareTo(java.lang.Object) - */ - @Override - public int compareTo(final Revision r) { - long value = this.timeStamp.getTime() - r.getTimeStamp().getTime(); - - if (value == 0) { - return this.getRevisionID() - r.getRevisionID(); - } else if (value > 0) { - return 1; - } else { - return -1; - } - } - - /** - * Sets the revision api - * - * @param revisionApi api to set - */ - public void setRevisionApi(RevisionApi revisionApi) { - this.revisionApi = revisionApi; - } - - /** - * Returns the revision counter. - * - * @return revision counter - */ - @Override - public int getRevisionCounter() { - return revisionCounter; - } - - /** - * Returns the ID of the revision. 
- * - * @return revision ID - */ - @Override - public int getRevisionID() { - return revisionId; - } - - /** - * Returns the textual content of this revision. - * - * @return content - */ - public String getRevisionText() { - if (this.revisionText == null) { - revisionApi.setRevisionTextAndParts(this); - } - return StringEscapeUtils.unescapeHtml4(this.revisionText); - } - - /** - * Returns the timestamp. - * - * @return timestamp - */ - @Override - public Timestamp getTimeStamp() { - return timeStamp; - } - - /** - * Returns a collection of DiffPart objects that make up this revision - * - * @return a collection of DiffPart object that make up this revision - */ - public Collection<DiffPart> getParts() { - if (this.parts == null) { - revisionApi.setRevisionTextAndParts(this); - } - return this.parts; - } - - /** - * Sets the ID of the article. - * - * @param articleID article ID - */ - public void setArticleID(final int articleID) { - this.articleID = articleID; - } - - /** - * Set the ID of the full revision. - * - * @param fullRevisionID full revision ID - */ - public void setFullRevisionID(final int fullRevisionID) { - this.fullRevisionID = fullRevisionID; - } - - /** - * Sets the primary key. - * - * @param primaryKey primary key - */ - public void setPrimaryKey(final int primaryKey) { - this.primaryKey = primaryKey; - } - - /** - * Sets the ID of the revision. - * - * @param revisionId revision ID - */ - public void setRevisionID(final int revisionId) { - this.revisionId = revisionId; - } - - /** - * Sets the revision text. - * - * @param revisionText content - */ - public void setRevisionText(final String revisionText) { - this.revisionText = revisionText; - } - - /** - * Sets the timestamp information. - * <p> - * The input is expected to be the wikipedia version of the timestamp as - * String (YYYY-MM-DDThh-mm-ssZ). T and Z will be replaced with spaces. - * - * @param timeStamp timestamp (wikipedia version) - */ - public void setTimeStamp(final String timeStamp) { - - String time = timeStamp.replace('T', ' '); - time = time.replace('Z', ' '); - - this.timeStamp = Timestamp.valueOf(time); - } - - /** - * Sets the timestamp information. - * - * @param timeStamp timestamp - */ - public void setTimeStamp(final Timestamp timeStamp) { - - this.timeStamp = timeStamp; - } - - /** - * Sets the collection of DiffPart objects that make up this revision - * - * @param parts a collection of DiffPart object that make up this revision - */ - public void setParts(Collection<DiffPart> parts) { - this.parts = parts; - } - - /** - * Returns the string representation of this object. 
- * - * @return (ArticleID, RevisionCounter, Timestamp, RevisionID, TextLength) - */ - @Override - public String toString() { - - StringBuilder sRep = new StringBuilder(); - sRep.append('('); - sRep.append(articleID); - sRep.append(", "); - sRep.append(revisionCounter); - sRep.append(", "); - sRep.append(timeStamp); - sRep.append(", "); - sRep.append(revisionId); - - if (revisionText != null) { - sRep.append(", "); - sRep.append(revisionText.length()); - } - sRep.append(')'); - - return sRep.toString(); - } - - /** - * Sets the user comment for this revision - * - * @param comment the user comment for this revision - */ - public void setComment(String comment) { - this.comment = comment; - } - - /** - * Returns the user comment for this revision - * - * @return the user comment for this revision - */ - @Override - public String getComment() { - return comment; - } - - public void setMinor(boolean isMinor) { - this.isMinor = isMinor; - } - - @Override - public boolean isMinor() { - return isMinor; - } - - public void setContributorName(String contributorName) { - this.contributorName = contributorName; - } - - @Override - public String getContributorName() { - return contributorName; - } - - public void setContributorIsRegistered(boolean contributorIsRegistered) { - this.contributorIsRegistered = contributorIsRegistered; - } - - @Override - public boolean contributorIsRegistered() { - return contributorIsRegistered; - } - - public void setContributorId(Integer contributorId) { - this.contributorId = contributorId; - } - - @Override - public Integer getContributorId() { - return contributorId; - } - - private void writeObject(ObjectOutputStream out) throws IOException { - //load DiffParts before serializing - getParts(); - //load revision text before serializing - getRevisionText(); - //now we can serialize the object with the default write method - out.defaultWriteObject(); - } - - /* (non-Javadoc) - * @see java.lang.Object#equals(java.lang.Object) - * - * Revisions are equal if their ids are equal - */ - @Override - public boolean equals(Object anObject) { - - if (!(anObject instanceof Revision)) { - return false; - } else { - Revision otherRev = (Revision) anObject; - if (this.getRevisionID() == otherRev.getRevisionID()) { - return true; - } else { - return false; - } - } - } +public class Revision + implements ISizeable, Comparable<Revision>, RevisionDataInterface, Serializable +{ + + private static final long serialVersionUID = 7955292965697731279L; + + /** + * ID of the article + */ + private int articleID; + + /** + * Full Revision ID + */ + private int fullRevisionID; + + /** + * Primary Key + */ + private int primaryKey; + + /** + * Revision counter + */ + private final int revisionCounter; + + /** + * ID of the revision + */ + private int revisionId; + + /** + * Content + */ + private String revisionText; + + /** + * Timestamp + */ + private Timestamp timeStamp; + + /** + * Username of the contributor who created this revision + */ + private String contributorName; + + /** + * Username of the contributor who created this revision + */ + private Integer contributorId; + + /** + * The user comment for this revision + */ + private String comment; + + /** + * Determine whether revision is a minor revision + */ + private boolean isMinor = false; + + /** + * Determine whether the contributor was registered. 
True: contributorName= username False: + * contributorName= IP + */ + private boolean contributorIsRegistered; + + /** + * Reference to RevisionApi + */ + private transient RevisionApi revisionApi; + + // TODO add fields for the revision flags + + /** + * A collection of DiffParts that make up this revision. This can be used to get Information + * about the actions that have been performed to create this revision + */ + private Collection<DiffPart> parts; + + /** + * (Constructor) Creates a new Revision object. + * + * @param revisionCounter + * revision counter + */ + public Revision(final int revisionCounter) + { + this.revisionCounter = revisionCounter; + } + + /** + * (Constructor) Creates a new Revision object. + * + * @param revisionCounter + * revision counter + * @param revisionApi + * revision API + */ + public Revision(final int revisionCounter, RevisionApi revisionApi) + { + this.revisionCounter = revisionCounter; + this.revisionApi = revisionApi; + } + + /** + * Returns the estimated number of bytes used to encode the contained information. + * + * @return estimated size in bytes + */ + @Override + public long byteSize() + { + if (this.revisionText == null) { + return 0; + } + return this.revisionText.length(); + } + + /** + * Returns the ID of the article. + * + * @return article ID + */ + @Override + public int getArticleID() + { + return articleID; + } + + /** + * Returns the full revision ID. + * + * @return full revision ID + */ + public int getFullRevisionID() + { + return this.fullRevisionID; + } + + /** + * Returns the primary key. + * + * @return primary key + */ + public int getPrimaryKey() + { + return primaryKey; + } + + /* + * (non-Javadoc) + * + * @see java.lang.Comparable#compareTo(java.lang.Object) + */ + @Override + public int compareTo(final Revision r) + { + long value = this.timeStamp.getTime() - r.getTimeStamp().getTime(); + + if (value == 0) { + return this.getRevisionID() - r.getRevisionID(); + } + else if (value > 0) { + return 1; + } + else { + return -1; + } + } + + /** + * Sets the revision api + * + * @param revisionApi + * api to set + */ + public void setRevisionApi(RevisionApi revisionApi) + { + this.revisionApi = revisionApi; + } + + /** + * Returns the revision counter. + * + * @return revision counter + */ + @Override + public int getRevisionCounter() + { + return revisionCounter; + } + + /** + * Returns the ID of the revision. + * + * @return revision ID + */ + @Override + public int getRevisionID() + { + return revisionId; + } + + /** + * Returns the textual content of this revision. + * + * @return content + */ + public String getRevisionText() + { + if (this.revisionText == null) { + revisionApi.setRevisionTextAndParts(this); + } + return StringEscapeUtils.unescapeHtml4(this.revisionText); + } + + /** + * Returns the timestamp. + * + * @return timestamp + */ + @Override + public Timestamp getTimeStamp() + { + return timeStamp; + } + + /** + * Returns a collection of DiffPart objects that make up this revision + * + * @return a collection of DiffPart object that make up this revision + */ + public Collection<DiffPart> getParts() + { + if (this.parts == null) { + revisionApi.setRevisionTextAndParts(this); + } + return this.parts; + } + + /** + * Sets the ID of the article. + * + * @param articleID + * article ID + */ + public void setArticleID(final int articleID) + { + this.articleID = articleID; + } + + /** + * Set the ID of the full revision. 
+ * + * @param fullRevisionID + * full revision ID + */ + public void setFullRevisionID(final int fullRevisionID) + { + this.fullRevisionID = fullRevisionID; + } + + /** + * Sets the primary key. + * + * @param primaryKey + * primary key + */ + public void setPrimaryKey(final int primaryKey) + { + this.primaryKey = primaryKey; + } + + /** + * Sets the ID of the revision. + * + * @param revisionId + * revision ID + */ + public void setRevisionID(final int revisionId) + { + this.revisionId = revisionId; + } + + /** + * Sets the revision text. + * + * @param revisionText + * content + */ + public void setRevisionText(final String revisionText) + { + this.revisionText = revisionText; + } + + /** + * Sets the timestamp information. + * <p> + * The input is expected to be the wikipedia version of the timestamp as String + * (YYYY-MM-DDThh-mm-ssZ). T and Z will be replaced with spaces. + * + * @param timeStamp + * timestamp (wikipedia version) + */ + public void setTimeStamp(final String timeStamp) + { + + String time = timeStamp.replace('T', ' '); + time = time.replace('Z', ' '); + + this.timeStamp = Timestamp.valueOf(time); + } + + /** + * Sets the timestamp information. + * + * @param timeStamp + * timestamp + */ + public void setTimeStamp(final Timestamp timeStamp) + { + + this.timeStamp = timeStamp; + } + + /** + * Sets the collection of DiffPart objects that make up this revision + * + * @param parts + * a collection of DiffPart object that make up this revision + */ + public void setParts(Collection<DiffPart> parts) + { + this.parts = parts; + } + + /** + * Returns the string representation of this object. + * + * @return (ArticleID, RevisionCounter, Timestamp, RevisionID, TextLength) + */ + @Override + public String toString() + { + + StringBuilder sRep = new StringBuilder(); + sRep.append('('); + sRep.append(articleID); + sRep.append(", "); + sRep.append(revisionCounter); + sRep.append(", "); + sRep.append(timeStamp); + sRep.append(", "); + sRep.append(revisionId); + + if (revisionText != null) { + sRep.append(", "); + sRep.append(revisionText.length()); + } + sRep.append(')'); + + return sRep.toString(); + } + + /** + * Sets the user comment for this revision + * + * @param comment + * the user comment for this revision + */ + public void setComment(String comment) + { + this.comment = comment; + } + + /** + * Returns the user comment for this revision + * + * @return the user comment for this revision + */ + @Override + public String getComment() + { + return comment; + } + + public void setMinor(boolean isMinor) + { + this.isMinor = isMinor; + } + + @Override + public boolean isMinor() + { + return isMinor; + } + + public void setContributorName(String contributorName) + { + this.contributorName = contributorName; + } + + @Override + public String getContributorName() + { + return contributorName; + } + + public void setContributorIsRegistered(boolean contributorIsRegistered) + { + this.contributorIsRegistered = contributorIsRegistered; + } + + @Override + public boolean contributorIsRegistered() + { + return contributorIsRegistered; + } + + public void setContributorId(Integer contributorId) + { + this.contributorId = contributorId; + } + + @Override + public Integer getContributorId() + { + return contributorId; + } + + private void writeObject(ObjectOutputStream out) throws IOException + { + // load DiffParts before serializing + getParts(); + // load revision text before serializing + getRevisionText(); + // now we can serialize the object with the default write method + 
out.defaultWriteObject(); + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#equals(java.lang.Object) + * + * Revisions are equal if their ids are equal + */ + @Override + public boolean equals(Object anObject) + { + + if (!(anObject instanceof Revision)) { + return false; + } + else { + Revision otherRev = (Revision) anObject; + if (this.getRevisionID() == otherRev.getRevisionID()) { + return true; + } + else { + return false; + } + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionAPIConfiguration.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionAPIConfiguration.java index 27ad8db9..b7fadcec 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionAPIConfiguration.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionAPIConfiguration.java @@ -23,192 +23,216 @@ /** * This class contains the additional parameters for the {@link RevisionApi}. */ -public class RevisionAPIConfiguration extends DatabaseConfiguration { - - /** - * Number of maximum size of an result set - */ - private int bufferSize; - - /** - * Character encoding - */ - private String characterSet; - - /** - * Memory size for the storage of revisions for the chronological iteration - */ - private long chronoStorageSpace; - - /** - * MAX_ALLOWED_PACKET - parameter of the MySQL Server This value indicates - * the maximum size of an sql query. - */ - private long maxAllowedPacket; - - /** - * Path for the IndexGenerator output - */ - private String outputPath; - - /** - * Type of the IndexGenerator output - */ - private OutputTypes outputType; - - /** - * <p>(Constructor) Creates the default configuration.</p> - * OutputType: UNCOMPRESSED (revisionIndex.sql)<br> - */ - public RevisionAPIConfiguration() { - - super(); - this.setHost("localhost"); - - characterSet = "UTF-8"; - maxAllowedPacket = 1024 * 1023; - bufferSize = 10000; - - chronoStorageSpace = 100 * 1024 * 1024; - - outputPath = "revisionIndex.sql"; - outputType = OutputTypes.SQL; - } - - /** - * <p>Creates a (default) RevisionAPIConfiguration from an existing - * DatabaseConfiguration.</p> - * <p> - * OutputType: DATABASE<br> - */ - public RevisionAPIConfiguration(DatabaseConfiguration existingWikiConfig) { - - super(); - - characterSet = "UTF-8"; - maxAllowedPacket = 1024 * 1023; - bufferSize = 10000; - - chronoStorageSpace = 100 * 1024 * 1024; - - outputType = OutputTypes.DATABASE; - - setHost(existingWikiConfig.getHost()); - setDatabase(existingWikiConfig.getDatabase()); - setDatabaseDriver(existingWikiConfig.getDatabaseDriver()); - setJdbcURL(existingWikiConfig.getJdbcURL()); - setUser(existingWikiConfig.getUser()); - setPassword(existingWikiConfig.getPassword()); - setLanguage(existingWikiConfig.getLanguage()); - - } - - - /** - * Returns the maximum size of a result set. - * - * @return maximum size of a result set - */ - public int getBufferSize() { - return bufferSize; - } - - /** - * Returns the character encoding. - * - * @return character encoding - */ - public String getCharacterSet() { - return characterSet; - } - - /** - * Returns the memory size used for the purpose of storing revisions. - * - * @return memory size - */ - public long getChronoStorageSpace() { - return this.chronoStorageSpace; - } - - /** - * Returns the value of MAX_ALLOWED_PACKET parameter. 
- * - * @return MAX_ALLOWED_PACKET - */ - public long getMaxAllowedPacket() { - return maxAllowedPacket; - } - - /** - * Returns the output path of the index generator. - * - * @return output path - */ - public String getOutputPath() { - return outputPath; - } - - /** - * Returns the output type of the index generator. - * - * @return output type - */ - public OutputTypes getOutputType() { - return outputType; - } - - /** - * Sets the maximum size of a result set. - * - * @param bufferSize maximum size of a result set - */ - public void setBufferSize(final int bufferSize) { - this.bufferSize = bufferSize; - } - - /** - * Sets the character encoding. - * - * @param characterSet character encoding - */ - public void setCharacterSet(final String characterSet) { - this.characterSet = characterSet; - } - - /** - * Set the memory size used for the purpose of storing revisions. - * - * @param chronoStorageSpace memory size result - */ - public void setChronoStorageSpace(final long chronoStorageSpace) { - this.chronoStorageSpace = chronoStorageSpace; - } - - /** - * Sets the value of MAX_ALLOWED_PACKET parameter. - * - * @param maxAllowedPacket MAX_ALLOWED_PACKET - */ - public void setMaxAllowedPacket(final long maxAllowedPacket) { - this.maxAllowedPacket = maxAllowedPacket; - } - - /** - * Sets the output path of the index generator. - * - * @param outputPath output path - */ - public void setOutputPath(final String outputPath) { - this.outputPath = outputPath; - } - - /** - * Sets the output type of the index generator. - * - * @param outputType output type - */ - public void setOutputType(final OutputTypes outputType) { - this.outputType = outputType; - } +public class RevisionAPIConfiguration + extends DatabaseConfiguration +{ + + /** + * Number of maximum size of an result set + */ + private int bufferSize; + + /** + * Character encoding + */ + private String characterSet; + + /** + * Memory size for the storage of revisions for the chronological iteration + */ + private long chronoStorageSpace; + + /** + * MAX_ALLOWED_PACKET - parameter of the MySQL Server This value indicates the maximum size of + * an sql query. + */ + private long maxAllowedPacket; + + /** + * Path for the IndexGenerator output + */ + private String outputPath; + + /** + * Type of the IndexGenerator output + */ + private OutputTypes outputType; + + /** + * <p> + * (Constructor) Creates the default configuration. + * </p> + * OutputType: UNCOMPRESSED (revisionIndex.sql)<br> + */ + public RevisionAPIConfiguration() + { + + super(); + this.setHost("localhost"); + + characterSet = "UTF-8"; + maxAllowedPacket = 1024 * 1023; + bufferSize = 10000; + + chronoStorageSpace = 100 * 1024 * 1024; + + outputPath = "revisionIndex.sql"; + outputType = OutputTypes.SQL; + } + + /** + * <p> + * Creates a (default) RevisionAPIConfiguration from an existing DatabaseConfiguration. 
+ * </p> + * <p> + * OutputType: DATABASE<br> + */ + public RevisionAPIConfiguration(DatabaseConfiguration existingWikiConfig) + { + + super(); + + characterSet = "UTF-8"; + maxAllowedPacket = 1024 * 1023; + bufferSize = 10000; + + chronoStorageSpace = 100 * 1024 * 1024; + + outputType = OutputTypes.DATABASE; + + setHost(existingWikiConfig.getHost()); + setDatabase(existingWikiConfig.getDatabase()); + setDatabaseDriver(existingWikiConfig.getDatabaseDriver()); + setJdbcURL(existingWikiConfig.getJdbcURL()); + setUser(existingWikiConfig.getUser()); + setPassword(existingWikiConfig.getPassword()); + setLanguage(existingWikiConfig.getLanguage()); + + } + + /** + * Returns the maximum size of a result set. + * + * @return maximum size of a result set + */ + public int getBufferSize() + { + return bufferSize; + } + + /** + * Returns the character encoding. + * + * @return character encoding + */ + public String getCharacterSet() + { + return characterSet; + } + + /** + * Returns the memory size used for the purpose of storing revisions. + * + * @return memory size + */ + public long getChronoStorageSpace() + { + return this.chronoStorageSpace; + } + + /** + * Returns the value of MAX_ALLOWED_PACKET parameter. + * + * @return MAX_ALLOWED_PACKET + */ + public long getMaxAllowedPacket() + { + return maxAllowedPacket; + } + + /** + * Returns the output path of the index generator. + * + * @return output path + */ + public String getOutputPath() + { + return outputPath; + } + + /** + * Returns the output type of the index generator. + * + * @return output type + */ + public OutputTypes getOutputType() + { + return outputType; + } + + /** + * Sets the maximum size of a result set. + * + * @param bufferSize + * maximum size of a result set + */ + public void setBufferSize(final int bufferSize) + { + this.bufferSize = bufferSize; + } + + /** + * Sets the character encoding. + * + * @param characterSet + * character encoding + */ + public void setCharacterSet(final String characterSet) + { + this.characterSet = characterSet; + } + + /** + * Set the memory size used for the purpose of storing revisions. + * + * @param chronoStorageSpace + * memory size result + */ + public void setChronoStorageSpace(final long chronoStorageSpace) + { + this.chronoStorageSpace = chronoStorageSpace; + } + + /** + * Sets the value of MAX_ALLOWED_PACKET parameter. + * + * @param maxAllowedPacket + * MAX_ALLOWED_PACKET + */ + public void setMaxAllowedPacket(final long maxAllowedPacket) + { + this.maxAllowedPacket = maxAllowedPacket; + } + + /** + * Sets the output path of the index generator. + * + * @param outputPath + * output path + */ + public void setOutputPath(final String outputPath) + { + this.outputPath = outputPath; + } + + /** + * Sets the output type of the index generator. + * + * @param outputType + * output type + */ + public void setOutputType(final OutputTypes outputType) + { + this.outputType = outputType; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionApi.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionApi.java index 29e32d04..9dc952b3 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionApi.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionApi.java @@ -46,1780 +46,2016 @@ /** * This class can access the database and retrieve single revisions. 
*/ -public class RevisionApi extends AbstractRevisionService { - - /** - * Creates a new {@link RevisionApi} object with an existing database connection. - * - * @param config Reference to the configuration parameters - * @param connection Reference to the database connection - */ - public RevisionApi(final RevisionAPIConfiguration config, final Connection connection) { - this.config = config; - this.connection = connection; - } - - /** - * Creates a new {@link RevisionApi} object. - * - * @param config Reference to the configuration parameters - * @throws WikiApiException if an error occurs - */ - public RevisionApi(final RevisionAPIConfiguration config) throws WikiApiException { - this.config = config; - this.connection = getConnection(config); - } - - /** - * Creates a new {@link RevisionApi} object. - * - * @param dbConfig A database configuration object - * @throws WikiApiException if an error occurs - */ - public RevisionApi(final DatabaseConfiguration dbConfig) throws WikiApiException { - RevisionAPIConfiguration config = new RevisionAPIConfiguration(dbConfig); - this.config = config; - this.connection = getConnection(config); - } - - /** - * Retrieves all article ids for articles with a specified range of revisions (incl. redirects, - * disambiguation pages). <br> - * <b>Attention</b>: When called for the first time, this query needs write-access (ALTER and - * UPDATE) to the database and might take a while to process. - * - * @param minNumberRevisions the smallest number of revisions for an article to be selected - * @param maxNumberRevisions the highest number of revisions for an article to be selected (-1 for infinite) - * @return the set of selected article ids (includes redirects and disambiguation pages) - * @throws WikiApiException if an error occurs - */ - public Set<Integer> getArticleIDsWithNumberOfRevisions(final int minNumberRevisions, - int maxNumberRevisions) throws WikiApiException { - - try { - if (minNumberRevisions < 0) { - throw new IllegalArgumentException("minNumberRevisions needs to be >= 0"); - } - - PreparedStatement statement; - - // check whether the field has already been added - statement = this.connection - .prepareStatement("SELECT * FROM information_schema.COLUMNS WHERE TABLE_SCHEMA = '" - + config.getDatabase() - + "' AND TABLE_NAME = 'index_articleID_rc_ts' AND COLUMN_NAME = 'NumberRevisions'"); - if (!statement.executeQuery().next()) { - // create new column - statement = this.connection - .prepareStatement("ALTER TABLE index_articleID_rc_ts ADD NumberRevisions INT(10) unsigned NOT NULL"); - try { - statement.execute(); - } catch (SQLException e) { - throw new WikiApiException( - "To execute this query for the first time, you need to have write permissions for the database."); - } - // fill with information extracted from RevisionCounter field - statement = this.connection - .prepareStatement("UPDATE index_articleID_rc_ts SET NumberRevisions = (SELECT SUBSTRING_INDEX(RevisionCounter,' ',-1))"); - statement.execute(); - } - - ResultSet result = null; - HashSet<Integer> articles = new HashSet<>(); - - // make query - try { - if (maxNumberRevisions == -1) { - statement = this.connection - .prepareStatement("SELECT ArticleID FROM index_articleID_rc_ts " - + "WHERE NumberRevisions >= ?"); - statement.setInt(1, minNumberRevisions); - } else { - statement = this.connection.prepareStatement("SELECT ArticleID FROM index_articleID_rc_ts " - + "WHERE NumberRevisions BETWEEN ? 
AND ?"); - statement.setInt(1, minNumberRevisions); - statement.setInt(2, maxNumberRevisions); - } - result = statement.executeQuery(); - - while (result.next()) { - articles.add(result.getInt(1)); - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - return articles; - } catch (Exception e) { - throw new WikiApiException(e); +public class RevisionApi + extends AbstractRevisionService +{ + + /** + * Creates a new {@link RevisionApi} object with an existing database connection. + * + * @param config + * Reference to the configuration parameters + * @param connection + * Reference to the database connection + */ + public RevisionApi(final RevisionAPIConfiguration config, final Connection connection) + { + this.config = config; + this.connection = connection; + } + + /** + * Creates a new {@link RevisionApi} object. + * + * @param config + * Reference to the configuration parameters + * @throws WikiApiException + * if an error occurs + */ + public RevisionApi(final RevisionAPIConfiguration config) throws WikiApiException + { + this.config = config; + this.connection = getConnection(config); } - } - - /** - * Returns the PrimaryKey for the first revision of the given article - * - * @param articleID ID of the article - * @return PK of the first revision - * @throws WikiApiException if an error occurs - * @deprecated To be removed without replacement. - */ - @Deprecated(since="2.0.0", forRemoval=true) - public int getFirstRevisionPK(final int articleID) throws WikiApiException { - - try { - if (articleID < 1) { - throw new IllegalArgumentException(); - } - - PreparedStatement statement = null; - ResultSet result = null; - String firstRevPK; - - try { - // Retrieve the fullRevisionPK and calculate the limit - statement = this.connection.prepareStatement("SELECT PrimaryKey " - + "FROM revisions " + "WHERE ArticleID=? AND RevisionCounter =1 LIMIT 1"); - statement.setInt(1, articleID); - result = statement.executeQuery(); - - if (result.next()) { - - firstRevPK = result.getString(1); - - } else { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return Integer.parseInt(firstRevPK); - - } catch (WikiApiException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); + + /** + * Creates a new {@link RevisionApi} object. + * + * @param dbConfig + * A database configuration object + * @throws WikiApiException + * if an error occurs + */ + public RevisionApi(final DatabaseConfiguration dbConfig) throws WikiApiException + { + RevisionAPIConfiguration config = new RevisionAPIConfiguration(dbConfig); + this.config = config; + this.connection = getConnection(config); } - } + /** + * Retrieves all article ids for articles with a specified range of revisions (incl. redirects, + * disambiguation pages). <br> + * <b>Attention</b>: When called for the first time, this query needs write-access (ALTER and + * UPDATE) to the database and might take a while to process. 
+ * + * @param minNumberRevisions + * the smallest number of revisions for an article to be selected + * @param maxNumberRevisions + * the highest number of revisions for an article to be selected (-1 for infinite) + * @return the set of selected article ids (includes redirects and disambiguation pages) + * @throws WikiApiException + * if an error occurs + */ + public Set<Integer> getArticleIDsWithNumberOfRevisions(final int minNumberRevisions, + int maxNumberRevisions) + throws WikiApiException + { - /** - * Returns the number of revisions for the specified article. - * - * @param articleID ID of the article - * @return number of revisions - * @throws WikiApiException if an error occurs - */ - public int getNumberOfRevisions(final int articleID) throws WikiApiException { + try { + if (minNumberRevisions < 0) { + throw new IllegalArgumentException("minNumberRevisions needs to be >= 0"); + } + + PreparedStatement statement; + + // check whether the field has already been added + statement = this.connection.prepareStatement( + "SELECT * FROM information_schema.COLUMNS WHERE TABLE_SCHEMA = '" + + config.getDatabase() + + "' AND TABLE_NAME = 'index_articleID_rc_ts' AND COLUMN_NAME = 'NumberRevisions'"); + if (!statement.executeQuery().next()) { + // create new column + statement = this.connection.prepareStatement( + "ALTER TABLE index_articleID_rc_ts ADD NumberRevisions INT(10) unsigned NOT NULL"); + try { + statement.execute(); + } + catch (SQLException e) { + throw new WikiApiException( + "To execute this query for the first time, you need to have write permissions for the database."); + } + // fill with information extracted from RevisionCounter field + statement = this.connection.prepareStatement( + "UPDATE index_articleID_rc_ts SET NumberRevisions = (SELECT SUBSTRING_INDEX(RevisionCounter,' ',-1))"); + statement.execute(); + } + + ResultSet result = null; + HashSet<Integer> articles = new HashSet<>(); + + // make query + try { + if (maxNumberRevisions == -1) { + statement = this.connection + .prepareStatement("SELECT ArticleID FROM index_articleID_rc_ts " + + "WHERE NumberRevisions >= ?"); + statement.setInt(1, minNumberRevisions); + } + else { + statement = this.connection + .prepareStatement("SELECT ArticleID FROM index_articleID_rc_ts " + + "WHERE NumberRevisions BETWEEN ? AND ?"); + statement.setInt(1, minNumberRevisions); + statement.setInt(2, maxNumberRevisions); + } + result = statement.executeQuery(); + + while (result.next()) { + articles.add(result.getInt(1)); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + return articles; + } + catch (Exception e) { + throw new WikiApiException(e); + } + } - try { - if (articleID < 1) { - throw new IllegalArgumentException(); - } + /** + * Returns the PrimaryKey for the first revision of the given article + * + * @param articleID + * ID of the article + * @return PK of the first revision + * @throws WikiApiException + * if an error occurs + * @deprecated To be removed without replacement. 
+ */ + @Deprecated(since = "2.0.0", forRemoval = true) + public int getFirstRevisionPK(final int articleID) throws WikiApiException + { - PreparedStatement statement = null; - ResultSet result = null; - String revCounters; + try { + if (articleID < 1) { + throw new IllegalArgumentException(); + } - try { - // Retrieve the fullRevisionPK and calculate the limit - statement = this.connection.prepareStatement("SELECT RevisionCounter " - + "FROM index_articleID_rc_ts " + "WHERE ArticleID=? LIMIT 1"); - statement.setInt(1, articleID); - result = statement.executeQuery(); + PreparedStatement statement = null; + ResultSet result = null; + String firstRevPK; - if (result.next()) { + try { + // Retrieve the fullRevisionPK and calculate the limit + statement = this.connection.prepareStatement("SELECT PrimaryKey " + + "FROM revisions " + "WHERE ArticleID=? AND RevisionCounter =1 LIMIT 1"); + statement.setInt(1, articleID); + result = statement.executeQuery(); - revCounters = result.getString(1); + if (result.next()) { + + firstRevPK = result.getString(1); + + } + else { + throw new WikiPageNotFoundException( + "The article with the ID " + articleID + " was not found."); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return Integer.parseInt(firstRevPK); - } else { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); } - } finally { - if (statement != null) { - statement.close(); + catch (WikiApiException e) { + throw e; } - if (result != null) { - result.close(); + catch (Exception e) { + throw new WikiApiException(e); } - } - - int index = revCounters.lastIndexOf(' '); - if (index == -1) { - throw new WikiApiException("Article data is inconsistent"); - } + } - return Integer.parseInt(revCounters.substring(index + 1, revCounters.length())); + /** + * Returns the number of revisions for the specified article. + * + * @param articleID + * ID of the article + * @return number of revisions + * @throws WikiApiException + * if an error occurs + */ + public int getNumberOfRevisions(final int articleID) throws WikiApiException + { - } catch (WikiApiException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); - } - } - - /** - * Returns the timestamps of all revisions that have been made before the given revision. - * - * @param articleID ID of the article - * @return List of revisions by each corresponding {@link Timestamp}. - * @throws WikiApiException if an error occurs - */ - public List<Timestamp> getRevisionTimestampsBetweenTimestamps(int articleID, final Timestamp from, final Timestamp to) - throws WikiApiException { - List<Timestamp> timestamps = new LinkedList<>(); - - try { - PreparedStatement statement = null; - ResultSet result = null; - - try { - // Check if necessary index exists - if (!indexExists("revisions")) { - throw new WikiInitializationException( - "Please create an index on revisions(ArticleID) in order to make this query feasible."); - } - - statement = connection - .prepareStatement("SELECT Timestamp FROM revisions WHERE ArticleID=? AND Timestamp >= ? 
AND Timestamp <= ?"); - statement.setInt(1, articleID); - statement.setLong(2, from.getTime()); - statement.setLong(3, to.getTime()); - result = statement.executeQuery(); - - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - while (result.next()) { - timestamps.add(new Timestamp(result.getLong(1))); - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return timestamps; - - } catch (WikiApiException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); + try { + if (articleID < 1) { + throw new IllegalArgumentException(); + } + + PreparedStatement statement = null; + ResultSet result = null; + String revCounters; + + try { + // Retrieve the fullRevisionPK and calculate the limit + statement = this.connection.prepareStatement("SELECT RevisionCounter " + + "FROM index_articleID_rc_ts " + "WHERE ArticleID=? LIMIT 1"); + statement.setInt(1, articleID); + result = statement.executeQuery(); + + if (result.next()) { + + revCounters = result.getString(1); + + } + else { + throw new WikiPageNotFoundException( + "The article with the ID " + articleID + " was not found."); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + int index = revCounters.lastIndexOf(' '); + if (index == -1) { + throw new WikiApiException("Article data is inconsistent"); + } + + return Integer.parseInt(revCounters.substring(index + 1, revCounters.length())); + + } + catch (WikiApiException e) { + throw e; + } + catch (Exception e) { + throw new WikiApiException(e); + } } - } - /** - * Returns the timestamps of all revisions that have been made before the given revision. - * - * @param revisionId ID of the revision - * @return List of revisions by each corresponding {@link Timestamp}. - * @throws WikiApiException if an error occurs - */ - public List<Timestamp> getRevisionTimestampsBeforeRevision(final int revisionId) throws WikiApiException { - List<Timestamp> timestamps = new LinkedList<>(); + /** + * Returns the timestamps of all revisions that have been made before the given revision. + * + * @param articleID + * ID of the article + * @return List of revisions by each corresponding {@link Timestamp}. + * @throws WikiApiException + * if an error occurs + */ + public List<Timestamp> getRevisionTimestampsBetweenTimestamps(int articleID, + final Timestamp from, final Timestamp to) + throws WikiApiException + { + List<Timestamp> timestamps = new LinkedList<>(); - int articleID = getPageIdForRevisionId(revisionId); // TODO do this in the SQL query - Timestamp ts = getRevision(revisionId).getTimeStamp(); // TODO do this in the SQL query + try { + PreparedStatement statement = null; + ResultSet result = null; + + try { + // Check if necessary index exists + if (!indexExists("revisions")) { + throw new WikiInitializationException( + "Please create an index on revisions(ArticleID) in order to make this query feasible."); + } + + statement = connection.prepareStatement( + "SELECT Timestamp FROM revisions WHERE ArticleID=? AND Timestamp >= ? 
AND Timestamp <= ?"); + statement.setInt(1, articleID); + statement.setLong(2, from.getTime()); + statement.setLong(3, to.getTime()); + result = statement.executeQuery(); + + // Make the query + if (result == null) { + throw new WikiPageNotFoundException( + "The article with the ID " + articleID + " was not found."); + } + while (result.next()) { + timestamps.add(new Timestamp(result.getLong(1))); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return timestamps; + + } + catch (WikiApiException e) { + throw e; + } + catch (Exception e) { + throw new WikiApiException(e); + } + } - try { - PreparedStatement statement = null; - ResultSet result = null; + /** + * Returns the timestamps of all revisions that have been made before the given revision. + * + * @param revisionId + * ID of the revision + * @return List of revisions by each corresponding {@link Timestamp}. + * @throws WikiApiException + * if an error occurs + */ + public List<Timestamp> getRevisionTimestampsBeforeRevision(final int revisionId) + throws WikiApiException + { + List<Timestamp> timestamps = new LinkedList<>(); + + int articleID = getPageIdForRevisionId(revisionId); // TODO do this in the SQL query + Timestamp ts = getRevision(revisionId).getTimeStamp(); // TODO do this in the SQL query - try { - // Check if necessary index exists - if (!indexExists("revisions")) { - throw new WikiInitializationException( - "Please create an index on revisions(ArticleID) in order to make this query feasible."); + try { + PreparedStatement statement = null; + ResultSet result = null; + + try { + // Check if necessary index exists + if (!indexExists("revisions")) { + throw new WikiInitializationException( + "Please create an index on revisions(ArticleID) in order to make this query feasible."); + } + + statement = connection.prepareStatement( + "SELECT Timestamp FROM revisions WHERE ArticleID=? AND Timestamp < ?"); + statement.setInt(1, articleID); + statement.setLong(2, ts.getTime()); + result = statement.executeQuery(); + + // Make the query + if (result == null) { + throw new WikiPageNotFoundException( + "The article with the ID " + articleID + " was not found."); + } + while (result.next()) { + timestamps.add(new Timestamp(result.getLong(1))); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return timestamps; + + } + catch (WikiApiException e) { + throw e; + } + catch (Exception e) { + throw new WikiApiException(e); } + } - statement = connection - .prepareStatement("SELECT Timestamp FROM revisions WHERE ArticleID=? AND Timestamp < ?"); - statement.setInt(1, articleID); - statement.setLong(2, ts.getTime()); - result = statement.executeQuery(); + /** + * Returns the timestamps of all revisions connected to the specified article. + * <p> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. 
+ * + * @param articleID + * ID of the article + * @return collection of timestampp of all revisions + * @throws WikiApiException + * if an error occurs + */ + public List<Timestamp> getRevisionTimestamps(final int articleID) throws WikiApiException + { + + List<Timestamp> timestamps = new LinkedList<>(); - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - while (result.next()) { - timestamps.add(new Timestamp(result.getLong(1))); - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); + try { + if (articleID < 1) { + throw new IllegalArgumentException(); + } + + PreparedStatement statement = null; + ResultSet result = null; + + try { + // Check if necessary index exists + if (!indexExists("revisions")) { + throw new WikiInitializationException( + "Please create an index on revisions(ArticleID) in order to make this query feasible."); + } + + statement = connection + .prepareStatement("SELECT Timestamp " + "FROM revisions WHERE ArticleID=?"); + statement.setInt(1, articleID); + result = statement.executeQuery(); + + // Make the query + if (result == null) { + throw new WikiPageNotFoundException( + "The article with the ID " + articleID + " was not found."); + } + while (result.next()) { + + timestamps.add(new Timestamp(result.getLong(1))); + + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return timestamps; + + } + catch (WikiApiException e) { + throw e; + } + catch (Exception e) { + throw new WikiApiException(e); } - } - - return timestamps; + } - } catch (WikiApiException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); + /** + * Returns the number of unique contributors to an article based on the people who revised the + * article (revision contributors).<br> + * <p> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. + * + * @param articleID + * ID of the article + * @return the number of unique contributors to the article + * @throws WikiApiException + * if an error occurs + */ + public int getNumberOfUniqueContributors(final int articleID) throws WikiApiException + { + return getNumberOfUniqueContributors(articleID, false); } - } - /** - * Returns the timestamps of all revisions connected to the specified article. - * <p> - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. - * - * @param articleID ID of the article - * @return collection of timestampp of all revisions - * @throws WikiApiException if an error occurs - */ - public List<Timestamp> getRevisionTimestamps(final int articleID) throws WikiApiException { + /** + * Returns the number of unique contributors to an article based on the people who revised the + * article (revision contributors). + * <p> + * It is possible to only count the registered users, if onlyRegistered is set to true <br> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. 
+ * + * @param articleID + * ID of the article + * @param onlyRegistered + * defines whether to count only registered users {@code true}, or all users (false) + * @return the number of unique contributors to the article + * @throws WikiApiException + * if an error occurs + */ + public int getNumberOfUniqueContributors(final int articleID, boolean onlyRegistered) + throws WikiApiException + { - List<Timestamp> timestamps = new LinkedList<>(); + try { + if (articleID < 1) { + throw new IllegalArgumentException(); + } + + int contrCount = 0; + PreparedStatement statement = null; + ResultSet result = null; + + try { + // Check if necessary index exists + if (!indexExists("revisions")) { + throw new WikiInitializationException( + "Please create an index on revisions(ArticleID) in order to make this query feasible."); + } + + StringBuffer sqlString = new StringBuffer(); + sqlString.append( + "SELECT COUNT(DISTINCT ContributorName) FROM revisions WHERE ArticleID=?"); + if (onlyRegistered) { + sqlString.append(" AND ContributorIsRegistered=1"); + } + + statement = connection.prepareStatement(sqlString.toString()); + + statement.setInt(1, articleID); + result = statement.executeQuery(); + + // Make the query + if (result == null) { + throw new WikiPageNotFoundException( + "The article with the ID " + articleID + " was not found."); + } + + if (result.next()) { + contrCount = result.getInt(1); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return contrCount; + + } + catch (WikiApiException e) { + throw e; + } + catch (Exception e) { + throw new WikiApiException(e); + } + } - try { - if (articleID < 1) { - throw new IllegalArgumentException(); - } + /** + * Returns the number of unique contributors to an article that have contributed before the + * given revision. + * <p> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. + * + * @param revisionID + * revision before which to count the contributors + * @return the number of unique contributors to the article + * @throws WikiApiException + * if an error occurs + */ + public int getNumberOfUniqueContributorsBeforeRevision(final int revisionID) + throws WikiApiException + { + return getNumberOfUniqueContributorsBeforeRevision(revisionID, false); + } - PreparedStatement statement = null; - ResultSet result = null; + /** + * Returns the number of unique contributors to an article that have contributed before the + * given revision. + * <p> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. 
+ * + * @param revisionID + * revision before which to count the contributors + * @param onlyRegistered + * defines whether to count only registered users {@code true}, or all users (false) + * @return the number of unique contributors to the article + * @throws WikiApiException + * if an error occurs + */ + public int getNumberOfUniqueContributorsBeforeRevision(final int revisionID, + boolean onlyRegistered) + throws WikiApiException + { - try { - // Check if necessary index exists - if (!indexExists("revisions")) { - throw new WikiInitializationException( - "Please create an index on revisions(ArticleID) in order to make this query feasible."); + try { + if (revisionID < 1) { + throw new IllegalArgumentException(); + } + + int articleID = getPageIdForRevisionId(revisionID); + Timestamp ts = getRevision(revisionID).getTimeStamp(); + + int contrCount = 0; + PreparedStatement statement = null; + ResultSet result = null; + + try { + // Check if necessary index exists + if (!indexExists("revisions")) { + throw new WikiInitializationException( + "Please create an index on revisions(ArticleID) in order to make this query feasible."); + } + + StringBuffer sqlString = new StringBuffer(); + sqlString.append( + "SELECT COUNT(DISTINCT ContributorName) FROM revisions WHERE ArticleID=? AND Timestamp<?"); + if (onlyRegistered) { + sqlString.append(" AND ContributorIsRegistered=1"); + } + + statement = connection.prepareStatement(sqlString.toString()); + + statement.setInt(1, articleID); + statement.setLong(2, ts.getTime()); + result = statement.executeQuery(); + + // Make the query + if (result == null) { + throw new WikiPageNotFoundException( + "The article with the ID " + articleID + " was not found."); + } + + if (result.next()) { + contrCount = result.getInt(1); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return contrCount; + + } + catch (WikiApiException e) { + throw e; + } + catch (Exception e) { + throw new WikiApiException(e); } + } - statement = connection.prepareStatement("SELECT Timestamp " - + "FROM revisions WHERE ArticleID=?"); - statement.setInt(1, articleID); - result = statement.executeQuery(); + /** + * Returns a map of usernames mapped to the timestamps of their contributions. + * <p> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. + * + * @param articleID + * ID of the article + * @return map of Timestamp-DiffPart-Collection pairs + * @throws WikiApiException + * if an error occurs + */ + public Map<String, Timestamp> getUserContributionMap(final int articleID) + throws WikiApiException + { + return getUserContributionMap(articleID, null); + } - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - while (result.next()) { + /** + * Returns a map of usernames mapped to the timestamps of their contributions. + * <p> + * Users of certain user groups (e.g. bots) can be filtered by providing the unwanted groups in + * the {@code groupFilter}. 
Nothing is filtered if the {@code groupFilter} is {@code null} or + * empty.<br> + * <br> + * Filtered results also include unregistered users (because they cannot be filtered using user + * groups) In order to get results containing only registered users, use + * {@link #getUserContributionMap(int, String[], boolean)} and set + * {@code onlyRegistered=true}.<br> + * <br> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. + * + * @param articleID + * ID of the article + * @param groupfilter + * a list of unwanted user groups + * @return map of Timestamp-DiffPart-Collection pairs + * @throws WikiApiException + * if an error occurs + */ + public Map<String, Timestamp> getUserContributionMap(final int articleID, String[] groupfilter) + throws WikiApiException + { + return getUserContributionMap(articleID, groupfilter, false); + } - timestamps.add(new Timestamp(result.getLong(1))); + /** + * Returns a map of usernames mapped to the timestamps of their contributions. <br> + * Users of certain user groups (e.g. bots) can be filtered by providing the unwanted groups in + * the {@code groupFilter}. Nothing is filtered if the {@code groupFilter} is {@code null} or + * empty.<br> + * <br> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. + * + * @param articleID + * ID of the article + * @param groupfilter + * a list of unwanted user groups + * @param onlyRegistered + * {@code true} if result should only contain registered users. {@code false} + * otherwise + * @return map of Timestamp-DiffPart-Collection pairs + * @throws WikiApiException + * if an error occurs + */ + public Map<String, Timestamp> getUserContributionMap(final int articleID, String[] groupfilter, + boolean onlyRegistered) + throws WikiApiException + { + + Map<String, Timestamp> authorTSMap = new HashMap<>(); + try { + if (articleID < 1) { + throw new IllegalArgumentException(); + } + + PreparedStatement statement = null; + ResultSet result = null; + + try { + + // Check if necessary index exists + if (!indexExists("revisions")) { + throw new WikiInitializationException( + "Please create an index on revisions(ArticleID) in order to make this query feasible."); + } + + StringBuilder statementStr = new StringBuilder(); + + if (groupfilter == null || groupfilter.length < 1 || !tableExists("user_groups")) { + // create statement WITHOUT filter + statementStr.append( + "SELECT ContributorName, Timestamp FROM revisions WHERE ArticleID=?"); + statement = connection.prepareStatement(statementStr.toString()); + statement.setInt(1, articleID); + } + else { + // create statement WITH filter + statementStr.append( + "SELECT ContributorName, Timestamp FROM revisions AS rev, user_groups AS ug WHERE ArticleID=?"); + statementStr.append(" AND rev.ContributorId=ug.ug_user"); + for (String element : groupfilter) { + statementStr.append(" AND NOT ug.ug_group=?"); + } + // and combine with results from unregistered users + if (!onlyRegistered) { + statementStr.append( + " UNION ( SELECT ContributorName, Timestamp FROM revisions AS rev WHERE ArticleID=? 
AND rev.ContributorId IS NULL)"); + } + + statement = connection.prepareStatement(statementStr.toString()); + // insert article id in prepared statement + statement.setInt(1, articleID); + + // insert filtered groups in prepared statement + int curPrepStatValueIdx = 2; + for (String group : groupfilter) { + statement.setString(curPrepStatValueIdx++, group); + } + if (!onlyRegistered) { + // insert article id for second select in prepared statement + statement.setInt(curPrepStatValueIdx, articleID); + } + + } + + result = statement.executeQuery(); + + if (result == null) { + throw new WikiPageNotFoundException( + "The article with the ID " + articleID + " was not found."); + } + while (result.next()) { + // Write data from current revision to Map + authorTSMap.put(result.getString(1), new Timestamp(result.getLong(2))); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return authorTSMap; + + } + catch (WikiApiException e) { + throw e; + } + catch (Exception e) { + throw new WikiApiException(e); } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } + } - return timestamps; + /** + * Returns the group assignments of the specified user + * + * @param userID + * ID of the user (NOT THE USERNAME) + * @return collection of user groups + * @throws WikiApiException + * if an error occurs + */ + public List<String> getUserGroups(final int userID) throws WikiApiException + { - } catch (WikiApiException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); - } - } - - /** - * Returns the number of unique contributors to an article based on the people who revised the - * article (revision contributors).<br> - * <p> - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. - * - * @param articleID ID of the article - * @return the number of unique contributors to the article - * @throws WikiApiException if an error occurs - */ - public int getNumberOfUniqueContributors(final int articleID) throws WikiApiException { - return getNumberOfUniqueContributors(articleID, false); - } - - /** - * Returns the number of unique contributors to an article based on the people who revised the - * article (revision contributors). - * <p> - * It is possible to only count the registered users, if onlyRegistered is set to true - * <br> - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. 
- * - * @param articleID ID of the article - * @param onlyRegistered defines whether to count only registered users {@code true}, or all users (false) - * @return the number of unique contributors to the article - * @throws WikiApiException if an error occurs - */ - public int getNumberOfUniqueContributors(final int articleID, boolean onlyRegistered) throws WikiApiException { - - try { - if (articleID < 1) { - throw new IllegalArgumentException(); - } - - int contrCount = 0; - PreparedStatement statement = null; - ResultSet result = null; - - try { - // Check if necessary index exists - if (!indexExists("revisions")) { - throw new WikiInitializationException( - "Please create an index on revisions(ArticleID) in order to make this query feasible."); - } - - StringBuffer sqlString = new StringBuffer(); - sqlString - .append("SELECT COUNT(DISTINCT ContributorName) FROM revisions WHERE ArticleID=?"); - if (onlyRegistered) { - sqlString.append(" AND ContributorIsRegistered=1"); - } - - statement = connection.prepareStatement(sqlString.toString()); - - statement.setInt(1, articleID); - result = statement.executeQuery(); - - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - - if (result.next()) { - contrCount = result.getInt(1); - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return contrCount; - - } catch (WikiApiException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); - } - } - - /** - * Returns the number of unique contributors to an article that have contributed before the - * given revision. - * <p> - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. - * - * @param revisionID revision before which to count the contributors - * @return the number of unique contributors to the article - * @throws WikiApiException if an error occurs - */ - public int getNumberOfUniqueContributorsBeforeRevision(final int revisionID) throws WikiApiException { - return getNumberOfUniqueContributorsBeforeRevision(revisionID, false); - } - - /** - * Returns the number of unique contributors to an article that have contributed before the - * given revision. - * <p> - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. - * - * @param revisionID revision before which to count the contributors - * @param onlyRegistered defines whether to count only registered users {@code true}, or all users (false) - * @return the number of unique contributors to the article - * @throws WikiApiException if an error occurs - */ - public int getNumberOfUniqueContributorsBeforeRevision(final int revisionID, boolean onlyRegistered) - throws WikiApiException { - - try { - if (revisionID < 1) { - throw new IllegalArgumentException(); - } - - int articleID = getPageIdForRevisionId(revisionID); - Timestamp ts = getRevision(revisionID).getTimeStamp(); - - int contrCount = 0; - PreparedStatement statement = null; - ResultSet result = null; - - try { - // Check if necessary index exists - if (!indexExists("revisions")) { - throw new WikiInitializationException( - "Please create an index on revisions(ArticleID) in order to make this query feasible."); - } - - StringBuffer sqlString = new StringBuffer(); - sqlString - .append("SELECT COUNT(DISTINCT ContributorName) FROM revisions WHERE ArticleID=? 
AND Timestamp<?"); - if (onlyRegistered) { - sqlString.append(" AND ContributorIsRegistered=1"); - } - - statement = connection.prepareStatement(sqlString.toString()); - - statement.setInt(1, articleID); - statement.setLong(2, ts.getTime()); - result = statement.executeQuery(); - - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - - if (result.next()) { - contrCount = result.getInt(1); - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return contrCount; - - } catch (WikiApiException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); - } - } - - /** - * Returns a map of usernames mapped to the timestamps of their contributions. - * <p> - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. - * - * @param articleID ID of the article - * @return map of Timestamp-DiffPart-Collection pairs - * @throws WikiApiException if an error occurs - */ - public Map<String, Timestamp> getUserContributionMap(final int articleID) throws WikiApiException { - return getUserContributionMap(articleID, null); - } - - /** - * Returns a map of usernames mapped to the timestamps of their contributions. - * <p> - * Users of certain user groups (e.g. bots) can be filtered by providing the unwanted groups in - * the {@code groupFilter}. Nothing is filtered if the {@code groupFilter} is {@code null} or empty.<br> - * <br> - * Filtered results also include unregistered users (because they cannot be filtered using user - * groups) In order to get results containing only registered users, use {@link - * #getUserContributionMap(int, String[], boolean)} and set {@code onlyRegistered=true}.<br> - * <br> - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. - * - * @param articleID ID of the article - * @param groupfilter a list of unwanted user groups - * @return map of Timestamp-DiffPart-Collection pairs - * @throws WikiApiException if an error occurs - */ - public Map<String, Timestamp> getUserContributionMap(final int articleID, String[] groupfilter) - throws WikiApiException { - return getUserContributionMap(articleID, groupfilter, false); - } - - /** - * Returns a map of usernames mapped to the timestamps of their contributions. - * <br> - * Users of certain user groups (e.g. bots) can be filtered by providing the unwanted groups in - * the {@code groupFilter}. Nothing is filtered if the {@code groupFilter} is {@code null} or empty.<br> - * <br> - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. - * - * @param articleID ID of the article - * @param groupfilter a list of unwanted user groups - * @param onlyRegistered {@code true} if result should only contain registered users. 
{@code false} otherwise - * @return map of Timestamp-DiffPart-Collection pairs - * @throws WikiApiException if an error occurs - */ - public Map<String, Timestamp> getUserContributionMap(final int articleID, String[] groupfilter, - boolean onlyRegistered) throws WikiApiException { - - Map<String, Timestamp> authorTSMap = new HashMap<>(); - - try { - if (articleID < 1) { - throw new IllegalArgumentException(); - } - - PreparedStatement statement = null; - ResultSet result = null; - - try { - - // Check if necessary index exists - if (!indexExists("revisions")) { - throw new WikiInitializationException( - "Please create an index on revisions(ArticleID) in order to make this query feasible."); - } - - StringBuilder statementStr = new StringBuilder(); - - if (groupfilter == null || groupfilter.length < 1 || !tableExists("user_groups")) { - // create statement WITHOUT filter - statementStr - .append("SELECT ContributorName, Timestamp FROM revisions WHERE ArticleID=?"); - statement = connection.prepareStatement(statementStr.toString()); - statement.setInt(1, articleID); - } else { - // create statement WITH filter - statementStr - .append("SELECT ContributorName, Timestamp FROM revisions AS rev, user_groups AS ug WHERE ArticleID=?"); - statementStr.append(" AND rev.ContributorId=ug.ug_user"); - for (String element : groupfilter) { - statementStr.append(" AND NOT ug.ug_group=?"); - } - // and combine with results from unregistered users - if (!onlyRegistered) { - statementStr.append(" UNION ( SELECT ContributorName, Timestamp FROM revisions AS rev WHERE ArticleID=? AND rev.ContributorId IS NULL)"); - } - - statement = connection.prepareStatement(statementStr.toString()); - // insert article id in prepared statement - statement.setInt(1, articleID); - - // insert filtered groups in prepared statement - int curPrepStatValueIdx = 2; - for (String group : groupfilter) { - statement.setString(curPrepStatValueIdx++, group); - } - if (!onlyRegistered) { - // insert article id for second select in prepared statement - statement.setInt(curPrepStatValueIdx, articleID); - } - - } - - result = statement.executeQuery(); - - if (result == null) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - while (result.next()) { - // Write data from current revision to Map - authorTSMap.put(result.getString(1), new Timestamp(result.getLong(2))); - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return authorTSMap; - - } catch (WikiApiException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); - } - } + List<String> groups = new LinkedList<>(); - /** - * Returns the group assignments of the specified user - * - * @param userID ID of the user (NOT THE USERNAME) - * @return collection of user groups - * @throws WikiApiException if an error occurs - */ - public List<String> getUserGroups(final int userID) throws WikiApiException { + try { + if (userID < 1) { + throw new IllegalArgumentException(); + } - List<String> groups = new LinkedList<>(); + if (!tableExists("user_groups")) { + throw new WikiInitializationException( + "User group assignment data is missing. 
Please download user_groups.sql for this Wikipedia from http://dumps.wikimedia.org and import the data into this database."); + } - try { - if (userID < 1) { - throw new IllegalArgumentException(); - } + PreparedStatement statement = null; + ResultSet result = null; - if (!tableExists("user_groups")) { - throw new WikiInitializationException( - "User group assignment data is missing. Please download user_groups.sql for this Wikipedia from http://dumps.wikimedia.org and import the data into this database."); - } + try { + statement = connection + .prepareStatement("SELECT ug_group FROM user_groups WHERE ug_user=?"); + statement.setInt(1, userID); + result = statement.executeQuery(); - PreparedStatement statement = null; - ResultSet result = null; + // Make the query + if (result == null) { + throw new WikiPageNotFoundException( + "The user with the ID " + userID + " was not found."); + } + while (result.next()) { - try { - statement = connection.prepareStatement("SELECT ug_group FROM user_groups WHERE ug_user=?"); - statement.setInt(1, userID); - result = statement.executeQuery(); + groups.add(result.getString(1)); - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("The user with the ID " + userID - + " was not found."); - } - while (result.next()) { + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - groups.add(result.getString(1)); + return groups; } - } finally { - if (statement != null) { - statement.close(); + catch (WikiApiException e) { + throw e; } - if (result != null) { - result.close(); + catch (Exception e) { + throw new WikiApiException(e); } - } - - return groups; - - } catch (WikiApiException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); } - } - - /** - * Returns the revisionIds of all revisions created by given user - * - * @param userid id of the user (NOT USER NAME) - * @return Map of revision ids - * @throws WikiApiException if an error occurs - */ - public Map<Integer, List<Integer>> getUserRevisionIds(int userid) throws WikiApiException { - Map<Integer, List<Integer>> revIds = new HashMap<>(); + /** + * Returns the revisionIds of all revisions created by given user + * + * @param userid + * id of the user (NOT USER NAME) + * @return Map of revision ids + * @throws WikiApiException + * if an error occurs + */ + public Map<Integer, List<Integer>> getUserRevisionIds(int userid) throws WikiApiException + { - try { - if (userid < 1) { - throw new IllegalArgumentException(); - } + Map<Integer, List<Integer>> revIds = new HashMap<>(); - if (!indexExists("revisions", "userids")) { - System.err.println("You should create and index for the field ContributorID: create index userids ON revisions(ContributorId(15));"); - } - - PreparedStatement statement = null; - ResultSet result = null; + try { + if (userid < 1) { + throw new IllegalArgumentException(); + } + + if (!indexExists("revisions", "userids")) { + System.err.println( + "You should create and index for the field ContributorID: create index userids ON revisions(ContributorId(15));"); + } + + PreparedStatement statement = null; + ResultSet result = null; + + try { + statement = connection.prepareStatement( + "SELECT ArticleID, RevisionID " + "FROM revisions WHERE ContributorId=?"); + statement.setInt(1, userid); + result = statement.executeQuery(); + + // Make the query + if (result == null) { + throw new WikiPageNotFoundException("No revisions for user " + userid); + } + while (result.next()) { 
+ + int artId = result.getInt(1); + int revId = result.getInt(2); + + if (revIds.containsKey(artId)) { + revIds.get(artId).add(revId); + } + else { + List<Integer> revList = new ArrayList<>(); + revList.add(revId); + revIds.put(artId, revList); + } + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return revIds; + + } + catch (WikiApiException e) { + throw e; + } + catch (Exception e) { + throw new WikiApiException(e); + } + } - try { - statement = connection.prepareStatement("SELECT ArticleID, RevisionID " - + "FROM revisions WHERE ContributorId=?"); - statement.setInt(1, userid); - result = statement.executeQuery(); + /** + * Returns the revisionIds of all revisions created by given user + * + * @param username + * name of the user (NOT USER ID) + * @return Map of revision ids + * @throws WikiApiException + * if an error occurs + */ + public Map<Integer, List<Integer>> getUserRevisionIds(String username, int limit) + throws WikiApiException + { + + Map<Integer, List<Integer>> revIds = new HashMap<>(); - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("No revisions for user " + userid); + try { + if (username == null || username.isEmpty()) { + throw new IllegalArgumentException(); + } + + if (!indexExists("revisions", "usernames")) { + System.err.println( + "You should create and index for the field ContributorName: create index usernames ON revisions(ContributorName(50));"); + } + + PreparedStatement statement = null; + ResultSet result = null; + + try { + statement = connection.prepareStatement("SELECT ArticleID, RevisionID " + + "FROM revisions WHERE ContributorName=? LIMIT " + limit); + statement.setString(1, username); + result = statement.executeQuery(); + + // Make the query + if (result == null) { + throw new WikiPageNotFoundException("No revisions for user " + username); + } + while (result.next()) { + + int artId = result.getInt(1); + int revId = result.getInt(2); + + if (revIds.containsKey(artId)) { + revIds.get(artId).add(revId); + } + else { + List<Integer> revList = new ArrayList<>(); + revList.add(revId); + revIds.put(artId, revList); + } + + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return revIds; + + } + catch (WikiApiException e) { + throw e; + } + catch (Exception e) { + throw new WikiApiException(e); } - while (result.next()) { + } - int artId = result.getInt(1); - int revId = result.getInt(2); + /** + * Returns a map of timestamps mapped on the corresponding DiffPart-Collections. Can be used to + * compile statistics over all changes that have been made in one article. + * <p> + * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the + * revisions-table. 
+ * + * @param articleID + * ID of the article + * @return map of Timestamp-DiffPart-Collection pairs + * @throws WikiApiException + * if an error occurs + */ + public Map<Timestamp, Collection<DiffPart>> getTimestampToRevisionMap(final int articleID) + throws WikiApiException + { + + Map<Timestamp, Collection<DiffPart>> tsDiffPartsMap = new HashMap<>(); - if (revIds.containsKey(artId)) { - revIds.get(artId).add(revId); - } else { - List<Integer> revList = new ArrayList<>(); - revList.add(revId); - revIds.put(artId, revList); - } - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); + try { + if (articleID < 1) { + throw new IllegalArgumentException(); + } + + PreparedStatement statement = null; + ResultSet result = null; + RevisionDecoder decoder = new RevisionDecoder(config.getCharacterSet()); + + try { + + // Check if necessary index exists + if (!indexExists("revisions")) { + throw new WikiInitializationException( + "Please create an index on revisions(ArticleID) in order to make this query feasible."); + } + + statement = connection.prepareStatement( + "SELECT Timestamp, Revision " + "FROM revisions WHERE ArticleID=?"); + statement.setInt(1, articleID); + result = statement.executeQuery(); + + if (result == null) { + throw new WikiPageNotFoundException( + "The article with the ID " + articleID + " was not found."); + } + while (result.next()) { + + // Decode String and create Diff-Object + boolean binaryData = result.getMetaData() + .getColumnType(2) == Types.LONGVARBINARY; + if (binaryData) { + decoder.setInput(result.getBinaryStream(2), true); + } + else { + decoder.setInput(result.getString(2)); + } + Diff diff = decoder.decode(); + + // Get DiffParts from Diff Object + Collection<DiffPart> parts = new LinkedList<>(); + Iterator<DiffPart> it = diff.iterator(); + while (it.hasNext()) { + parts.add(it.next()); + } + + // Write data from current revision to Map + tsDiffPartsMap.put(new Timestamp(result.getLong(1)), parts); + + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return tsDiffPartsMap; + + } + catch (WikiApiException e) { + throw e; + } + catch (Exception e) { + throw new WikiApiException(e); } - } - - return revIds; + } - } catch (WikiApiException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); + /** + * Returns the timestamp of the first revision connected to the specified article. + * + * @param articleID + * ID of the article + * @return first date of appearance or the article does not exist + * @throws WikiApiException + * if an error occurs + */ + public Timestamp getFirstDateOfAppearance(final int articleID) throws WikiApiException + { + return getDateOfAppearance(articleID, "FirstAppearance"); } - } - /** - * Returns the revisionIds of all revisions created by given user - * - * @param username name of the user (NOT USER ID) - * @return Map of revision ids - * @throws WikiApiException if an error occurs - */ - public Map<Integer, List<Integer>> getUserRevisionIds(String username, int limit) - throws WikiApiException { + /** + * Returns the timestamp of the last revision connected to the specified article. 
+ * + * @param articleID + * ID of the article + * @return last date of appearance or the article does not exist + * @throws WikiApiException + * if an error occurs + */ + public Timestamp getLastDateOfAppearance(final int articleID) throws WikiApiException + { + return getDateOfAppearance(articleID, "LastAppearance"); + } - Map<Integer, List<Integer>> revIds = new HashMap<>(); + /** + * Returns the timestamp of the first or last revision connected to the specified article. + * + * @param articleID + * ID of the article + * @param firstOrLast + * <code>"FirstAppearance"</code> if first date of appearance should be returned. + * <code>"LastAppearance"</code> if last date of appearance should be returned. + * @return first date of appearance or the article does not exist + * @throws WikiApiException + * if an error occurs + */ + private Timestamp getDateOfAppearance(final int articleID, final String firstOrLast) + throws WikiApiException + { - try { - if (username == null || username.isEmpty()) { - throw new IllegalArgumentException(); - } + try { + if (articleID < 1) { + throw new IllegalArgumentException(); + } - if (!indexExists("revisions", "usernames")) { - System.err.println("You should create and index for the field ContributorName: create index usernames ON revisions(ContributorName(50));"); - } + PreparedStatement statement = null; + ResultSet result = null; + long time; - PreparedStatement statement = null; - ResultSet result = null; + try { + statement = this.connection.prepareStatement("SELECT " + firstOrLast + + " FROM index_articleID_rc_ts " + "WHERE ArticleID=? LIMIT 1"); + statement.setInt(1, articleID); + result = statement.executeQuery(); - try { - statement = connection.prepareStatement("SELECT ArticleID, RevisionID " - + "FROM revisions WHERE ContributorName=? LIMIT " + limit); - statement.setString(1, username); - result = statement.executeQuery(); + if (result.next()) { - // Make the query - if (result == null) { - throw new WikiPageNotFoundException("No revisions for user " + username); - } - while (result.next()) { + time = result.getLong(1); - int artId = result.getInt(1); - int revId = result.getInt(2); + } + else { + throw new WikiPageNotFoundException( + "The article with the ID " + articleID + " was not found."); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - if (revIds.containsKey(artId)) { - revIds.get(artId).add(revId); - } else { - List<Integer> revList = new ArrayList<>(); - revList.add(revId); - revIds.put(artId, revList); - } + return new Timestamp(time); } - } finally { - if (statement != null) { - statement.close(); + catch (WikiApiException e) { + throw e; } - if (result != null) { - result.close(); + catch (Exception e) { + throw new WikiApiException(e); } - } + } - return revIds; + /** + * Returns the by the id specified revision. + * + * @param revisionID + * ID of the revision + * @return Revision + * @throws WikiApiException + * if an error occurs or the revision does not exists. + */ + public Revision getRevision(final int revisionID) throws WikiApiException + { - } catch (WikiApiException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); - } - } - - /** - * Returns a map of timestamps mapped on the corresponding DiffPart-Collections. Can be used to - * compile statistics over all changes that have been made in one article. - * <p> - * In order to make this query fast, create a MySQL-Index (BTREE) on the ArticleID in the - * revisions-table. 
- * - * @param articleID ID of the article - * @return map of Timestamp-DiffPart-Collection pairs - * @throws WikiApiException if an error occurs - */ - public Map<Timestamp, Collection<DiffPart>> getTimestampToRevisionMap(final int articleID) throws WikiApiException { - - Map<Timestamp, Collection<DiffPart>> tsDiffPartsMap = new HashMap<>(); - - try { - if (articleID < 1) { - throw new IllegalArgumentException(); - } - - PreparedStatement statement = null; - ResultSet result = null; - RevisionDecoder decoder = new RevisionDecoder(config.getCharacterSet()); - - try { - - // Check if necessary index exists - if (!indexExists("revisions")) { - throw new WikiInitializationException( - "Please create an index on revisions(ArticleID) in order to make this query feasible."); - } - - statement = connection.prepareStatement("SELECT Timestamp, Revision " - + "FROM revisions WHERE ArticleID=?"); - statement.setInt(1, articleID); - result = statement.executeQuery(); - - if (result == null) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - while (result.next()) { - - // Decode String and create Diff-Object - boolean binaryData = result.getMetaData().getColumnType(2) == Types.LONGVARBINARY; - if (binaryData) { - decoder.setInput(result.getBinaryStream(2), true); - } else { - decoder.setInput(result.getString(2)); - } - Diff diff = decoder.decode(); - - // Get DiffParts from Diff Object - Collection<DiffPart> parts = new LinkedList<>(); - Iterator<DiffPart> it = diff.iterator(); - while (it.hasNext()) { - parts.add(it.next()); - } - - // Write data from current revision to Map - tsDiffPartsMap.put(new Timestamp(result.getLong(1)), parts); - - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return tsDiffPartsMap; - - } catch (WikiApiException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); - } - } - - /** - * Returns the timestamp of the first revision connected to the specified article. - * - * @param articleID ID of the article - * @return first date of appearance or the article does not exist - * @throws WikiApiException if an error occurs - */ - public Timestamp getFirstDateOfAppearance(final int articleID) throws WikiApiException { - return getDateOfAppearance(articleID, "FirstAppearance"); - } - - /** - * Returns the timestamp of the last revision connected to the specified article. - * - * @param articleID ID of the article - * @return last date of appearance or the article does not exist - * @throws WikiApiException if an error occurs - */ - public Timestamp getLastDateOfAppearance(final int articleID) throws WikiApiException { - return getDateOfAppearance(articleID, "LastAppearance"); - } - - /** - * Returns the timestamp of the first or last revision connected to the specified article. - * - * @param articleID ID of the article - * @param firstOrLast <code>"FirstAppearance"</code> if first date of appearance should be returned. - * <code>"LastAppearance"</code> if last date of appearance should be returned. 
- * @return first date of appearance or the article does not exist - * @throws WikiApiException if an error occurs - */ - private Timestamp getDateOfAppearance(final int articleID, final String firstOrLast) throws WikiApiException { - - try { - if (articleID < 1) { - throw new IllegalArgumentException(); - } - - PreparedStatement statement = null; - ResultSet result = null; - long time; - - try { - statement = this.connection.prepareStatement("SELECT " + firstOrLast - + " FROM index_articleID_rc_ts " + "WHERE ArticleID=? LIMIT 1"); - statement.setInt(1, articleID); - result = statement.executeQuery(); - - if (result.next()) { - - time = result.getLong(1); - - } else { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - return new Timestamp(time); - - } catch (WikiApiException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); - } - } + try { + if (revisionID < 1) { + throw new IllegalArgumentException(); + } - /** - * Returns the by the id specified revision. - * - * @param revisionID ID of the revision - * @return Revision - * @throws WikiApiException if an error occurs or the revision does not exists. - */ - public Revision getRevision(final int revisionID) throws WikiApiException { + int fullRevPK; + int limit; - try { - if (revisionID < 1) { - throw new IllegalArgumentException(); - } + PreparedStatement statement = null; + ResultSet result = null; - int fullRevPK; - int limit; + try { + statement = this.connection.prepareStatement("SELECT FullRevisionPK, RevisionPK " + + "FROM index_revisionID " + "WHERE revisionID=? LIMIT 1"); + statement.setInt(1, revisionID); + result = statement.executeQuery(); - PreparedStatement statement = null; - ResultSet result = null; + if (result.next()) { + fullRevPK = result.getInt(1); + limit = (result.getInt(2) - fullRevPK) + 1; - try { - statement = this.connection.prepareStatement("SELECT FullRevisionPK, RevisionPK " - + "FROM index_revisionID " + "WHERE revisionID=? LIMIT 1"); - statement.setInt(1, revisionID); - result = statement.executeQuery(); + } + else { + throw new WikiPageNotFoundException( + "The revision with the ID " + revisionID + " was not found."); + } - if (result.next()) { - fullRevPK = result.getInt(1); - limit = (result.getInt(2) - fullRevPK) + 1; + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - } else { - throw new WikiPageNotFoundException("The revision with the ID " + revisionID - + " was not found."); - } + return buildRevisionMetaData(fullRevPK, limit); - } finally { - if (statement != null) { - statement.close(); } - if (result != null) { - result.close(); + catch (WikiPageNotFoundException e) { + throw e; + } + catch (Exception e) { + throw new WikiApiException(e); } - } + } - return buildRevisionMetaData(fullRevPK, limit); + /** + * Returns the pageId (ArticleId) for the given revision + * + * @param revisionID + * ID of the revision + * @return the page if for the given revision + * @throws WikiApiException + * if an error occurs or the revision does not exists. 
+ */ + public int getPageIdForRevisionId(final int revisionID) throws WikiApiException + { - } catch (WikiPageNotFoundException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); - } - } + try { + if (revisionID < 1) { + throw new IllegalArgumentException(); + } - /** - * Returns the pageId (ArticleId) for the given revision - * - * @param revisionID ID of the revision - * @return the page if for the given revision - * @throws WikiApiException if an error occurs or the revision does not exists. - */ - public int getPageIdForRevisionId(final int revisionID) throws WikiApiException { + int pageId; - try { - if (revisionID < 1) { - throw new IllegalArgumentException(); - } + PreparedStatement statement = null; + ResultSet result = null; - int pageId; + try { + statement = this.connection.prepareStatement( + "SELECT r.ArticleID " + "FROM revisions as r, index_revisionID as idx " + + "WHERE idx.RevisionID=? AND idx.RevisionPK=r.PrimaryKey LIMIT 1"); + statement.setInt(1, revisionID); + result = statement.executeQuery(); - PreparedStatement statement = null; - ResultSet result = null; + if (result.next()) { + pageId = result.getInt(1); + } + else { + throw new WikiPageNotFoundException( + "The revision with the ID " + revisionID + " was not found."); + } - try { - statement = this.connection.prepareStatement("SELECT r.ArticleID " - + "FROM revisions as r, index_revisionID as idx " - + "WHERE idx.RevisionID=? AND idx.RevisionPK=r.PrimaryKey LIMIT 1"); - statement.setInt(1, revisionID); - result = statement.executeQuery(); + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - if (result.next()) { - pageId = result.getInt(1); - } else { - throw new WikiPageNotFoundException("The revision with the ID " + revisionID - + " was not found."); - } + return pageId; - } finally { - if (statement != null) { - statement.close(); } - if (result != null) { - result.close(); + catch (WikiPageNotFoundException e) { + throw e; + } + catch (Exception e) { + throw new WikiApiException(e); } - } + } - return pageId; + /** + * Returns the by the article ID and revisionCounter specified revision. Note that this method + * returns the revision in chronological order. + * + * @param articleID + * ID of the article + * @param revisionCounter + * number of revision + * @return Revision + * @throws WikiApiException + * if an error occurs or the revision does not exists. + */ + public Revision getRevision(final int articleID, final int revisionCounter) + throws WikiApiException + { - } catch (WikiPageNotFoundException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); - } - } + try { + if (articleID < 1 || revisionCounter < 1) { + throw new IllegalArgumentException(); + } - /** - * Returns the by the article ID and revisionCounter specified revision. Note that this method - * returns the revision in chronological order. - * - * @param articleID ID of the article - * @param revisionCounter number of revision - * @return Revision - * @throws WikiApiException if an error occurs or the revision does not exists. 
- */ - public Revision getRevision(final int articleID, final int revisionCounter) throws WikiApiException { + int revisionIndex = checkMapping(articleID, revisionCounter); + String fullRevisions, revCounters; - try { - if (articleID < 1 || revisionCounter < 1) { - throw new IllegalArgumentException(); - } + PreparedStatement statement = null; + ResultSet result = null; - int revisionIndex = checkMapping(articleID, revisionCounter); - String fullRevisions, revCounters; + try { + statement = this.connection.prepareStatement( + "SELECT FullRevisionPKs, RevisionCounter FROM index_articleID_rc_ts WHERE ArticleID=? LIMIT 1"); + statement.setInt(1, articleID); + result = statement.executeQuery(); - PreparedStatement statement = null; - ResultSet result = null; + if (result.next()) { - try { - statement = this.connection - .prepareStatement("SELECT FullRevisionPKs, RevisionCounter FROM index_articleID_rc_ts WHERE ArticleID=? LIMIT 1"); - statement.setInt(1, articleID); - result = statement.executeQuery(); + fullRevisions = result.getString(1); + revCounters = result.getString(2); - if (result.next()) { + } + else { + throw new WikiPageNotFoundException( + "The article with the ID " + articleID + " was not found."); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - fullRevisions = result.getString(1); - revCounters = result.getString(2); + return getReferencedRevision(articleID, revisionIndex, fullRevisions, revCounters); - } else { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " was not found."); } - } finally { - if (statement != null) { - statement.close(); + catch (WikiPageNotFoundException e) { + throw e; } - if (result != null) { - result.close(); + catch (Exception e) { + throw new WikiApiException(e); } - } + } - return getReferencedRevision(articleID, revisionIndex, fullRevisions, revCounters); + /** + * Returns the by the article ID and timestamp specified revision. Note that the timestamp is + * not an unique identifier of a revision related to an article. The returned revision should be + * the first revision that can be found inside the database. + * + * @param articleID + * ID of the article + * @param time + * Timestamp + * @return Revision + * @throws WikiApiException + * if an error occurs or the revision does not exists. + */ + public Revision getRevision(final int articleID, final Timestamp time) throws WikiApiException + { - } catch (WikiPageNotFoundException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); - } - } - - /** - * Returns the by the article ID and timestamp specified revision. Note that the timestamp is - * not an unique identifier of a revision related to an article. The returned revision should be - * the first revision that can be found inside the database. - * - * @param articleID ID of the article - * @param time Timestamp - * @return Revision - * @throws WikiApiException if an error occurs or the revision does not exists. 
- */ - public Revision getRevision(final int articleID, final Timestamp time) throws WikiApiException { - - try { - - PreparedStatement statement = null; - ResultSet result = null; - String fullRevisions; - String revisionCounters; - - if (articleID < 1 || time == null || time.getTime() <= 0) { - throw new IllegalArgumentException(); - } - - int firstPK, lastPK; - try { - statement = this.connection.prepareStatement("SELECT FullRevisionPKs, RevisionCounter," - + " FirstAppearance " + "FROM index_articleID_rc_ts " - + "WHERE ArticleID=? LIMIT 1"); - statement.setInt(1, articleID); - result = statement.executeQuery(); - - if (result.next()) { - - fullRevisions = result.getString(1); - revisionCounters = result.getString(2); - long firstDate = result.getLong(3); - - // Find first and last FullRevision PK - int max = fullRevisions.length(); - int index = fullRevisions.indexOf(' '); - if (index == -1) { - index = max; - } - - firstPK = Integer.parseInt(fullRevisions.substring(0, index)); - - index = revisionCounters.lastIndexOf(' ') + 1; - lastPK = firstPK + Integer.parseInt(revisionCounters.substring(index, revisionCounters.length())); - - if (time.getTime() < firstDate) { - throw new WikiPageNotFoundException("No revision before the specified date [" + time + "]"); - } - } else { - throw new WikiPageNotFoundException("The article with the ID " + articleID + " was not found."); - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - try { - statement = this.connection.prepareStatement( - "SELECT RevisionCounter FROM revisions WHERE PrimaryKey >= ? AND PrimaryKey < ? AND Timestamp <= ? ORDER BY Timestamp DESC LIMIT 1"); - statement.setInt(1, firstPK); - statement.setInt(2, lastPK); - statement.setLong(3, time.getTime()); - result = statement.executeQuery(); - - if (result.next()) { - int revisionCount = result.getInt(1); - return getReferencedRevision(articleID, revisionCount, fullRevisions, revisionCounters); - } else { - throw new WikiPageNotFoundException("The revision with the specified timestamp was not found."); - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } - - } catch (WikiPageNotFoundException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); - } - } - - /*--------------------------------------------------------------------------*/ - /* Internal methods */ - /*--------------------------------------------------------------------------*/ - - /** - * This method maps the chronological order to the revisionCounter. - * - * @param articleID ID of the article - * @param revisionCounter chronological position - * @return position in the chronological order - * @throws SQLException if an error occurs while accessing the database. - */ - protected int checkMapping(final int articleID, final int revisionCounter) throws SQLException { - - PreparedStatement statement = null; - ResultSet result = null; - - // Check for the correct revisionCounter mapping - try { - statement = this.connection.prepareStatement("SELECT Mapping " - + "FROM index_chronological " + "WHERE ArticleID=? 
LIMIT 1"); - statement.setInt(1, articleID); - result = statement.executeQuery(); - - if (result.next()) { - - String mapping = result.getString(1); - return getMapping(mapping, revisionCounter); - - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } + try { - return revisionCounter; - } - - /** - * This method maps the revisionCounter to the chronological order. - * - * @param articleID ID of the article - * @param revisionCounter chronological position - * @return position in the chronological order - * @throws SQLException if an error occurs while accessing the database. - * @deprecated this method should only be used for internal processes - */ - @Deprecated - public int checkReverseMapping(final int articleID, final int revisionCounter) throws SQLException { - - PreparedStatement statement = null; - ResultSet result = null; - - // Check for the correct revisionCounter mapping - try { - statement = this.connection.prepareStatement( - "SELECT ReverseMapping FROM index_chronological WHERE ArticleID=? LIMIT 1"); - statement.setInt(1, articleID); - result = statement.executeQuery(); - - if (result.next()) { - - String mapping = result.getString(1); - return getMapping(mapping, revisionCounter); - - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } + PreparedStatement statement = null; + ResultSet result = null; + String fullRevisions; + String revisionCounters; + + if (articleID < 1 || time == null || time.getTime() <= 0) { + throw new IllegalArgumentException(); + } + + int firstPK, lastPK; + try { + statement = this.connection.prepareStatement( + "SELECT FullRevisionPKs, RevisionCounter," + " FirstAppearance " + + "FROM index_articleID_rc_ts " + "WHERE ArticleID=? LIMIT 1"); + statement.setInt(1, articleID); + result = statement.executeQuery(); + + if (result.next()) { + + fullRevisions = result.getString(1); + revisionCounters = result.getString(2); + long firstDate = result.getLong(3); + + // Find first and last FullRevision PK + int max = fullRevisions.length(); + int index = fullRevisions.indexOf(' '); + if (index == -1) { + index = max; + } + + firstPK = Integer.parseInt(fullRevisions.substring(0, index)); + + index = revisionCounters.lastIndexOf(' ') + 1; + lastPK = firstPK + Integer + .parseInt(revisionCounters.substring(index, revisionCounters.length())); + + if (time.getTime() < firstDate) { + throw new WikiPageNotFoundException( + "No revision before the specified date [" + time + "]"); + } + } + else { + throw new WikiPageNotFoundException( + "The article with the ID " + articleID + " was not found."); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + try { + statement = this.connection.prepareStatement( + "SELECT RevisionCounter FROM revisions WHERE PrimaryKey >= ? AND PrimaryKey < ? AND Timestamp <= ? 
ORDER BY Timestamp DESC LIMIT 1"); + statement.setInt(1, firstPK); + statement.setInt(2, lastPK); + statement.setLong(3, time.getTime()); + result = statement.executeQuery(); + + if (result.next()) { + int revisionCount = result.getInt(1); + return getReferencedRevision(articleID, revisionCount, fullRevisions, + revisionCounters); + } + else { + throw new WikiPageNotFoundException( + "The revision with the specified timestamp was not found."); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + } + catch (WikiPageNotFoundException e) { + throw e; + } + catch (Exception e) { + throw new WikiApiException(e); + } } - return revisionCounter; - } + /*--------------------------------------------------------------------------*/ + /* Internal methods */ + /*--------------------------------------------------------------------------*/ + + /** + * This method maps the chronological order to the revisionCounter. + * + * @param articleID + * ID of the article + * @param revisionCounter + * chronological position + * @return position in the chronological order + * @throws SQLException + * if an error occurs while accessing the database. + */ + protected int checkMapping(final int articleID, final int revisionCounter) throws SQLException + { + + PreparedStatement statement = null; + ResultSet result = null; + + // Check for the correct revisionCounter mapping + try { + statement = this.connection.prepareStatement( + "SELECT Mapping " + "FROM index_chronological " + "WHERE ArticleID=? LIMIT 1"); + statement.setInt(1, articleID); + result = statement.executeQuery(); - /** - * This method returns the correct mapping of the given input. - * - * @param mapping mapping sequence - * @param revisionCounter index to map - * @return mapped index - */ - private int getMapping(final String mapping, final int revisionCounter) { + if (result.next()) { - String tempA, tempB; + String mapping = result.getString(1); + return getMapping(mapping, revisionCounter); - int length = 0; - int revC = -1, mapC = -1; - int index, max = mapping.length(); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - while (length < max && revC < revisionCounter) { + return revisionCounter; + } - // Read revisionCounter - index = mapping.indexOf(' ', length); - tempA = mapping.substring(length, index); - length = index + 1; + /** + * This method maps the revisionCounter to the chronological order. + * + * @param articleID + * ID of the article + * @param revisionCounter + * chronological position + * @return position in the chronological order + * @throws SQLException + * if an error occurs while accessing the database. + * @deprecated this method should only be used for internal processes + */ + @Deprecated + public int checkReverseMapping(final int articleID, final int revisionCounter) + throws SQLException + { + + PreparedStatement statement = null; + ResultSet result = null; + + // Check for the correct revisionCounter mapping + try { + statement = this.connection.prepareStatement( + "SELECT ReverseMapping FROM index_chronological WHERE ArticleID=? 
LIMIT 1"); + statement.setInt(1, articleID); + result = statement.executeQuery(); - // Read mappedCounter - index = mapping.indexOf(' ', length); - if (index == -1) { - index = mapping.length(); - } - tempB = mapping.substring(length, index); - length = index + 1; + if (result.next()) { - // Parse values - revC = Integer.parseInt(tempA); - mapC = Integer.parseInt(tempB); + String mapping = result.getString(1); + return getMapping(mapping, revisionCounter); - // System.out.println(revC + " -> " + mapC); - } + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - if (revC == revisionCounter) { - // System.out.println(revC + " >> " + mapC); - return mapC; + return revisionCounter; } - return revisionCounter; - } - - /** - * This method identifies the correct full revision and retrieves the reference revision. - * - * @param articleID ID of the article - * @param revisionIndex number of revision - * @param fullRevisions list of full revisions - * @param revCounters list of revision counters - * @return Revision - * @throws WikiApiException if an error occurs - */ - private Revision getReferencedRevision(final int articleID, final int revisionIndex, final String fullRevisions, - final String revCounters) throws WikiApiException { - - try { - int fullRevPK; - int limit; - - String fullRev = null; - - int revA = -1, revB = -1; - int lengthFR = 0; - int lengthRC = 0; - int index; - int max = fullRevisions.length(); - - while (lengthFR < max && revB < revisionIndex) { - - // Read fullRevisionPK (as string) - index = fullRevisions.indexOf(' ', lengthFR); - if (index == -1) { - index = max; - } - - fullRev = fullRevisions.substring(lengthFR, index); - lengthFR = index + 1; - - // Read start revision counter - index = revCounters.indexOf(' ', lengthRC); - revA = Integer.parseInt(revCounters.substring(lengthRC, index)); - lengthRC = index + 1; - - // Read end revision counter - index = revCounters.indexOf(' ', lengthRC); - if (index == -1) { - index = revCounters.length(); - } - revB = Integer.parseInt(revCounters.substring(lengthRC, index)); - lengthRC = index + 1; - } - - if (revisionIndex > revB) { - throw new WikiPageNotFoundException("The article with the ID " + articleID - + " has no revision number " + revisionIndex); - } - - fullRevPK = Integer.parseInt(fullRev); - limit = (revisionIndex - revA) + 1; - - // Build the revision - return buildRevisionMetaData(fullRevPK, limit); - - } catch (WikiPageNotFoundException e) { - throw e; - } catch (Exception e) { - throw new WikiApiException(e); - } - } + /** + * This method returns the correct mapping of the given input. + * + * @param mapping + * mapping sequence + * @param revisionCounter + * index to map + * @return mapped index + */ + private int getMapping(final String mapping, final int revisionCounter) + { - /** - * This method queries and builds the specified revision. - * - * @param revision - */ - public void setRevisionTextAndParts(Revision revision) { + String tempA, tempB; - try { + int length = 0; + int revC = -1, mapC = -1; + int index, max = mapping.length(); - PreparedStatement statement = null; - ResultSet result = null; + while (length < max && revC < revisionCounter) { - int fullRevPK; - int limit; - try { - statement = this.connection.prepareStatement("SELECT FullRevisionPK, RevisionPK " - + "FROM index_revisionID " + "WHERE revisionID=? 
LIMIT 1"); - statement.setInt(1, revision.getRevisionID()); - result = statement.executeQuery(); + // Read revisionCounter + index = mapping.indexOf(' ', length); + tempA = mapping.substring(length, index); + length = index + 1; - if (result.next()) { - fullRevPK = result.getInt(1); - limit = (result.getInt(2) - fullRevPK) + 1; + // Read mappedCounter + index = mapping.indexOf(' ', length); + if (index == -1) { + index = mapping.length(); + } + tempB = mapping.substring(length, index); + length = index + 1; - } else { - throw new WikiPageNotFoundException("The revision with ID " + revision.getRevisionID() + " was not found."); - } - } finally { - if (statement != null) { - statement.close(); + // Parse values + revC = Integer.parseInt(tempA); + mapC = Integer.parseInt(tempB); + + // System.out.println(revC + " -> " + mapC); } - if (result != null) { - result.close(); + + if (revC == revisionCounter) { + // System.out.println(revC + " >> " + mapC); + return mapC; } - } - try { - statement = this.connection.prepareStatement( - "SELECT Revision, PrimaryKey, RevisionCounter, RevisionID, ArticleID, Timestamp, Comment, Minor, ContributorName, ContributorId, ContributorIsRegistered " - + "FROM revisions " + "WHERE PrimaryKey >= ? LIMIT " + limit); - statement.setInt(1, fullRevPK); - result = statement.executeQuery(); + return revisionCounter; + } - String previousRevision = null, currentRevision = null; + /** + * This method identifies the correct full revision and retrieves the reference revision. + * + * @param articleID + * ID of the article + * @param revisionIndex + * number of revision + * @param fullRevisions + * list of full revisions + * @param revCounters + * list of revision counters + * @return Revision + * @throws WikiApiException + * if an error occurs + */ + private Revision getReferencedRevision(final int articleID, final int revisionIndex, + final String fullRevisions, final String revCounters) + throws WikiApiException + { - Diff diff = null; - RevisionDecoder decoder; + try { + int fullRevPK; + int limit; - boolean binaryData = result.getMetaData().getColumnType(1) == Types.LONGVARBINARY; + String fullRev = null; - while (result.next()) { + int revA = -1, revB = -1; + int lengthFR = 0; + int lengthRC = 0; + int index; + int max = fullRevisions.length(); - decoder = new RevisionDecoder(config.getCharacterSet()); + while (lengthFR < max && revB < revisionIndex) { - if (binaryData) { - decoder.setInput(result.getBinaryStream(1), true); - } else { - decoder.setInput(result.getString(1)); - } + // Read fullRevisionPK (as string) + index = fullRevisions.indexOf(' ', lengthFR); + if (index == -1) { + index = max; + } - diff = decoder.decode(); - currentRevision = diff.buildRevision(previousRevision); + fullRev = fullRevisions.substring(lengthFR, index); + lengthFR = index + 1; - previousRevision = currentRevision; - } + // Read start revision counter + index = revCounters.indexOf(' ', lengthRC); + revA = Integer.parseInt(revCounters.substring(lengthRC, index)); + lengthRC = index + 1; - Collection<DiffPart> parts = new LinkedList<>(); - Iterator<DiffPart> it = diff.iterator(); - while (it.hasNext()) { - parts.add(it.next()); - } + // Read end revision counter + index = revCounters.indexOf(' ', lengthRC); + if (index == -1) { + index = revCounters.length(); + } + revB = Integer.parseInt(revCounters.substring(lengthRC, index)); + lengthRC = index + 1; + } + + if (revisionIndex > revB) { + throw new WikiPageNotFoundException("The article with the ID " + articleID + + " has no revision 
number " + revisionIndex); + } + + fullRevPK = Integer.parseInt(fullRev); + limit = (revisionIndex - revA) + 1; - revision.setParts(parts); - revision.setRevisionText(currentRevision); + // Build the revision + return buildRevisionMetaData(fullRevPK, limit); - } finally { - if (statement != null) { - statement.close(); } - if (result != null) { - result.close(); + catch (WikiPageNotFoundException e) { + throw e; + } + catch (Exception e) { + throw new WikiApiException(e); } - } - } catch (WikiPageNotFoundException | DecodingException | SQLException | IOException e) { - throw new RuntimeException(e); - } catch (RuntimeException e) { - throw e; } - } - - /** - * This method queries and builds the specified revision. - * - * @param fullRevPK PK of the full revision - * @param limit number of revision to query - * @return Revision - * @throws SQLException if an error occurs while retrieving data from the sql database. - */ - private Revision buildRevisionMetaData(final int fullRevPK, final int limit) throws SQLException { - - PreparedStatement statement = null; - ResultSet result = null; - - try { - String query = "SELECT Revision, PrimaryKey, RevisionCounter, RevisionID, ArticleID, Timestamp, Comment, Minor, ContributorName, ContributorId, ContributorIsRegistered " - + "FROM revisions " + "WHERE PrimaryKey >= ? LIMIT " + limit; - - /* - * As HSQL does not support ResultSet.last() per default, we have to specify these extra parameters here. - * - * With these parameters in place, the 'last()' call works as expected. - * - * See also: https://stackoverflow.com/q/19533991 - */ - statement = this.connection.prepareStatement(query, ResultSet.TYPE_SCROLL_INSENSITIVE, ResultSet.CONCUR_READ_ONLY); - statement.setInt(1, fullRevPK); - result = statement.executeQuery(); - - Revision revision = null; - - if (result.last()) { - revision = new Revision(result.getInt(3), this); - - revision.setPrimaryKey(result.getInt(2)); - revision.setRevisionID(result.getInt(4)); - revision.setArticleID(result.getInt(5)); - revision.setTimeStamp(new Timestamp(result.getLong(6))); - revision.setComment(result.getString(7)); - revision.setMinor(result.getBoolean(8)); - revision.setContributorName(result.getString(9)); - - // we should not use getInt(), because result may be null - String contribIdString = result.getString(10); - Integer contributorId = contribIdString == null ? null : Integer - .parseInt(contribIdString); - revision.setContributorId(contributorId); - - revision.setContributorIsRegistered(result.getBoolean(11)); - } - return revision; - - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); - } - } + /** + * This method queries and builds the specified revision. + * + * @param revision + */ + public void setRevisionTextAndParts(Revision revision) + { + + try { + + PreparedStatement statement = null; + ResultSet result = null; + + int fullRevPK; + int limit; + try { + statement = this.connection.prepareStatement("SELECT FullRevisionPK, RevisionPK " + + "FROM index_revisionID " + "WHERE revisionID=? 
LIMIT 1"); + statement.setInt(1, revision.getRevisionID()); + result = statement.executeQuery(); + + if (result.next()) { + fullRevPK = result.getInt(1); + limit = (result.getInt(2) - fullRevPK) + 1; + + } + else { + throw new WikiPageNotFoundException( + "The revision with ID " + revision.getRevisionID() + " was not found."); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + try { + statement = this.connection.prepareStatement( + "SELECT Revision, PrimaryKey, RevisionCounter, RevisionID, ArticleID, Timestamp, Comment, Minor, ContributorName, ContributorId, ContributorIsRegistered " + + "FROM revisions " + "WHERE PrimaryKey >= ? LIMIT " + limit); + statement.setInt(1, fullRevPK); + result = statement.executeQuery(); + + String previousRevision = null, currentRevision = null; + + Diff diff = null; + RevisionDecoder decoder; + + boolean binaryData = result.getMetaData().getColumnType(1) == Types.LONGVARBINARY; + + while (result.next()) { + + decoder = new RevisionDecoder(config.getCharacterSet()); + + if (binaryData) { + decoder.setInput(result.getBinaryStream(1), true); + } + else { + decoder.setInput(result.getString(1)); + } + + diff = decoder.decode(); + currentRevision = diff.buildRevision(previousRevision); + + previousRevision = currentRevision; + } + + Collection<DiffPart> parts = new LinkedList<>(); + Iterator<DiffPart> it = diff.iterator(); + while (it.hasNext()) { + parts.add(it.next()); + } + + revision.setParts(parts); + revision.setRevisionText(currentRevision); + + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + } + catch (WikiPageNotFoundException | DecodingException | SQLException | IOException e) { + throw new RuntimeException(e); + } + catch (RuntimeException e) { + throw e; + } - } - - /** - * Checks if some index (besides the PRIMARY-Index) exists in a given table. - * - * @param table the table to check - * @return {@code true} if index exists, false else - * @throws SQLException if an error occurs connecting to or querying the db - */ - private boolean indexExists(String table) throws SQLException { - return indexExists(table, null); - } - - /** - * Checks if an index with a specific name exists in a given table. - * - * @param table the table to check - * @param indexName the name of the index (may be null) - * @return {@code true} if index exists, false else - * @throws SQLException if an error occurs connecting to or querying the db - */ - private boolean indexExists(String table, String indexName) throws SQLException { - - try (PreparedStatement statement = this.connection.prepareStatement("SHOW INDEX FROM " + table - + " WHERE Key_name!= 'PRIMARY'"); ResultSet result = statement.executeQuery()) { - - // Check if an index exists (because otherwise the query would - // be awfully slow. Note that the existence of ANY index will - // suffice - we might want to check for a specific index. - if (result == null || !result.next()) { - return false; - } - - /* - * SOME INDEX EXISTS! We can now check for the existence of a specific index - */ - if (indexName != null) { - // go back to first result - - result.first(); - // check all existing indexes for the specific index name - boolean specificIndexExists = false; - while (result.next()) { - if (result.getString(3).equals(indexName)) { - specificIndexExists = true; - } - } - return specificIndexExists ? 
true : false; - - } else { - // we have an index, but don't want to check for an index with - // a specific name - - return true; - } } - } + /** + * This method queries and builds the specified revision. + * + * @param fullRevPK + * PK of the full revision + * @param limit + * number of revision to query + * @return Revision + * @throws SQLException + * if an error occurs while retrieving data from the sql database. + */ + private Revision buildRevisionMetaData(final int fullRevPK, final int limit) throws SQLException + { + + PreparedStatement statement = null; + ResultSet result = null; + + try { + String query = "SELECT Revision, PrimaryKey, RevisionCounter, RevisionID, ArticleID, Timestamp, Comment, Minor, ContributorName, ContributorId, ContributorIsRegistered " + + "FROM revisions " + "WHERE PrimaryKey >= ? LIMIT " + limit; + + /* + * As HSQL does not support ResultSet.last() per default, we have to specify these extra + * parameters here. + * + * With these parameters in place, the 'last()' call works as expected. + * + * See also: https://stackoverflow.com/q/19533991 + */ + statement = this.connection.prepareStatement(query, ResultSet.TYPE_SCROLL_INSENSITIVE, + ResultSet.CONCUR_READ_ONLY); + statement.setInt(1, fullRevPK); + result = statement.executeQuery(); + + Revision revision = null; + + if (result.last()) { + revision = new Revision(result.getInt(3), this); + + revision.setPrimaryKey(result.getInt(2)); + revision.setRevisionID(result.getInt(4)); + revision.setArticleID(result.getInt(5)); + revision.setTimeStamp(new Timestamp(result.getLong(6))); + revision.setComment(result.getString(7)); + revision.setMinor(result.getBoolean(8)); + revision.setContributorName(result.getString(9)); + + // we should not use getInt(), because result may be null + String contribIdString = result.getString(10); + Integer contributorId = contribIdString == null ? null + : Integer.parseInt(contribIdString); + revision.setContributorId(contributorId); + + revision.setContributorIsRegistered(result.getBoolean(11)); + } + return revision; + + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - /** - * Checks if a specific table exists. - * - * @param table the table to check - * @return {@code true} if table exists, false else - * @throws SQLException if an error occurs connecting to or querying the db - */ - private boolean tableExists(String table) throws SQLException { + } - try (PreparedStatement statement = this.connection.prepareStatement("SHOW TABLES;"); - ResultSet result = statement.executeQuery()) { + /** + * Checks if some index (besides the PRIMARY-Index) exists in a given table. + * + * @param table + * the table to check + * @return {@code true} if index exists, false else + * @throws SQLException + * if an error occurs connecting to or querying the db + */ + private boolean indexExists(String table) throws SQLException + { + return indexExists(table, null); + } - if (result == null) { - return false; - } - boolean found = false; - while (result.next()) { - if (table.equalsIgnoreCase(result.getString(1))) { - found = true; + /** + * Checks if an index with a specific name exists in a given table. 
+ * + * @param table + * the table to check + * @param indexName + * the name of the index (may be null) + * @return {@code true} if index exists, false else + * @throws SQLException + * if an error occurs connecting to or querying the db + */ + private boolean indexExists(String table, String indexName) throws SQLException + { + + try (PreparedStatement statement = this.connection + .prepareStatement("SHOW INDEX FROM " + table + " WHERE Key_name!= 'PRIMARY'"); + ResultSet result = statement.executeQuery()) { + + // Check if an index exists (because otherwise the query would + // be awfully slow. Note that the existence of ANY index will + // suffice - we might want to check for a specific index. + if (result == null || !result.next()) { + return false; + } + + /* + * SOME INDEX EXISTS! We can now check for the existence of a specific index + */ + if (indexName != null) { + // go back to first result + + result.first(); + // check all existing indexes for the specific index name + boolean specificIndexExists = false; + while (result.next()) { + if (result.getString(3).equals(indexName)) { + specificIndexExists = true; + } + } + return specificIndexExists ? true : false; + + } + else { + // we have an index, but don't want to check for an index with + // a specific name + + return true; + } } - } - return found; } - } + /** + * Checks if a specific table exists. + * + * @param table + * the table to check + * @return {@code true} if table exists, false else + * @throws SQLException + * if an error occurs connecting to or querying the db + */ + private boolean tableExists(String table) throws SQLException + { + + try (PreparedStatement statement = this.connection.prepareStatement("SHOW TABLES;"); + ResultSet result = statement.executeQuery()) { + + if (result == null) { + return false; + } + boolean found = false; + while (result.next()) { + if (table.equalsIgnoreCase(result.getString(1))) { + found = true; + } + } + return found; - @Deprecated(since = "1.1", forRemoval = true) - public RevisionAPIConfiguration getRevisionApiConfiguration() { - return this.config; - } + } - public Connection getConnection() { - return this.connection; - } + } - @Deprecated(since = "1.1", forRemoval = true) - // TODO This should go into a demo or test class separated from the code here... - public static void main(String[] args) throws Exception { + @Deprecated(since = "1.1", forRemoval = true) + public RevisionAPIConfiguration getRevisionApiConfiguration() + { + return this.config; + } - RevisionAPIConfiguration config = new RevisionAPIConfiguration(); + public Connection getConnection() + { + return this.connection; + } - config.setHost("localhost"); - config.setDatabase("en_wiki"); - config.setUser("root"); - config.setPassword("1234"); + @Deprecated(since = "1.1", forRemoval = true) + // TODO This should go into a demo or test class separated from the code here... 
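
SHOW INDEX and SHOW TABLES, as used by indexExists and tableExists above, are MySQL-specific statements. Purely as an illustrative alternative (not what the code above does), the same existence checks can be expressed through java.sql.DatabaseMetaData, which works across JDBC drivers:

    private static boolean indexExists(Connection connection, String table, String indexName)
        throws SQLException
    {
        try (ResultSet rs = connection.getMetaData()
                .getIndexInfo(null, null, table, false, false)) {
            while (rs.next()) {
                String name = rs.getString("INDEX_NAME");
                if (name == null || "PRIMARY".equalsIgnoreCase(name)) {
                    continue;                      // ignore the primary key, as above
                }
                if (indexName == null || indexName.equals(name)) {
                    return true;
                }
            }
        }
        return false;
    }

    private static boolean tableExists(Connection connection, String table) throws SQLException
    {
        try (ResultSet rs = connection.getMetaData()
                .getTables(null, null, "%", new String[] { "TABLE" })) {
            while (rs.next()) {
                if (table.equalsIgnoreCase(rs.getString("TABLE_NAME"))) {
                    return true;
                }
            }
        }
        return false;
    }
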
+ public static void main(String[] args) throws Exception + { - config.setCharacterSet("UTF-8"); - config.setBufferSize(20000); - config.setMaxAllowedPacket(1024 * 1024); + RevisionAPIConfiguration config = new RevisionAPIConfiguration(); - RevisionApi rev = new RevisionApi(config); + config.setHost("localhost"); + config.setDatabase("en_wiki"); + config.setUser("root"); + config.setPassword("1234"); - Revision r; + config.setCharacterSet("UTF-8"); + config.setBufferSize(20000); + config.setMaxAllowedPacket(1024 * 1024); - // System.out.println(rev.getNumberOfRevisions(12)); - // System.out.println(rev.getFirstDateOfAppearance(12)); - // System.out.println(rev.getLastDateOfAppearance(12)); + RevisionApi rev = new RevisionApi(config); - // r = rev.getRevision(31596, new Timestamp(1011743960000l)); - r = rev.getRevision(233181); + Revision r; - System.out.println(r.toString() + "\t" + r.getRevisionText()); - // System.out.println(rev.getRevision(979005).getRevisionText()); - // System.out.println(rev.getRevision(2, new - // Timestamp(1216747716000l)).getRevisionText()); + // System.out.println(rev.getNumberOfRevisions(12)); + // System.out.println(rev.getFirstDateOfAppearance(12)); + // System.out.println(rev.getLastDateOfAppearance(12)); - } + // r = rev.getRevision(31596, new Timestamp(1011743960000l)); + r = rev.getRevision(233181); + + System.out.println(r.toString() + "\t" + r.getRevisionText()); + // System.out.println(rev.getRevision(979005).getRevisionText()); + // System.out.println(rev.getRevision(2, new + // Timestamp(1216747716000l)).getRevisionText()); + + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionDataInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionDataInterface.java index 534cc71b..3b2f9e4c 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionDataInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionDataInterface.java @@ -22,70 +22,72 @@ /** * This interface contains method to access the additional data of a revision. */ -public interface RevisionDataInterface { +public interface RevisionDataInterface +{ - /** - * Returns the ID of the article. - * - * @return ID of the article - */ - int getArticleID(); + /** + * Returns the ID of the article. + * + * @return ID of the article + */ + int getArticleID(); - /** - * Returns the ID of the revision. - * - * @return ID of the revision - */ - int getRevisionID(); + /** + * Returns the ID of the revision. + * + * @return ID of the revision + */ + int getRevisionID(); - /** - * Returns the timestamp - * - * @return timestamp - */ - Timestamp getTimeStamp(); + /** + * Returns the timestamp + * + * @return timestamp + */ + Timestamp getTimeStamp(); - /** - * Returns the revision counter - * - * @return revision counter - */ - int getRevisionCounter(); + /** + * Returns the revision counter + * + * @return revision counter + */ + int getRevisionCounter(); - /** - * Returns the user comment for this revision - * - * @return the user comment for this revision - */ - String getComment(); + /** + * Returns the user comment for this revision + * + * @return the user comment for this revision + */ + String getComment(); - /** - * Returns true if revision is a minor revision. - * - * @return true if revision is a minor revision, false else - */ - boolean isMinor(); + /** + * Returns true if revision is a minor revision. 
+ * + * @return true if revision is a minor revision, false else + */ + boolean isMinor(); - /** - * Returns the contributorID of the revision contributor - * Unregistered users do not have an id, so the return value might be null. - * - * @return the contributorID of the revision contributor or null, if user does not have an id (= is not registered) - */ - Integer getContributorId(); + /** + * Returns the contributorID of the revision contributor Unregistered users do not have an id, + * so the return value might be null. + * + * @return the contributorID of the revision contributor or null, if user does not have an id (= + * is not registered) + */ + Integer getContributorId(); - /** - * Returns the contributorName of the revision contributor - * - * @return the contributorName of the revision contributor - */ - String getContributorName(); + /** + * Returns the contributorName of the revision contributor + * + * @return the contributorName of the revision contributor + */ + String getContributorName(); - /** - * Returns true, if the contributor is a registered user - * - * @return true, if the contributor is a registered user, false else - */ - boolean contributorIsRegistered(); + /** + * Returns true, if the contributor is a registered user + * + * @return true, if the contributor is a registered user, false else + */ + boolean contributorIsRegistered(); } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIterator.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIterator.java index d3f0a02d..54f11ffc 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIterator.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIterator.java @@ -40,409 +40,451 @@ * <p> * This class represents the interface to iterate through multiple revisions. */ -public class RevisionIterator extends AbstractRevisionService implements RevisionIteratorInterface { - - private static final Logger logger = LoggerFactory.getLogger(RevisionIterator.class); - - /** - * Reference to the ResultSet - */ - private ResultSet result; - - /** - * Reference to the Statement - */ - private PreparedStatement statement; - - /** - * Binary Data Flag - */ - private boolean binaryData; - - /** - * Text of the previous revision - */ - private String previousRevision; - - /** - * Current primary key - */ - private int primaryKey; - - /** - * Primary key indicating the end of the data - */ - private int endPK; - - /** - * ID of the current article - */ - private int currentArticleID; - - /** - * The last known revision counter - */ - private int currentRevCounter; - - /** - * Configuration parameter - indicates the maximum size of a query. - */ - private final int MAX_NUMBER_RESULTS; - - /** - * Should load revision text? - */ - private boolean shouldLoadRevisionText; - - /** - * The revision-api for this iterator - used by the Revision object - * in case of lazy loading - */ - private RevisionApi revApi = null; - - public boolean shouldLoadRevisionText() { - return shouldLoadRevisionText; - } - - public void setShouldLoadRevisionText(boolean shouldLoadRevisionText) { - this.shouldLoadRevisionText = shouldLoadRevisionText; - } - - /** - * Creates a new RevisionIterator object. 
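
Judging from next() further below, the shouldLoadRevisionText flag controls when the text is materialized: left at its default (false), the iterator reconstructs the text of every revision while iterating; set to true, the iterator only attaches a RevisionApi to each Revision so that the text can, presumably, be fetched on demand later. A short usage sketch under that reading:

    RevisionIterator it = new RevisionIterator(config);
    it.setShouldLoadRevisionText(true);   // defer text reconstruction to lazy loading via RevisionApi
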
- * - * @param config Reference to the configuration object - * @param startPK Start index - * @param endPK End index - * @param connection Reference to the connection - * @throws WikiApiException if an error occurs - */ - public RevisionIterator(final RevisionAPIConfiguration config, final int startPK, final int endPK, - final Connection connection) throws WikiApiException { - - if (startPK < 0 || endPK < 0 || startPK > endPK || connection == null) { - throw new IllegalArgumentException("Illegal argument"); +public class RevisionIterator + extends AbstractRevisionService + implements RevisionIteratorInterface +{ + + private static final Logger logger = LoggerFactory.getLogger(RevisionIterator.class); + + /** + * Reference to the ResultSet + */ + private ResultSet result; + + /** + * Reference to the Statement + */ + private PreparedStatement statement; + + /** + * Binary Data Flag + */ + private boolean binaryData; + + /** + * Text of the previous revision + */ + private String previousRevision; + + /** + * Current primary key + */ + private int primaryKey; + + /** + * Primary key indicating the end of the data + */ + private int endPK; + + /** + * ID of the current article + */ + private int currentArticleID; + + /** + * The last known revision counter + */ + private int currentRevCounter; + + /** + * Configuration parameter - indicates the maximum size of a query. + */ + private final int MAX_NUMBER_RESULTS; + + /** + * Should load revision text? + */ + private boolean shouldLoadRevisionText; + + /** + * The revision-api for this iterator - used by the Revision object in case of lazy loading + */ + private RevisionApi revApi = null; + + public boolean shouldLoadRevisionText() + { + return shouldLoadRevisionText; } - this.primaryKey = startPK - 1; - this.endPK = endPK; - this.config = config; - - this.currentArticleID = -1; - this.currentRevCounter = -1; + public void setShouldLoadRevisionText(boolean shouldLoadRevisionText) + { + this.shouldLoadRevisionText = shouldLoadRevisionText; + } - MAX_NUMBER_RESULTS = config.getBufferSize(); + /** + * Creates a new RevisionIterator object. + * + * @param config + * Reference to the configuration object + * @param startPK + * Start index + * @param endPK + * End index + * @param connection + * Reference to the connection + * @throws WikiApiException + * if an error occurs + */ + public RevisionIterator(final RevisionAPIConfiguration config, final int startPK, + final int endPK, final Connection connection) + throws WikiApiException + { + + if (startPK < 0 || endPK < 0 || startPK > endPK || connection == null) { + throw new IllegalArgumentException("Illegal argument"); + } - this.connection = connection; - } + this.primaryKey = startPK - 1; + this.endPK = endPK; + this.config = config; - /** - * Creates a new RevisionIterator object. - * - * @param config Reference to the configuration object - * @param startPK Start index - * @throws WikiApiException if an error occurs - */ - public RevisionIterator(final RevisionAPIConfiguration config, final int startPK) throws WikiApiException { + this.currentArticleID = -1; + this.currentRevCounter = -1; - this(config); + MAX_NUMBER_RESULTS = config.getBufferSize(); - if (startPK < 0) { - throw new IllegalArgumentException("Illegal argument"); + this.connection = connection; } - this.primaryKey = startPK - 1; - } - - /** - * Creates a new RevisionIterator object. 
- * - * @param config Reference to the configuration object - * @param startPK Start index - * @param endPK End index - * @throws WikiApiException if an error occurs - */ - public RevisionIterator(final RevisionAPIConfiguration config, final int startPK, final int endPK) - throws WikiApiException { - - this(config, startPK); + /** + * Creates a new RevisionIterator object. + * + * @param config + * Reference to the configuration object + * @param startPK + * Start index + * @throws WikiApiException + * if an error occurs + */ + public RevisionIterator(final RevisionAPIConfiguration config, final int startPK) + throws WikiApiException + { + + this(config); + + if (startPK < 0) { + throw new IllegalArgumentException("Illegal argument"); + } - if (endPK < 0 || startPK > endPK) { - throw new IllegalArgumentException("Illegal argument"); + this.primaryKey = startPK - 1; } - this.endPK = endPK; - } - - /** - * Creates a new RevisionIterator object. - * - * @param config Reference to the configuration object - * @throws WikiApiException if an error occurs - */ - public RevisionIterator(final RevisionAPIConfiguration config) throws WikiApiException { - - this.config = config; - this.primaryKey = -1; - this.endPK = Integer.MAX_VALUE; - - this.statement = null; - this.result = null; - this.previousRevision = null; - MAX_NUMBER_RESULTS = config.getBufferSize(); - - connection = getConnection(config); - } - - /** - * Creates a new RevisionIterator object. - * - * @param config Reference to the configuration object - * @param shouldLoadRevisionText should load revision text - * @throws WikiApiException if an error occurs - */ - public RevisionIterator(final RevisionAPIConfiguration config, boolean shouldLoadRevisionText) - throws WikiApiException { - this(config); - this.shouldLoadRevisionText = shouldLoadRevisionText; - } - - public RevisionIterator(final DatabaseConfiguration db) throws WikiApiException { - this(getRevisionAPIConfig(db)); - } - - private static RevisionAPIConfiguration getRevisionAPIConfig(final DatabaseConfiguration db) { - RevisionAPIConfiguration revAPIConfig = new RevisionAPIConfiguration(); - - revAPIConfig.setHost(db.getHost()); - revAPIConfig.setDatabase(db.getDatabase()); - revAPIConfig.setDatabaseDriver(db.getDatabaseDriver()); - revAPIConfig.setJdbcURL(db.getJdbcURL()); - revAPIConfig.setUser(db.getUser()); - revAPIConfig.setPassword(db.getPassword()); - revAPIConfig.setLanguage(db.getLanguage()); - - return revAPIConfig; - } - - /** - * Sends the query to the database and stores the result. The {@link java.sql.Statement} and - * {@link ResultSet} connection will not be closed. - * - * @return {@code true}, if the result set has another element {@code false}, otherwise - * @throws SQLException if an error occurs while accessing the database. - */ - private boolean query() throws SQLException { - String query = "SELECT PrimaryKey, Revision, RevisionCounter," - + " RevisionID, ArticleID, Timestamp, FullRevisionID, ContributorName, ContributorId, Comment, Minor, ContributorIsRegistered " - + "FROM revisions"; - - if (primaryKey > 0) { - query += " WHERE PrimaryKey > " + primaryKey; - } + /** + * Creates a new RevisionIterator object. 
+ * + * @param config + * Reference to the configuration object + * @param startPK + * Start index + * @param endPK + * End index + * @throws WikiApiException + * if an error occurs + */ + public RevisionIterator(final RevisionAPIConfiguration config, final int startPK, + final int endPK) + throws WikiApiException + { + + this(config, startPK); + + if (endPK < 0 || startPK > endPK) { + throw new IllegalArgumentException("Illegal argument"); + } - if (MAX_NUMBER_RESULTS > 0) { - query += " LIMIT "; + this.endPK = endPK; + } - if (primaryKey + MAX_NUMBER_RESULTS > endPK) { - query += (endPK - primaryKey + 1); // TODO: +1 ? - } else { - query += MAX_NUMBER_RESULTS; - } + /** + * Creates a new RevisionIterator object. + * + * @param config + * Reference to the configuration object + * @throws WikiApiException + * if an error occurs + */ + public RevisionIterator(final RevisionAPIConfiguration config) throws WikiApiException + { + + this.config = config; + this.primaryKey = -1; + this.endPK = Integer.MAX_VALUE; + + this.statement = null; + this.result = null; + this.previousRevision = null; + MAX_NUMBER_RESULTS = config.getBufferSize(); - } else if (endPK != Integer.MAX_VALUE) { - query += " LIMIT " + (endPK - primaryKey + 1); + connection = getConnection(config); } - try { - statement = this.connection.prepareStatement(query); - result = statement.executeQuery(); - } catch (Exception e) { - logger.error(e.getLocalizedMessage(), e); - try { - boolean connectionReady = !connection.isClosed() && connection.isValid(5); - logger.debug("Connection ready: {}", connectionReady); - if (!connectionReady) { - connection = getConnection(config); - } - statement = this.connection.prepareStatement(query); - result = statement.executeQuery(query); - } catch (WikiApiException wae) { - logger.error(wae.getLocalizedMessage(), wae); - } + /** + * Creates a new RevisionIterator object. + * + * @param config + * Reference to the configuration object + * @param shouldLoadRevisionText + * should load revision text + * @throws WikiApiException + * if an error occurs + */ + public RevisionIterator(final RevisionAPIConfiguration config, boolean shouldLoadRevisionText) + throws WikiApiException + { + this(config); + this.shouldLoadRevisionText = shouldLoadRevisionText; } - - if (result.next()) { - binaryData = result.getMetaData().getColumnType(2) == Types.LONGVARBINARY; - return true; + public RevisionIterator(final DatabaseConfiguration db) throws WikiApiException + { + this(getRevisionAPIConfig(db)); } - return false; - } + private static RevisionAPIConfiguration getRevisionAPIConfig(final DatabaseConfiguration db) + { + RevisionAPIConfiguration revAPIConfig = new RevisionAPIConfiguration(); - /** - * Returns the next revision. - * - * @return next revision - */ - @Override - public Revision next() { - try { + revAPIConfig.setHost(db.getHost()); + revAPIConfig.setDatabase(db.getDatabase()); + revAPIConfig.setDatabaseDriver(db.getDatabaseDriver()); + revAPIConfig.setJdbcURL(db.getJdbcURL()); + revAPIConfig.setUser(db.getUser()); + revAPIConfig.setPassword(db.getPassword()); + revAPIConfig.setLanguage(db.getLanguage()); - int revCount, articleID; + return revAPIConfig; + } - revCount = result.getInt(3); - articleID = result.getInt(5); + /** + * Sends the query to the database and stores the result. The {@link java.sql.Statement} and + * {@link ResultSet} connection will not be closed. 
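
getRevisionAPIConfig above copies the connection settings from a plain JWPL DatabaseConfiguration, so an iterator can be created from the same configuration object used elsewhere in the API. A sketch of that constructor path, assuming the usual DatabaseConfiguration setters (setHost, setDatabase, setUser, setPassword, setLanguage) and the WikiConstants.Language enum:

    DatabaseConfiguration db = new DatabaseConfiguration();
    db.setHost("localhost");
    db.setDatabase("en_wiki");
    db.setUser("root");
    db.setPassword("1234");
    db.setLanguage(Language.english);

    RevisionIterator revisions = new RevisionIterator(db);   // delegates to getRevisionAPIConfig(db)
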
+ * + * @return {@code true}, if the result set has another element {@code false}, otherwise + * @throws SQLException + * if an error occurs while accessing the database. + */ + private boolean query() throws SQLException + { + String query = "SELECT PrimaryKey, Revision, RevisionCounter," + + " RevisionID, ArticleID, Timestamp, FullRevisionID, ContributorName, ContributorId, Comment, Minor, ContributorIsRegistered " + + "FROM revisions"; + + if (primaryKey > 0) { + query += " WHERE PrimaryKey > " + primaryKey; + } - if (articleID != this.currentArticleID) { - this.currentRevCounter = 0; - this.currentArticleID = articleID; - } + if (MAX_NUMBER_RESULTS > 0) { + query += " LIMIT "; - if (revCount - 1 != this.currentRevCounter) { + if (primaryKey + MAX_NUMBER_RESULTS > endPK) { + query += (endPK - primaryKey + 1); // TODO: +1 ? + } + else { + query += MAX_NUMBER_RESULTS; + } - logger.error("Invalid RevCounter -" + " [ArticleId " - + articleID + ", RevisionId " + result.getInt(4) - + ", RevisionCounter " + revCount + "] - Expected: " - + (this.currentRevCounter + 1)); + } + else if (endPK != Integer.MAX_VALUE) { + query += " LIMIT " + (endPK - primaryKey + 1); + } - this.currentRevCounter = revCount; - this.previousRevision = null; + try { + statement = this.connection.prepareStatement(query); + result = statement.executeQuery(); + } + catch (Exception e) { + logger.error(e.getLocalizedMessage(), e); + try { + boolean connectionReady = !connection.isClosed() && connection.isValid(5); + logger.debug("Connection ready: {}", connectionReady); + if (!connectionReady) { + connection = getConnection(config); + } + statement = this.connection.prepareStatement(query); + result = statement.executeQuery(query); + } + catch (WikiApiException wae) { + logger.error(wae.getLocalizedMessage(), wae); + } + } - return null; - } + if (result.next()) { + binaryData = result.getMetaData().getColumnType(2) == Types.LONGVARBINARY; + return true; + } - this.currentRevCounter = revCount; - this.primaryKey = result.getInt(1); + return false; + } - Revision revision = new Revision(revCount); - revision.setPrimaryKey(this.primaryKey); - if (!shouldLoadRevisionText) { - String currentRevision; + /** + * Returns the next revision. 
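
query() above pages through the revisions table by primary key rather than by OFFSET: every batch starts just after the last PrimaryKey that was read and is capped by the configured buffer size, and hasNext() issues the next batch once the current ResultSet is exhausted. Stripped of the reconnect handling, the windowing amounts to the following sketch; it tightens the upper bound with an explicit predicate instead of the LIMIT arithmetic used above, but the pagination idea is the same:

    private static void scanRevisions(Connection connection, int startPK, int endPK, int bufferSize)
        throws SQLException
    {
        int lastKey = startPK - 1;
        boolean more = true;
        while (more && lastKey < endPK) {
            try (PreparedStatement stmt = connection.prepareStatement(
                    "SELECT PrimaryKey, Revision FROM revisions"
                            + " WHERE PrimaryKey > ? AND PrimaryKey <= ? LIMIT " + bufferSize)) {
                stmt.setInt(1, lastKey);
                stmt.setInt(2, endPK);
                try (ResultSet rs = stmt.executeQuery()) {
                    more = false;
                    while (rs.next()) {
                        more = true;
                        lastKey = rs.getInt(1);   // resume after the last key seen, not at an offset
                        // ... decode column 2 as next() does below ...
                    }
                }
            }
        }
    }
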
+ * + * @return next revision + */ + @Override + public Revision next() + { + try { - Diff diff; - RevisionDecoder decoder = new RevisionDecoder( - config.getCharacterSet()); + int revCount, articleID; + + revCount = result.getInt(3); + articleID = result.getInt(5); + + if (articleID != this.currentArticleID) { + this.currentRevCounter = 0; + this.currentArticleID = articleID; + } + + if (revCount - 1 != this.currentRevCounter) { + + logger.error("Invalid RevCounter -" + " [ArticleId " + articleID + ", RevisionId " + + result.getInt(4) + ", RevisionCounter " + revCount + "] - Expected: " + + (this.currentRevCounter + 1)); + + this.currentRevCounter = revCount; + this.previousRevision = null; + + return null; + } + + this.currentRevCounter = revCount; + this.primaryKey = result.getInt(1); + + Revision revision = new Revision(revCount); + revision.setPrimaryKey(this.primaryKey); + if (!shouldLoadRevisionText) { + String currentRevision; + + Diff diff; + RevisionDecoder decoder = new RevisionDecoder(config.getCharacterSet()); + + if (binaryData) { + decoder.setInput(result.getBinaryStream(2), true); + } + else { + decoder.setInput(result.getString(2)); + } + diff = decoder.decode(); + + try { + currentRevision = diff.buildRevision(previousRevision); + } + catch (Exception e) { + this.previousRevision = null; + logger.error("Reconstruction failed -" + " [ArticleId " + result.getInt(5) + + ", RevisionId " + result.getInt(4) + ", RevisionCounter " + + result.getInt(3) + "]"); + return null; + } + + previousRevision = currentRevision; + revision.setRevisionText(currentRevision); + } + else { + if (revApi == null) { + revApi = new RevisionApi(config); + } + revision.setRevisionApi(revApi); + } + + revision.setRevisionID(result.getInt(4)); + revision.setArticleID(articleID); + revision.setTimeStamp(new Timestamp(result.getLong(6))); + revision.setFullRevisionID(result.getInt(7)); + revision.setContributorName(result.getString(8)); + revision.setContributorId(result.getInt(9)); + revision.setComment(result.getString(10)); + revision.setMinor(result.getBoolean(11)); + revision.setContributorIsRegistered(result.getBoolean(12)); + + return revision; - if (binaryData) { - decoder.setInput(result.getBinaryStream(2), true); - } else { - decoder.setInput(result.getString(2)); } - diff = decoder.decode(); + catch (DecodingException | SQLException | IOException | WikiApiException e) { + throw new RuntimeException(e); + } + } + /** + * Returns whether another revision is available or not. + */ + @Override + public boolean hasNext() + { try { - currentRevision = diff.buildRevision(previousRevision); - } catch (Exception e) { - this.previousRevision = null; - logger.error("Reconstruction failed -" - + " [ArticleId " + result.getInt(5) - + ", RevisionId " + result.getInt(4) - + ", RevisionCounter " + result.getInt(3) + "]"); - return null; - } + if (result != null && result.next()) { + return true; + } + + // Close old queries + if (this.statement != null) { + this.statement.close(); + } + if (this.result != null) { + this.result.close(); + } + + if (primaryKey <= endPK) { // TODO: <= ? 
+ return query(); + } + + return false; - previousRevision = currentRevision; - revision.setRevisionText(currentRevision); - } else { - if (revApi == null) { - revApi = new RevisionApi(config); } - revision.setRevisionApi(revApi); - } - - revision.setRevisionID(result.getInt(4)); - revision.setArticleID(articleID); - revision.setTimeStamp(new Timestamp(result.getLong(6))); - revision.setFullRevisionID(result.getInt(7)); - revision.setContributorName(result.getString(8)); - revision.setContributorId(result.getInt(9)); - revision.setComment(result.getString(10)); - revision.setMinor(result.getBoolean(11)); - revision.setContributorIsRegistered(result.getBoolean(12)); - - return revision; - - } catch (DecodingException | SQLException | IOException | WikiApiException e) { - throw new RuntimeException(e); - } - } - - /** - * Returns whether another revision is available or not. - */ - @Override - public boolean hasNext() { - try { - if (result != null && result.next()) { - return true; - } - - // Close old queries - if (this.statement != null) { - this.statement.close(); - } - if (this.result != null) { - this.result.close(); - } - - if (primaryKey <= endPK) { // TODO: <= ? - return query(); - } - - return false; - - } catch (SQLException e) { - throw new RuntimeException(e); + catch (SQLException e) { + throw new RuntimeException(e); + } } - } - /** - * This method is unsupported and will result in a {@link UnsupportedOperationException}. - * - * @deprecated Don't call this method as it will result in an exception at runtime. - */ - @Override - @Deprecated(since = "1.1") - public void remove() { - throw new UnsupportedOperationException(); - } + /** + * This method is unsupported and will result in a {@link UnsupportedOperationException}. + * + * @deprecated Don't call this method as it will result in an exception at runtime. + */ + @Override + @Deprecated(since = "1.1") + public void remove() + { + throw new UnsupportedOperationException(); + } - @Deprecated(since = "1.1", forRemoval = true) - // TODO This should go into a demo or test class separated from the code here... - public static void main(final String[] args) throws Exception { + @Deprecated(since = "1.1", forRemoval = true) + // TODO This should go into a demo or test class separated from the code here... 
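
Two properties of the iterator matter to callers: next() returns null instead of throwing when a revision cannot be reconstructed (for example on an unexpected revision counter), and close(), inherited from RevisionIteratorInterface, releases the underlying resources. A typical consumption loop therefore looks like this sketch; process() is a hypothetical consumer:

    RevisionIterator it = new RevisionIterator(config);
    try {
        while (it.hasNext()) {
            Revision rev = it.next();
            if (rev == null) {
                continue;            // reconstruction failed for this revision, skip it
            }
            process(rev);            // hypothetical consumer
        }
    }
    finally {
        it.close();                  // see RevisionIteratorInterface below
    }
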
+ public static void main(final String[] args) throws Exception + { - RevisionAPIConfiguration config = new RevisionAPIConfiguration(); - config.setHost("localhost"); - config.setDatabase("en_wiki"); - config.setUser("root"); - config.setPassword("1234"); + RevisionAPIConfiguration config = new RevisionAPIConfiguration(); + config.setHost("localhost"); + config.setDatabase("en_wiki"); + config.setUser("root"); + config.setPassword("1234"); - config.setCharacterSet("UTF-8"); - config.setBufferSize(20000); - config.setMaxAllowedPacket(16 * 1024 * 1023); + config.setCharacterSet("UTF-8"); + config.setBufferSize(20000); + config.setMaxAllowedPacket(16 * 1024 * 1023); - long count = 1; - long start = System.currentTimeMillis(); + long count = 1; + long start = System.currentTimeMillis(); - Revision rev; - Iterator<Revision> it = new RevisionIterator(config); + Revision rev; + Iterator<Revision> it = new RevisionIterator(config); - System.out.println(Time.toClock(System.currentTimeMillis() - start)); + System.out.println(Time.toClock(System.currentTimeMillis() - start)); - while (it.hasNext()) { - rev = it.next(); + while (it.hasNext()) { + rev = it.next(); - if (count++ % 10000 == 0) { + if (count++ % 10000 == 0) { - if (rev != null) { - System.out.println(rev); + if (rev != null) { + System.out.println(rev); + } + } } - } - } - // w.close(); - System.out.println(Time.toClock(System.currentTimeMillis() - start)); - } + // w.close(); + System.out.println(Time.toClock(System.currentTimeMillis() - start)); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIteratorInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIteratorInterface.java index 89589f87..0c149627 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIteratorInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/RevisionIteratorInterface.java @@ -24,16 +24,20 @@ /** * Extends the generic {@link java.util.Iterator} interface with a {@link #close()} method. * <p> - * Since the {@link IOException} does not have inner exception in JAVA 1.5 the close - * method has to throw both exception for both input components. + * Since the {@link IOException} does not have inner exception in JAVA 1.5 the close method has to + * throw both exception for both input components. */ -public interface RevisionIteratorInterface extends Iterator<Revision> { +public interface RevisionIteratorInterface + extends Iterator<Revision> +{ - /** - * Closes the reader or connection to the input component. - * - * @throws IOException if an error occurs while reading from the input archive. - * @throws SQLException if an error occurs while accessing the sql database. - */ - void close() throws IOException, SQLException; + /** + * Closes the reader or connection to the input component. + * + * @throws IOException + * if an error occurs while reading from the input archive. + * @throws SQLException + * if an error occurs while accessing the sql database. 
+ */ + void close() throws IOException, SQLException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoFullRevision.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoFullRevision.java index 19e82dfb..fe7bf3fe 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoFullRevision.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoFullRevision.java @@ -22,323 +22,349 @@ import org.dkpro.jwpl.revisionmachine.api.Revision; -public class ChronoFullRevision { - - /** - * PrimaryKey of the full revision - */ - private final int fullRevisionPK; - - /** - * First revision counter / revision counter of the full revision - */ - private final int startRC; - - /** - * Last revision counter based on the full revision - */ - private final int endRC; - - /** - * Reference to the chrono storage block - */ - private ChronoStorageBlock first; - - /** - * Set containing the IDs of revisions that could be reconstructed - */ - private final Set<Integer> set; - - /** - * Link to the next full revision - */ - private ChronoFullRevision next; - - /** - * Link to the previous full revision - */ - private ChronoFullRevision prev; - - /** - * Number of bytes contained in this object - */ - private long size; - - /** - * (Constructor) Creates a new ChronoFullRevision object. - * - * @param fullRevisionPK primary key of a full revision - * @param startRC revision counter of the full revision - * @param endRC last revision counter based on the full revision - */ - public ChronoFullRevision(final int fullRevisionPK, final int startRC, - final int endRC) { - - this.fullRevisionPK = fullRevisionPK; - this.startRC = startRC; - this.endRC = endRC; - - this.size = 0; - - this.set = new HashSet<>(); - for (int i = startRC; i <= endRC; i++) { - this.set.add(i); +public class ChronoFullRevision +{ + + /** + * PrimaryKey of the full revision + */ + private final int fullRevisionPK; + + /** + * First revision counter / revision counter of the full revision + */ + private final int startRC; + + /** + * Last revision counter based on the full revision + */ + private final int endRC; + + /** + * Reference to the chrono storage block + */ + private ChronoStorageBlock first; + + /** + * Set containing the IDs of revisions that could be reconstructed + */ + private final Set<Integer> set; + + /** + * Link to the next full revision + */ + private ChronoFullRevision next; + + /** + * Link to the previous full revision + */ + private ChronoFullRevision prev; + + /** + * Number of bytes contained in this object + */ + private long size; + + /** + * (Constructor) Creates a new ChronoFullRevision object. + * + * @param fullRevisionPK + * primary key of a full revision + * @param startRC + * revision counter of the full revision + * @param endRC + * last revision counter based on the full revision + */ + public ChronoFullRevision(final int fullRevisionPK, final int startRC, final int endRC) + { + + this.fullRevisionPK = fullRevisionPK; + this.startRC = startRC; + this.endRC = endRC; + + this.size = 0; + + this.set = new HashSet<>(); + for (int i = startRC; i <= endRC; i++) { + this.set.add(i); + } + } + + /** + * Returns the reference to the ChronoStorageBlock. + * + * @return chrono storage block + */ + public ChronoStorageBlock getFirst() + { + return this.first; + } + + /** + * Sets the reference of the ChronoStorageBlock. 
+ * + * @param block + * chrono storage block + */ + public void setFirst(final ChronoStorageBlock block) + { + this.first = block; } - } - - /** - * Returns the reference to the ChronoStorageBlock. - * - * @return chrono storage block - */ - public ChronoStorageBlock getFirst() { - return this.first; - } - - /** - * Sets the reference of the ChronoStorageBlock. - * - * @param block chrono storage block - */ - public void setFirst(final ChronoStorageBlock block) { - this.first = block; - } - - /** - * Adds a ChonoStorageBlock to this chrono full revision object. - * - * @param block reference to the chrono storage block - */ - public void add(final ChronoStorageBlock block) { - - int revCount = block.getRevisionCounter(); - this.size += block.length(); - - if (first == null) { - first = block; - } else { - - ChronoStorageBlock previous = null, current = first; - do { - if (revCount < current.getRevisionCounter()) { - - block.setCounterPrev(previous); - block.setCounterNext(current); - - if (previous != null) { - previous.setCounterNext(block); - } - current.setCounterPrev(block); + /** + * Adds a ChonoStorageBlock to this chrono full revision object. + * + * @param block + * reference to the chrono storage block + */ + public void add(final ChronoStorageBlock block) + { - if (current == this.first) { - this.first = block; - } + int revCount = block.getRevisionCounter(); + this.size += block.length(); - return; + if (first == null) { + first = block; } + else { + + ChronoStorageBlock previous = null, current = first; + do { + if (revCount < current.getRevisionCounter()) { + + block.setCounterPrev(previous); + block.setCounterNext(current); + + if (previous != null) { + previous.setCounterNext(block); + } + + current.setCounterPrev(block); - previous = current; - current = current.getCounterNext(); + if (current == this.first) { + this.first = block; + } - } - while (current != null); + return; + } - // Add to end of list - previous.setCounterNext(block); - block.setCounterPrev(previous); + previous = current; + current = current.getCounterNext(); + + } + while (current != null); + + // Add to end of list + previous.setCounterNext(block); + block.setCounterPrev(previous); + } } - } - - /** - * Returns the nearest available revision to the specified revision counter. - * - * @param revisionCounter revision counter - * @return Revision - */ - public Revision getNearest(final int revisionCounter) { - - if (first != null) { - - ChronoStorageBlock previous = null, current = first; - while (current != null - && current.getRevisionCounter() <= revisionCounter) { - previous = current; - current = current.getCounterNext(); - } - - return previous.getRev(); + + /** + * Returns the nearest available revision to the specified revision counter. + * + * @param revisionCounter + * revision counter + * @return Revision + */ + public Revision getNearest(final int revisionCounter) + { + + if (first != null) { + + ChronoStorageBlock previous = null, current = first; + while (current != null && current.getRevisionCounter() <= revisionCounter) { + previous = current; + current = current.getCounterNext(); + } + + return previous.getRev(); + } + + return null; } - return null; - } - - /** - * Removes the revision counter from the list of reconstructble revisions. 
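
getNearest above walks the counter-ordered block list and returns the last block whose revision counter does not exceed the requested one, which is a floor lookup. For illustration only, the same lookup expressed over a java.util.TreeMap keyed by revision counter; the class itself keeps the hand-rolled doubly linked list, which also supports the unlinking done in clean():

    TreeMap<Integer, Revision> byCounter = new TreeMap<>();
    // byCounter.put(block.getRevisionCounter(), block.getRev()) whenever a block is added

    Revision nearest(int revisionCounter)
    {
        Map.Entry<Integer, Revision> e = byCounter.floorEntry(revisionCounter);
        return e == null ? null : e.getValue();   // greatest counter <= revisionCounter, else null
    }
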
- * - * @param revisionCounter revision counter - */ - public void remove(final int revisionCounter) { - this.set.remove(revisionCounter); - if (this.set.isEmpty()) { - clean(0, 0); + /** + * Removes the revision counter from the list of reconstructble revisions. + * + * @param revisionCounter + * revision counter + */ + public void remove(final int revisionCounter) + { + this.set.remove(revisionCounter); + if (this.set.isEmpty()) { + clean(0, 0); + } } - } - - /** - * Returns whether more revisions can be reconstructed by the use of this - * chrono full revision. - * - * @return TRUE | FALSE - */ - public boolean isEmpty() { - return this.set.isEmpty(); - } - - /** - * Returns the next chrono full revision. - * - * @return next chrono full revision - */ - public ChronoFullRevision getNext() { - return next; - } - - /** - * Sets the link to the next chrono full revision. - * - * @param next next chrono full revision - */ - public void setNext(final ChronoFullRevision next) { - this.next = next; - } - - /** - * Returns the previous chrono full revision. - * - * @return previous chrono full revision - */ - public ChronoFullRevision getPrev() { - return prev; - } - - /** - * Sets the link to the previous chrono full revision. - * - * @param prev previous chrono full revision - */ - public void setPrev(final ChronoFullRevision prev) { - this.prev = prev; - } - - /** - * Reduces the storage space. - * - * @param currentRevisionIndex index of the current revision - * @param revisionIndex index of the revision - * @return size of used storage - */ - public long clean(final int currentRevisionIndex, final int revisionIndex) { - - if (first == null) { - return 0; - } else if (this.set.isEmpty()) { - this.first = null; - this.size = 0; - return 0; + + /** + * Returns whether more revisions can be reconstructed by the use of this chrono full revision. + * + * @return TRUE | FALSE + */ + public boolean isEmpty() + { + return this.set.isEmpty(); } - ChronoStorageBlock next, prev, current = first; - boolean remove; + /** + * Returns the next chrono full revision. + * + * @return next chrono full revision + */ + public ChronoFullRevision getNext() + { + return next; + } - do { - remove = false; + /** + * Sets the link to the next chrono full revision. + * + * @param next + * next chrono full revision + */ + public void setNext(final ChronoFullRevision next) + { + this.next = next; + } - if (current.isDelivered()) { + /** + * Returns the previous chrono full revision. + * + * @return previous chrono full revision + */ + public ChronoFullRevision getPrev() + { + return prev; + } - next = current.getCounterNext(); + /** + * Sets the link to the previous chrono full revision. + * + * @param prev + * previous chrono full revision + */ + public void setPrev(final ChronoFullRevision prev) + { + this.prev = prev; + } - if (next != null) { - if (current.getRevisionCounter() + 1 == next - .getRevisionCounter()) { - remove = true; - } + /** + * Reduces the storage space. 
+ * + * @param currentRevisionIndex + * index of the current revision + * @param revisionIndex + * index of the revision + * @return size of used storage + */ + public long clean(final int currentRevisionIndex, final int revisionIndex) + { + + if (first == null) { + return 0; + } + else if (this.set.isEmpty()) { + this.first = null; + this.size = 0; + return 0; } - } else if (current.getIndexNext() == null - && current.getIndexPrev() == null) { + ChronoStorageBlock next, prev, current = first; + boolean remove; - remove = (current.getRevisionIndex() < currentRevisionIndex) - || (current.getRevisionIndex() == revisionIndex); - } + do { + remove = false; - if (remove) { - // System.out.println("Clearn CFR : " + - // current.getRevisionCounter()); + if (current.isDelivered()) { - prev = current.getCounterPrev(); - next = current.getCounterNext(); + next = current.getCounterNext(); - current.setCounterNext(null); - current.setCounterPrev(null); + if (next != null) { + if (current.getRevisionCounter() + 1 == next.getRevisionCounter()) { + remove = true; + } + } + + } + else if (current.getIndexNext() == null && current.getIndexPrev() == null) { + + remove = (current.getRevisionIndex() < currentRevisionIndex) + || (current.getRevisionIndex() == revisionIndex); + } + + if (remove) { + // System.out.println("Clearn CFR : " + + // current.getRevisionCounter()); + + prev = current.getCounterPrev(); + next = current.getCounterNext(); + + current.setCounterNext(null); + current.setCounterPrev(null); + + if (prev != null) { + prev.setCounterNext(next); + } + if (next != null) { + next.setCounterPrev(prev); + } + if (current == first) { + this.first = next; + } + + this.size -= current.length(); + current = next; + } + + if (current != null) { + current = current.getCounterNext(); + } - if (prev != null) { - prev.setCounterNext(next); - } - if (next != null) { - next.setCounterPrev(prev); - } - if (current == first) { - this.first = next; } + while (current != null); + + return this.size; + } - this.size -= current.length(); - current = next; - } + /** + * Returns the size of this chrono full revision. + * + * @return size + */ + public long size() + { + return this.size; + } - if (current != null) { - current = current.getCounterNext(); - } + /** + * Returns the last revision counter based on this full revision. + * + * @return last revision counter + */ + public int getEndRC() + { + return endRC; + } + + /** + * Returns the pk of the full revision. + * + * @return pk of the full revision + */ + public int getFullRevisionPK() + { + return fullRevisionPK; + } + /** + * Returns the revision counter of the full revision. + * + * @return first revision counter + */ + public int getStartRC() + { + return startRC; } - while (current != null); - - return this.size; - } - - /** - * Returns the size of this chrono full revision. - * - * @return size - */ - public long size() { - return this.size; - } - - /** - * Returns the last revision counter based on this full revision. - * - * @return last revision counter - */ - public int getEndRC() { - return endRC; - } - - /** - * Returns the pk of the full revision. - * - * @return pk of the full revision - */ - public int getFullRevisionPK() { - return fullRevisionPK; - } - - /** - * Returns the revision counter of the full revision. 
- * - * @return first revision counter - */ - public int getStartRC() { - return startRC; - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoIterator.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoIterator.java index 079623e8..7919d4c7 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoIterator.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoIterator.java @@ -33,332 +33,342 @@ /** * ChronoIterator Iterates articles in chronological order. */ -public class ChronoIterator { - - /** - * Reference to the configuration - */ - private final RevisionAPIConfiguration config; - - /** - * Reference to the database connection - */ - private final Connection connection; - - /** - * Reference to the ChronoStorage - */ - private final ChronoStorage chronoStorage; - - /** - * currently used article pk - */ - private final int articlePK; - - /** - * revision index - */ - private int revisionIndex; - - /** - * maximum revision - */ - private final int maxRevision; - - /** - * ChronoFullRevision Storage - */ - private final Map<Integer, ChronoFullRevision> fullRevStorage; - - /** - * Mapping chronological position to revision counter - */ - private final Map<Integer, Integer> mappingStorage; - - /** - * (Constructor) Create a ChronoIterator object - * - * @param config reference to the configuration - * @param connection reference to the database connection - * @param mapping mapping (chrono counter to revision counter) - * @param fullRevisionPKs space separated list of full revision pks - * @param revisionCounters space separated list of revision counter intervals - */ - public ChronoIterator(final RevisionAPIConfiguration config, - final Connection connection, final String mapping, - final String fullRevisionPKs, final String revisionCounters) { - - this.config = config; - this.connection = connection; - - int index = fullRevisionPKs.indexOf(' '); - if (index == -1) { - index = fullRevisionPKs.length(); - } +public class ChronoIterator +{ + + /** + * Reference to the configuration + */ + private final RevisionAPIConfiguration config; + + /** + * Reference to the database connection + */ + private final Connection connection; + + /** + * Reference to the ChronoStorage + */ + private final ChronoStorage chronoStorage; + + /** + * currently used article pk + */ + private final int articlePK; + + /** + * revision index + */ + private int revisionIndex; + + /** + * maximum revision + */ + private final int maxRevision; + + /** + * ChronoFullRevision Storage + */ + private final Map<Integer, ChronoFullRevision> fullRevStorage; + + /** + * Mapping chronological position to revision counter + */ + private final Map<Integer, Integer> mappingStorage; + + /** + * (Constructor) Create a ChronoIterator object + * + * @param config + * reference to the configuration + * @param connection + * reference to the database connection + * @param mapping + * mapping (chrono counter to revision counter) + * @param fullRevisionPKs + * space separated list of full revision pks + * @param revisionCounters + * space separated list of revision counter intervals + */ + public ChronoIterator(final RevisionAPIConfiguration config, final Connection connection, + final String mapping, final String fullRevisionPKs, final String revisionCounters) + { + + this.config = config; + this.connection = connection; + + int index = 
fullRevisionPKs.indexOf(' '); + if (index == -1) { + index = fullRevisionPKs.length(); + } - articlePK = Integer.parseInt(fullRevisionPKs.substring(0, index)); + articlePK = Integer.parseInt(fullRevisionPKs.substring(0, index)); - index = revisionCounters.lastIndexOf(' '); - if (index == -1) { - throw new RuntimeException("Invalid revisioncounter content"); - } + index = revisionCounters.lastIndexOf(' '); + if (index == -1) { + throw new RuntimeException("Invalid revisioncounter content"); + } - this.revisionIndex = 0; - this.maxRevision = Integer.parseInt(revisionCounters.substring( - index + 1, revisionCounters.length())); + this.revisionIndex = 0; + this.maxRevision = Integer + .parseInt(revisionCounters.substring(index + 1, revisionCounters.length())); - Map<Integer, Integer> reverseMappingStorage = new HashMap<>(); + Map<Integer, Integer> reverseMappingStorage = new HashMap<>(); - this.mappingStorage = new HashMap<>(); - this.fullRevStorage = new HashMap<>(); + this.mappingStorage = new HashMap<>(); + this.fullRevStorage = new HashMap<>(); - ChronoFullRevision previous = null, current, firstCFR = null; + ChronoFullRevision previous = null, current, firstCFR = null; - int length; - int revC, mapC; + int length; + int revC, mapC; - int max = mapping.length(); - length = 0; + int max = mapping.length(); + length = 0; - // Creates the mapping information for each revision - while (length < max) { + // Creates the mapping information for each revision + while (length < max) { - // Read revisionCounter - index = mapping.indexOf(' ', length); - revC = Integer.parseInt(mapping.substring(length, index)); - length = index + 1; + // Read revisionCounter + index = mapping.indexOf(' ', length); + revC = Integer.parseInt(mapping.substring(length, index)); + length = index + 1; - // Read mappedCounter - index = mapping.indexOf(' ', length); - if (index == -1) { - index = mapping.length(); - } - mapC = Integer.parseInt(mapping.substring(length, index)); - length = index + 1; + // Read mappedCounter + index = mapping.indexOf(' ', length); + if (index == -1) { + index = mapping.length(); + } + mapC = Integer.parseInt(mapping.substring(length, index)); + length = index + 1; - reverseMappingStorage.put(revC, mapC); - mappingStorage.put(mapC, revC); - } + reverseMappingStorage.put(revC, mapC); + mappingStorage.put(mapC, revC); + } - length = 0; - max = revisionCounters.length(); - int fullRevPK, lengthFR = 0; - - // Creates the full revision blocks for each full revision - while (length < max) { - - // Read fullRevisionPK (as string) - index = fullRevisionPKs.indexOf(' ', lengthFR); - if (index == -1) { - index = fullRevisionPKs.length(); - } - - fullRevPK = Integer.parseInt(fullRevisionPKs.substring(lengthFR, - index)); - lengthFR = index + 1; - - // Read start RC - index = revisionCounters.indexOf(' ', length); - revC = Integer.parseInt(revisionCounters.substring(length, index)); - length = index + 1; - - // Read end RC - index = revisionCounters.indexOf(' ', length); - if (index == -1) { - index = revisionCounters.length(); - } - mapC = Integer.parseInt(revisionCounters.substring(length, index)); - length = index + 1; - - // Constructs a double linked list containing the full revision - current = new ChronoFullRevision(fullRevPK, revC, mapC); - if (firstCFR == null) { - firstCFR = current; - } else { - current.setPrev(previous); - previous.setNext(current); - } - - // Add index information for each revision contained in such - // a block - for (int i = revC; i <= mapC; i++) { - fullRevStorage.put(i, 
current); - } - - previous = current; - } + length = 0; + max = revisionCounters.length(); + int fullRevPK, lengthFR = 0; + + // Creates the full revision blocks for each full revision + while (length < max) { + + // Read fullRevisionPK (as string) + index = fullRevisionPKs.indexOf(' ', lengthFR); + if (index == -1) { + index = fullRevisionPKs.length(); + } + + fullRevPK = Integer.parseInt(fullRevisionPKs.substring(lengthFR, index)); + lengthFR = index + 1; + + // Read start RC + index = revisionCounters.indexOf(' ', length); + revC = Integer.parseInt(revisionCounters.substring(length, index)); + length = index + 1; + + // Read end RC + index = revisionCounters.indexOf(' ', length); + if (index == -1) { + index = revisionCounters.length(); + } + mapC = Integer.parseInt(revisionCounters.substring(length, index)); + length = index + 1; + + // Constructs a double linked list containing the full revision + current = new ChronoFullRevision(fullRevPK, revC, mapC); + if (firstCFR == null) { + firstCFR = current; + } + else { + current.setPrev(previous); + previous.setNext(current); + } + + // Add index information for each revision contained in such + // a block + for (int i = revC; i <= mapC; i++) { + fullRevStorage.put(i, current); + } + + previous = current; + } - // Create ChronoStorage object - this.chronoStorage = new ChronoStorage(config, reverseMappingStorage, - firstCFR, fullRevStorage); - } - - /** - * Returns if all revision have retrieved. - * - * @return - */ - public boolean hasNext() { - return ++revisionIndex <= maxRevision; - } - - /** - * Returns the next revision. - * - * @return next revision - */ - public Revision next() - throws Exception { - - // Checks whether the next revision has already been reconstructed. - Revision revision; - if (chronoStorage.isTop(revisionIndex)) { - - // If this is the case the revision will removed from the storage - return chronoStorage.remove(); + // Create ChronoStorage object + this.chronoStorage = new ChronoStorage(config, reverseMappingStorage, firstCFR, + fullRevStorage); } - // Otherwise the chronological order counter will be mapped to the - // revsision counter - int revCount = revisionIndex; - if (mappingStorage.containsKey(revisionIndex)) { - revCount = mappingStorage.get(revisionIndex); + /** + * Returns if all revision have retrieved. + * + * @return + */ + public boolean hasNext() + { + return ++revisionIndex <= maxRevision; } - // Retrieve the related full revision block - ChronoFullRevision cfr = fullRevStorage.get(revCount); + /** + * Returns the next revision. + * + * @return next revision + */ + public Revision next() throws Exception + { + + // Checks whether the next revision has already been reconstructed. 
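
The constructor above parses three space-separated strings by hand: mapping holds pairs of revision counter and chronological counter, fullRevisionPKs holds one primary key per full-revision block (its first entry doubling as articlePK), and revisionCounters holds a start/end counter pair per block. Assuming exactly that format, the same parsing can be sketched with String.split:

    Map<Integer, Integer> chronoToRev = new HashMap<>();
    String[] m = mapping.trim().split(" ");
    for (int i = 0; i + 1 < m.length; i += 2) {
        chronoToRev.put(Integer.parseInt(m[i + 1]), Integer.parseInt(m[i]));  // chrono -> revision
    }

    String[] pks = fullRevisionPKs.trim().split(" ");   // one PK per full-revision block
    String[] rc = revisionCounters.trim().split(" ");   // startRC endRC per block
    for (int b = 0; b < pks.length; b++) {
        int fullRevPK = Integer.parseInt(pks[b]);
        int startRC = Integer.parseInt(rc[2 * b]);
        int endRC = Integer.parseInt(rc[2 * b + 1]);
        // new ChronoFullRevision(fullRevPK, startRC, endRC) covers counters startRC..endRC
    }
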
+ Revision revision; + if (chronoStorage.isTop(revisionIndex)) { - int queryPK, limit, previousRevisionCounter; - String previousRevision; + // If this is the case the revision will removed from the storage + return chronoStorage.remove(); + } - // Determine the nearest revision that could be used to construct - // the specified revision - revision = cfr.getNearest(revCount); - if (revision == null) { + // Otherwise the chronological order counter will be mapped to the + // revsision counter + int revCount = revisionIndex; + if (mappingStorage.containsKey(revisionIndex)) { + revCount = mappingStorage.get(revisionIndex); + } - // Create query bounds (all revisions from the full revision till - // now) - queryPK = articlePK + cfr.getStartRC() - 1; - limit = revCount - cfr.getStartRC() + 1; + // Retrieve the related full revision block + ChronoFullRevision cfr = fullRevStorage.get(revCount); - previousRevision = null; - previousRevisionCounter = -1; + int queryPK, limit, previousRevisionCounter; + String previousRevision; - } else { + // Determine the nearest revision that could be used to construct + // the specified revision + revision = cfr.getNearest(revCount); + if (revision == null) { - // Create query bounds (only new revisions, last known + 1 till now) - queryPK = revision.getPrimaryKey() + 1; - limit = revCount - revision.getRevisionCounter(); + // Create query bounds (all revisions from the full revision till + // now) + queryPK = articlePK + cfr.getStartRC() - 1; + limit = revCount - cfr.getStartRC() + 1; - previousRevision = revision.getRevisionText(); - previousRevisionCounter = revision.getRevisionCounter(); + previousRevision = null; + previousRevisionCounter = -1; - } + } + else { - revision = null; + // Create query bounds (only new revisions, last known + 1 till now) + queryPK = revision.getPrimaryKey() + 1; + limit = revCount - revision.getRevisionCounter(); - try (Statement statement = this.connection.createStatement(); ResultSet result = statement - .executeQuery("SELECT Revision, PrimaryKey, RevisionCounter, RevisionID, ArticleID, Timestamp " - + "FROM revisions " - + "WHERE PrimaryKey >= " - + queryPK + " LIMIT " + limit)) { + previousRevision = revision.getRevisionText(); + previousRevisionCounter = revision.getRevisionCounter(); - // Retrieve encoded revisions + } - String currentRevision; + revision = null; - Diff diff; - RevisionDecoder decoder; + try (Statement statement = this.connection.createStatement(); + ResultSet result = statement.executeQuery( + "SELECT Revision, PrimaryKey, RevisionCounter, RevisionID, ArticleID, Timestamp " + + "FROM revisions " + "WHERE PrimaryKey >= " + queryPK + " LIMIT " + + limit)) { - boolean binaryData = result.getMetaData().getColumnType(1) == Types.LONGVARBINARY; + // Retrieve encoded revisions - while (result.next()) { + String currentRevision; - decoder = new RevisionDecoder(config.getCharacterSet()); + Diff diff; + RevisionDecoder decoder; - // binary or base64 encoded - if (binaryData) { - decoder.setInput(result.getBinaryStream(1), true); - } else { - decoder.setInput(result.getString(1)); - } + boolean binaryData = result.getMetaData().getColumnType(1) == Types.LONGVARBINARY; - // Decode and rebuild - diff = decoder.decode(); - if (previousRevisionCounter != -1) { + while (result.next()) { - if (previousRevisionCounter + 1 != result.getInt(3)) { + decoder = new RevisionDecoder(config.getCharacterSet()); - System.err.println("Reconstruction data invalid - " - + "\r\n\t" + "Expected " - + (previousRevisionCounter + 1) - + " 
instead of " + result.getInt(3)); + // binary or base64 encoded + if (binaryData) { + decoder.setInput(result.getBinaryStream(1), true); + } + else { + decoder.setInput(result.getString(1)); + } - return null; - } + // Decode and rebuild + diff = decoder.decode(); + if (previousRevisionCounter != -1) { - } else { + if (previousRevisionCounter + 1 != result.getInt(3)) { - if (cfr.getStartRC() != result.getInt(3)) { + System.err.println("Reconstruction data invalid - " + "\r\n\t" + "Expected " + + (previousRevisionCounter + 1) + " instead of " + + result.getInt(3)); - System.err.println("Reconstruction data invalid - " - + "\r\n\t" + "Expected " + (cfr.getStartRC()) - + " instead of " + result.getInt(3)); + return null; + } - return null; - } + } + else { - } + if (cfr.getStartRC() != result.getInt(3)) { - try { - currentRevision = diff.buildRevision(previousRevision); + System.err.println("Reconstruction data invalid - " + "\r\n\t" + "Expected " + + (cfr.getStartRC()) + " instead of " + result.getInt(3)); - revision = new Revision(result.getInt(3)); - revision.setRevisionText(currentRevision); - revision.setPrimaryKey(result.getInt(2)); - revision.setRevisionID(result.getInt(4)); - revision.setArticleID(result.getInt(5)); - revision.setTimeStamp(new Timestamp(result.getLong(6))); + return null; + } - previousRevision = currentRevision; - previousRevisionCounter = revision.getRevisionCounter(); + } - } catch (Exception e) { + try { + currentRevision = diff.buildRevision(previousRevision); - System.err.println("Reconstruction failed while retrieving" - + " data to reconstruct <" + revisionIndex + ">" - + "\r\n\t" + "[ArticleId " + result.getInt(5) - + ", RevisionId " + result.getInt(4) - + ", RevisionCounter " + result.getInt(3) + "]"); + revision = new Revision(result.getInt(3)); + revision.setRevisionText(currentRevision); + revision.setPrimaryKey(result.getInt(2)); + revision.setRevisionID(result.getInt(4)); + revision.setArticleID(result.getInt(5)); + revision.setTimeStamp(new Timestamp(result.getLong(6))); - return null; - } + previousRevision = currentRevision; + previousRevisionCounter = revision.getRevisionCounter(); - // Add the reconstructed revision to the storage - if (revision != null) { - chronoStorage.add(revision); - } - } + } + catch (Exception e) { - // Ensure that the correct revision is on top of the storage - if (chronoStorage.isTop(revisionIndex)) { + System.err.println("Reconstruction failed while retrieving" + + " data to reconstruct <" + revisionIndex + ">" + "\r\n\t" + + "[ArticleId " + result.getInt(5) + ", RevisionId " + result.getInt(4) + + ", RevisionCounter " + result.getInt(3) + "]"); - chronoStorage.remove(); - return revision; + return null; + } - } else { - return null; - } + // Add the reconstructed revision to the storage + if (revision != null) { + chronoStorage.add(revision); + } + } + + // Ensure that the correct revision is on top of the storage + if (chronoStorage.isTop(revisionIndex)) { + + chronoStorage.remove(); + return revision; + + } + else { + return null; + } + + } + } + /** + * Returns the storage size description. + * + * @return storage size description + */ + public String getStorageSize() + { + return chronoStorage.getStorageSize(); } - } - - /** - * Returns the storage size description. 
- * - * @return storage size description - */ - public String getStorageSize() { - return chronoStorage.getStorageSize(); - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorage.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorage.java index 09d3c929..58c26230 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorage.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorage.java @@ -26,314 +26,329 @@ /** * This class represents the chrono storage. */ -public class ChronoStorage { - - /** - * Index of the currently used revision - */ - private int revisionIndex; - - /** - * Reference to the first chrono storage block - */ - private ChronoStorageBlock first; - - /** - * Reference to the last chrono storage block - */ - private ChronoStorageBlock last; - - /** - * Map containing the chrono storage block and their index keys - */ - private final Map<Integer, ChronoStorageBlock> storage; - - /** - * Reverse mapping - */ - private final Map<Integer, Integer> mapping; - - /** - * Map containing reference to the chrono full revisions (Mapping of - * revision counter and their full revision blocks) - */ - private final Map<Integer, ChronoFullRevision> fullRevStorage; - - /** - * Reference to the first chrono full revision - */ - private final ChronoFullRevision firstCFR; - - /** - * Size of the chrono storage - */ - private long size; - - /** - * Configuration parameter - maximum size of this storage - */ - private final long MAX_STORAGE_SIZE; - - /** - * (Constructor) Creates a ChronoStorage object - * - * @param config Reference to the configuration - * @param mapping Mapping information (revision counter -> chronological - * revision counter) - * @param firstCFR Head of the double linked list of full revisions blocks - * @param fullRevStorage Mapping of revision counter and their full revision blocks - */ - public ChronoStorage(final RevisionAPIConfiguration config, - final Map<Integer, Integer> mapping, - final ChronoFullRevision firstCFR, - final Map<Integer, ChronoFullRevision> fullRevStorage) { - - this.revisionIndex = 0; - this.last = null; - this.first = null; - this.storage = new HashMap<>(); - - this.mapping = mapping; - this.fullRevStorage = fullRevStorage; - this.firstCFR = firstCFR; - - MAX_STORAGE_SIZE = config.getChronoStorageSpace(); - } - - /** - * Adds a revision to the chrono storage. 
- * - * @param rev reference to the revision - */ - public void add(final Revision rev) { - - int revIndex = rev.getRevisionCounter(); - if (this.mapping.containsKey(revIndex)) { - revIndex = this.mapping.get(revIndex); - } +public class ChronoStorage +{ - // System.out.println("Store " + rev.getRevisionCounter() + " with " + - // revIndex); + /** + * Index of the currently used revision + */ + private int revisionIndex; - ChronoFullRevision cfr = this.fullRevStorage.get(rev - .getRevisionCounter()); - ChronoStorageBlock block = new ChronoStorageBlock(cfr, revIndex, rev); - cfr.add(block); + /** + * Reference to the first chrono storage block + */ + private ChronoStorageBlock first; - if (revIndex < revisionIndex) { - // System.out.println("Revision has already been processed: " + - // revIndex); - block.setDelivered(true); - return; - } + /** + * Reference to the last chrono storage block + */ + private ChronoStorageBlock last; + + /** + * Map containing the chrono storage block and their index keys + */ + private final Map<Integer, ChronoStorageBlock> storage; + + /** + * Reverse mapping + */ + private final Map<Integer, Integer> mapping; + + /** + * Map containing reference to the chrono full revisions (Mapping of revision counter and their + * full revision blocks) + */ + private final Map<Integer, ChronoFullRevision> fullRevStorage; + + /** + * Reference to the first chrono full revision + */ + private final ChronoFullRevision firstCFR; - clean(); + /** + * Size of the chrono storage + */ + private long size; - if (this.storage.containsKey(revIndex)) { - // throw new IllegalArgumentException(revisionIndex + - // "- Object already contained: " + revIndex); - return; + /** + * Configuration parameter - maximum size of this storage + */ + private final long MAX_STORAGE_SIZE; + + /** + * (Constructor) Creates a ChronoStorage object + * + * @param config + * Reference to the configuration + * @param mapping + * Mapping information (revision counter -> chronological revision counter) + * @param firstCFR + * Head of the double linked list of full revisions blocks + * @param fullRevStorage + * Mapping of revision counter and their full revision blocks + */ + public ChronoStorage(final RevisionAPIConfiguration config, final Map<Integer, Integer> mapping, + final ChronoFullRevision firstCFR, + final Map<Integer, ChronoFullRevision> fullRevStorage) + { + + this.revisionIndex = 0; + this.last = null; + this.first = null; + this.storage = new HashMap<>(); + + this.mapping = mapping; + this.fullRevStorage = fullRevStorage; + this.firstCFR = firstCFR; + + MAX_STORAGE_SIZE = config.getChronoStorageSpace(); } - storage.put(revIndex, block); - size += block.length(); + /** + * Adds a revision to the chrono storage. 
+ * + * @param rev + * reference to the revision + */ + public void add(final Revision rev) + { - if (first == null) { - first = block; - last = block; - } else { + int revIndex = rev.getRevisionCounter(); + if (this.mapping.containsKey(revIndex)) { + revIndex = this.mapping.get(revIndex); + } - ChronoStorageBlock previous = null, current = first; - do { - if (revIndex < current.getRevisionIndex()) { + // System.out.println("Store " + rev.getRevisionCounter() + " with " + + // revIndex); - block.setIndexPrev(previous); - block.setIndexNext(current); + ChronoFullRevision cfr = this.fullRevStorage.get(rev.getRevisionCounter()); + ChronoStorageBlock block = new ChronoStorageBlock(cfr, revIndex, rev); + cfr.add(block); - if (previous != null) { - previous.setIndexNext(block); - } - current.setIndexPrev(block); + if (revIndex < revisionIndex) { + // System.out.println("Revision has already been processed: " + + // revIndex); + block.setDelivered(true); + return; + } - if (current == first) { - this.first = block; - } + clean(); - return; + if (this.storage.containsKey(revIndex)) { + // throw new IllegalArgumentException(revisionIndex + + // "- Object already contained: " + revIndex); + return; } - previous = current; - current = current.getIndexNext(); + storage.put(revIndex, block); + size += block.length(); + + if (first == null) { + first = block; + last = block; + } + else { + + ChronoStorageBlock previous = null, current = first; + do { + if (revIndex < current.getRevisionIndex()) { + + block.setIndexPrev(previous); + block.setIndexNext(current); + + if (previous != null) { + previous.setIndexNext(block); + } + current.setIndexPrev(block); + + if (current == first) { + this.first = block; + } + + return; + } + + previous = current; + current = current.getIndexNext(); - } - while (current != null); + } + while (current != null); - // Add to end of list - previous.setIndexNext(block); - block.setIndexPrev(previous); + // Add to end of list + previous.setIndexNext(block); + block.setIndexPrev(previous); - this.last = block; + this.last = block; + } } - } - - /** - * Returns whether more chrono storage blocks are available. - * - * @return TRUE | FALSE - */ - public boolean hasMore() { - return this.first != null; - } - - /** - * Removes a revision from the chrono storage. - * - * @return - */ - public Revision remove() { - - ChronoStorageBlock block = first; - this.revisionIndex = block.getRevisionIndex(); - - ChronoStorageBlock next = block.getIndexNext(); - this.first = next; - - if (next != null) { - this.first.setIndexPrev(null); - } else { - this.last = null; + + /** + * Returns whether more chrono storage blocks are available. + * + * @return TRUE | FALSE + */ + public boolean hasMore() + { + return this.first != null; } - /* - * System.out.println("Deliver " + block.getRevisionIndex() + " RI|RC " - * + block.getRevisionCounter()); if (first != null) { - * System.out.println("OnTop: " + first.getRevisionIndex()); } + /** + * Removes a revision from the chrono storage. 
+ * + * @return */ - block.setDelivered(true); + public Revision remove() + { - // Remove from fullRevSet - ChronoFullRevision cfr = block.getChronoFullRevision(); - cfr.remove(block.getRevisionCounter()); + ChronoStorageBlock block = first; + this.revisionIndex = block.getRevisionIndex(); - if (storage.remove(block.getRevisionIndex()) == null) { - throw new RuntimeException("VALUE WAS NOT REMOVED FROM STORAGE"); - } + ChronoStorageBlock next = block.getIndexNext(); + this.first = next; - // Subtract size - Revision rev = block.getRev(); - size -= rev.getRevisionText().length(); - return rev; - } - - /** - * Checks whether the specified chrono storage block is contained or not. - * - * @param revisionIndex chronological order index - * @return - */ - public boolean contains(final int revisionIndex) { - return this.storage.containsKey(revisionIndex); - } - - /** - * Checks whether the chrono storage block is on top or not. - * - * @param revisionIndex chronological order index - * @return - */ - public boolean isTop(final int revisionIndex) { - if (this.first != null) { - return this.first.getRevisionIndex() == revisionIndex; - } + if (next != null) { + this.first.setIndexPrev(null); + } + else { + this.last = null; + } - return false; - } + /* + * System.out.println("Deliver " + block.getRevisionIndex() + " RI|RC " + + * block.getRevisionCounter()); if (first != null) { System.out.println("OnTop: " + + * first.getRevisionIndex()); } + */ + block.setDelivered(true); - /** - * Returns the revision of the specified chrono storage block. - * - * @param revisionIndex chronological order index - * @return - */ - public Revision get(final int revisionIndex) { - if (this.storage.containsKey(revisionIndex)) { + // Remove from fullRevSet + ChronoFullRevision cfr = block.getChronoFullRevision(); + cfr.remove(block.getRevisionCounter()); - ChronoStorageBlock block = this.storage.get(revisionIndex); - return block.getRev(); + if (storage.remove(block.getRevisionIndex()) == null) { + throw new RuntimeException("VALUE WAS NOT REMOVED FROM STORAGE"); + } + + // Subtract size + Revision rev = block.getRev(); + size -= rev.getRevisionText().length(); + return rev; } - return null; - } - - /** - * Temporary variable - total size of the chrono storage - */ - private long totalSize; - - /** - * Reduces the amount of used storage by discarding chrono storage blocks. - */ - public void clean() { - - ChronoFullRevision cfr = firstCFR; - totalSize = size; - while (cfr != null) { - totalSize += cfr.size(); - cfr = cfr.getNext(); + + /** + * Checks whether the specified chrono storage block is contained or not. + * + * @param revisionIndex + * chronological order index + * @return + */ + public boolean contains(final int revisionIndex) + { + return this.storage.containsKey(revisionIndex); } - if (totalSize < MAX_STORAGE_SIZE) { - return; + /** + * Checks whether the chrono storage block is on top or not. + * + * @param revisionIndex + * chronological order index + * @return + */ + public boolean isTop(final int revisionIndex) + { + if (this.first != null) { + return this.first.getRevisionIndex() == revisionIndex; + } + + return false; } - cfr = firstCFR; - while (cfr != null) { - totalSize += cfr.clean(revisionIndex, 0); - cfr = cfr.getNext(); + /** + * Returns the revision of the specified chrono storage block. 
+ * + * @param revisionIndex + * chronological order index + * @return + */ + public Revision get(final int revisionIndex) + { + if (this.storage.containsKey(revisionIndex)) { + + ChronoStorageBlock block = this.storage.get(revisionIndex); + return block.getRev(); + } + return null; } - ChronoStorageBlock block; - while (last != null && totalSize >= MAX_STORAGE_SIZE) { + /** + * Temporary variable - total size of the chrono storage + */ + private long totalSize; - // System.out.println("CLEAN " + last.getRevisionIndex()); + /** + * Reduces the amount of used storage by discarding chrono storage blocks. + */ + public void clean() + { + + ChronoFullRevision cfr = firstCFR; + totalSize = size; + while (cfr != null) { + totalSize += cfr.size(); + cfr = cfr.getNext(); + } - // Retrieve previous block - block = last.getIndexPrev(); + if (totalSize < MAX_STORAGE_SIZE) { + return; + } - // Subtract size - if (storage.remove(last.getRevisionIndex()) == null) { - throw new RuntimeException("VALUE WAS NOT REMOVED FROM STORAGE"); - } - totalSize -= last.length(); - size += last.length(); + cfr = firstCFR; + while (cfr != null) { + totalSize += cfr.clean(revisionIndex, 0); + cfr = cfr.getNext(); + } - // Delete references - if (block != null) { - block.setIndexNext(null); - } - last.setIndexPrev(null); + ChronoStorageBlock block; + while (last != null && totalSize >= MAX_STORAGE_SIZE) { - cfr = last.getChronoFullRevision(); - totalSize += cfr.size() - - cfr.clean(revisionIndex, last.getRevisionIndex()); + // System.out.println("CLEAN " + last.getRevisionIndex()); - if (last == first) { - first = null; - } + // Retrieve previous block + block = last.getIndexPrev(); - // Set the new last - last = block; - } + // Subtract size + if (storage.remove(last.getRevisionIndex()) == null) { + throw new RuntimeException("VALUE WAS NOT REMOVED FROM STORAGE"); + } + totalSize -= last.length(); + size += last.length(); + // Delete references + if (block != null) { + block.setIndexNext(null); + } + last.setIndexPrev(null); - } + cfr = last.getChronoFullRevision(); + totalSize += cfr.size() - cfr.clean(revisionIndex, last.getRevisionIndex()); - /** - * Returns a description of the chrono storage size. - * - * @return current revision index | storage size | size | total size - */ - public String getStorageSize() { - return this.revisionIndex + " | " + this.storage.size() + " | " - + this.size + " | " + totalSize; - } + if (last == first) { + first = null; + } + + // Set the new last + last = block; + } + + } + + /** + * Returns a description of the chrono storage size. 
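For orientation, a minimal sketch of how the storage above is meant to be driven: revisions that arrive out of chronological order are buffered with add(), and remove() hands them out once isTop() reports the expected index on top. The class and method names come from the hunk above; the drain() helper, the reconstructed iterable, and the import path of Revision are assumptions for illustration only.

    import org.dkpro.jwpl.revisionmachine.api.Revision;
    import org.dkpro.jwpl.revisionmachine.api.chrono.ChronoStorage;

    final class ChronoStorageDrainSketch {

        // Buffers out-of-order revisions and delivers them in chronological order.
        static void drain(ChronoStorage storage, Iterable<Revision> reconstructed, int firstIndex) {
            int expected = firstIndex;
            for (Revision rev : reconstructed) {
                storage.add(rev);                 // may arrive out of chronological order
                while (storage.isTop(expected)) { // expected index reached the top of the storage
                    Revision next = storage.remove();
                    System.out.println("delivered revision counter " + next.getRevisionCounter());
                    expected++;
                }
            }
        }
    }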
+ * + * @return current revision index | storage size | size | total size + */ + public String getStorageSize() + { + return this.revisionIndex + " | " + this.storage.size() + " | " + this.size + " | " + + totalSize; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorageBlock.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorageBlock.java index 1df39b63..984a029a 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorageBlock.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/api/chrono/ChronoStorageBlock.java @@ -24,193 +24,216 @@ * <p> * A node contains multiple links: - Reference to the chrono full revision * <p> - * - links to the previous and next index block an index reference describes the - * chronological order + * - links to the previous and next index block an index reference describes the chronological order * <p> - * - links to the previous and next counter block an counter reference describes - * the normal order + * - links to the previous and next counter block an counter reference describes the normal order */ -public class ChronoStorageBlock { - - /** - * Reference to the chrono full revision - */ - private final ChronoFullRevision cfr; - - /** - * Index of the revision - */ - private final int revisionIndex; - - /** - * Revision - */ - private final Revision rev; - - /** - * Flag, indicating whether the revision was already returned or not - */ - private boolean delivered; - - /** - * Reference to the previous index block - */ - private ChronoStorageBlock indexPrev; - - /** - * Reference to the next index block - */ - private ChronoStorageBlock indexNext; - - /** - * Reference to the previous counter block - */ - private ChronoStorageBlock counterPrev; - - /** - * Reference to the next counter block - */ - private ChronoStorageBlock counterNext; - - /** - * Returns the related chrono full revision. - * - * @return chrono full revision - */ - public ChronoFullRevision getChronoFullRevision() { - return this.cfr; - } - - /** - * Returns the next counter block. - * - * @return next counter block - */ - public ChronoStorageBlock getCounterNext() { - return counterNext; - } - - /** - * Sets the next counter block. - * - * @param counterNext next counter block - */ - public void setCounterNext(final ChronoStorageBlock counterNext) { - this.counterNext = counterNext; - } - - /** - * Returns the previous counter block. - * - * @return previous counter block - */ - public ChronoStorageBlock getCounterPrev() { - return counterPrev; - } - - /** - * Sets the previous counter block. - * - * @param counterPrev previous counter block - */ - public void setCounterPrev(final ChronoStorageBlock counterPrev) { - this.counterPrev = counterPrev; - } - - /** - * Returns the next index block. - * - * @return next index block - */ - public ChronoStorageBlock getIndexNext() { - return indexNext; - } - - /** - * Sets the next index block. - * - * @param indexNext next index block - */ - public void setIndexNext(final ChronoStorageBlock indexNext) { - this.indexNext = indexNext; - } - - /** - * Returns the previous index block. - * - * @return previous index block - */ - public ChronoStorageBlock getIndexPrev() { - return indexPrev; - } - - /** - * Sets the previous index block. 
- * - * @param indexPrev previous counter block - */ - public void setIndexPrev(final ChronoStorageBlock indexPrev) { - this.indexPrev = indexPrev; - } - - /** - * (Constructor) Creates a new ChronoStorageBlock. - * - * @param cfr Reference to the chrono full revision - * @param revisionIndex Index of this revision - * @param rev Reference to the revision - */ - public ChronoStorageBlock(final ChronoFullRevision cfr, - final int revisionIndex, final Revision rev) { - - this.cfr = cfr; - - this.revisionIndex = revisionIndex; - this.rev = rev; - this.delivered = false; - } - - public Revision getRev() { - return rev; - } - - /** - * Returns whether this revision was already returned or not. - * - * @return flag - */ - public boolean isDelivered() { - return delivered; - } - - /** - * Sets whether this revision was already returned or not. - * - * @param delivered flag - */ - public void setDelivered(final boolean delivered) { - this.delivered = delivered; - } - - /** - * Returns the revision index. - * - * @return revision index - */ - public int getRevisionIndex() { - return revisionIndex; - } - - /** - * Returns the revision counter. - * - * @return revision counter - */ - public int getRevisionCounter() { - return this.rev.getRevisionCounter(); - } - - public int length() { - return this.rev.getRevisionText().length(); - } +public class ChronoStorageBlock +{ + + /** + * Reference to the chrono full revision + */ + private final ChronoFullRevision cfr; + + /** + * Index of the revision + */ + private final int revisionIndex; + + /** + * Revision + */ + private final Revision rev; + + /** + * Flag, indicating whether the revision was already returned or not + */ + private boolean delivered; + + /** + * Reference to the previous index block + */ + private ChronoStorageBlock indexPrev; + + /** + * Reference to the next index block + */ + private ChronoStorageBlock indexNext; + + /** + * Reference to the previous counter block + */ + private ChronoStorageBlock counterPrev; + + /** + * Reference to the next counter block + */ + private ChronoStorageBlock counterNext; + + /** + * Returns the related chrono full revision. + * + * @return chrono full revision + */ + public ChronoFullRevision getChronoFullRevision() + { + return this.cfr; + } + + /** + * Returns the next counter block. + * + * @return next counter block + */ + public ChronoStorageBlock getCounterNext() + { + return counterNext; + } + + /** + * Sets the next counter block. + * + * @param counterNext + * next counter block + */ + public void setCounterNext(final ChronoStorageBlock counterNext) + { + this.counterNext = counterNext; + } + + /** + * Returns the previous counter block. + * + * @return previous counter block + */ + public ChronoStorageBlock getCounterPrev() + { + return counterPrev; + } + + /** + * Sets the previous counter block. + * + * @param counterPrev + * previous counter block + */ + public void setCounterPrev(final ChronoStorageBlock counterPrev) + { + this.counterPrev = counterPrev; + } + + /** + * Returns the next index block. + * + * @return next index block + */ + public ChronoStorageBlock getIndexNext() + { + return indexNext; + } + + /** + * Sets the next index block. + * + * @param indexNext + * next index block + */ + public void setIndexNext(final ChronoStorageBlock indexNext) + { + this.indexNext = indexNext; + } + + /** + * Returns the previous index block. 
+ * + * @return previous index block + */ + public ChronoStorageBlock getIndexPrev() + { + return indexPrev; + } + + /** + * Sets the previous index block. + * + * @param indexPrev + * previous counter block + */ + public void setIndexPrev(final ChronoStorageBlock indexPrev) + { + this.indexPrev = indexPrev; + } + + /** + * (Constructor) Creates a new ChronoStorageBlock. + * + * @param cfr + * Reference to the chrono full revision + * @param revisionIndex + * Index of this revision + * @param rev + * Reference to the revision + */ + public ChronoStorageBlock(final ChronoFullRevision cfr, final int revisionIndex, + final Revision rev) + { + + this.cfr = cfr; + + this.revisionIndex = revisionIndex; + this.rev = rev; + this.delivered = false; + } + + public Revision getRev() + { + return rev; + } + + /** + * Returns whether this revision was already returned or not. + * + * @return flag + */ + public boolean isDelivered() + { + return delivered; + } + + /** + * Sets whether this revision was already returned or not. + * + * @param delivered + * flag + */ + public void setDelivered(final boolean delivered) + { + this.delivered = delivered; + } + + /** + * Returns the revision index. + * + * @return revision index + */ + public int getRevisionIndex() + { + return revisionIndex; + } + + /** + * Returns the revision counter. + * + * @return revision counter + */ + public int getRevisionCounter() + { + return this.rev.getRevisionCounter(); + } + + public int length() + { + return this.rev.getRevisionText().length(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/archivers/Bzip2Archiver.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/archivers/Bzip2Archiver.java index f03d757c..dc79870b 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/archivers/Bzip2Archiver.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/archivers/Bzip2Archiver.java @@ -32,145 +32,155 @@ /** * Class provides basic bzip2 compression/decompression functionality */ -public class Bzip2Archiver { +public class Bzip2Archiver +{ - // Size to write in memory while compressing (in bytes) - private static final int COMPRESSION_CACHE = 10000000; + // Size to write in memory while compressing (in bytes) + private static final int COMPRESSION_CACHE = 10000000; - // Size to write in memory while decompressing (in bytes) - private static final int DECOMPRESSION_CACHE = 10000000; + // Size to write in memory while decompressing (in bytes) + private static final int DECOMPRESSION_CACHE = 10000000; - /** - * Creates bz2 archive file from file in path - * - * @param path to file to compress - */ - public void compress(String path) { - try { + /** + * Creates bz2 archive file from file in path + * + * @param path + * to file to compress + */ + public void compress(String path) + { + try { - File fileToArchive = new File(path); + File fileToArchive = new File(path); - BufferedInputStream input = new BufferedInputStream(new FileInputStream(fileToArchive)); + BufferedInputStream input = new BufferedInputStream(new FileInputStream(fileToArchive)); - File archivedFile = new File(fileToArchive.getName() + ".bz2"); - archivedFile.createNewFile(); + File archivedFile = new File(fileToArchive.getName() + ".bz2"); + archivedFile.createNewFile(); - FileOutputStream fos = new FileOutputStream(archivedFile); - BufferedOutputStream bufStr = new BufferedOutputStream(fos); - // added bzip2 prefix - 
fos.write("BZ".getBytes()); - BZip2CompressorOutputStream bzip2 = new BZip2CompressorOutputStream(bufStr); + FileOutputStream fos = new FileOutputStream(archivedFile); + BufferedOutputStream bufStr = new BufferedOutputStream(fos); + // added bzip2 prefix + fos.write("BZ".getBytes()); + BZip2CompressorOutputStream bzip2 = new BZip2CompressorOutputStream(bufStr); - while (input.available() > 0) { - int size = COMPRESSION_CACHE; + while (input.available() > 0) { + int size = COMPRESSION_CACHE; - if (input.available() < COMPRESSION_CACHE) { - size = input.available(); - } - byte[] bytes = new byte[size]; - - input.read(bytes); - - bzip2.write(bytes); - } - bzip2.close(); - bufStr.close(); - fos.close(); - input.close(); + if (input.available() < COMPRESSION_CACHE) { + size = input.available(); + } + byte[] bytes = new byte[size]; - } catch (IOException e) { - e.printStackTrace(); - } + input.read(bytes); - } + bzip2.write(bytes); + } + bzip2.close(); + bufStr.close(); + fos.close(); + input.close(); - /** - * Creates stream for compression - * - * @param path path to file to compress - * @return compression stream - * @throws IOException - */ - public OutputStream getCompressionStream(String path) throws IOException { - File archivedFile = new File(path); + } + catch (IOException e) { + e.printStackTrace(); + } - archivedFile.createNewFile(); - FileOutputStream fos = new FileOutputStream(archivedFile); + } - BufferedOutputStream bufStr = new BufferedOutputStream(fos); - // added bzip2 prefix - fos.write("BZ".getBytes()); + /** + * Creates stream for compression + * + * @param path + * path to file to compress + * @return compression stream + * @throws IOException + */ + public OutputStream getCompressionStream(String path) throws IOException + { + File archivedFile = new File(path); + + archivedFile.createNewFile(); + FileOutputStream fos = new FileOutputStream(archivedFile); + + BufferedOutputStream bufStr = new BufferedOutputStream(fos); + // added bzip2 prefix + fos.write("BZ".getBytes()); + + return new BZip2CompressorOutputStream(bufStr); + } - return new BZip2CompressorOutputStream(bufStr); - } + /** + * Creates Stream for decompression + * + * @param path + * path to file to decompress + * @param encoding + * encoding to use + * @return decompression stream + * @throws IOException + */ + public InputStreamReader getDecompressionStream(String path, String encoding) throws IOException + { + File fileToUncompress = new File(path); - /** - * Creates Stream for decompression - * - * @param path path to file to decompress - * @param encoding encoding to use - * @return decompression stream - * @throws IOException - */ - public InputStreamReader getDecompressionStream(String path, String encoding) - throws IOException { - File fileToUncompress = new File(path); + BufferedInputStream fileStream = new BufferedInputStream( + new FileInputStream(fileToUncompress)); - BufferedInputStream fileStream = new BufferedInputStream(new FileInputStream(fileToUncompress)); + // read bzip2 prefix: BZ + fileStream.read(); + fileStream.read(); - // read bzip2 prefix: BZ - fileStream.read(); - fileStream.read(); + BufferedInputStream bufferedStream = new BufferedInputStream(fileStream); + BZip2CompressorInputStream input = new BZip2CompressorInputStream(bufferedStream); - BufferedInputStream bufferedStream = new BufferedInputStream(fileStream); - BZip2CompressorInputStream input = new BZip2CompressorInputStream(bufferedStream); + return new InputStreamReader(input, encoding); - return new 
InputStreamReader(input, encoding); + } - } + /** + * Uncompress bz2 file + * + * @param path + * path to file to uncompress + * @throws IOException + */ + public void decompress(String path) throws IOException + { + File bzip2 = new File(path); - /** - * Uncompress bz2 file - * - * @param path path to file to uncompress - * @throws IOException - */ - public void decompress(String path) - throws IOException { - File bzip2 = new File(path); + // + File unarchived = new File(bzip2.getName().replace(".bz2", "")); - // - File unarchived = new File(bzip2.getName().replace(".bz2", "")); + unarchived.createNewFile(); - unarchived.createNewFile(); + BufferedInputStream inputStr = new BufferedInputStream(new FileInputStream(bzip2)); - BufferedInputStream inputStr = new BufferedInputStream(new FileInputStream(bzip2)); + // read bzip2 prefix + inputStr.read(); + inputStr.read(); - // read bzip2 prefix - inputStr.read(); - inputStr.read(); + BufferedInputStream buffStr = new BufferedInputStream(inputStr); - BufferedInputStream buffStr = new BufferedInputStream(inputStr); + BZip2CompressorInputStream input = new BZip2CompressorInputStream(buffStr); - BZip2CompressorInputStream input = new BZip2CompressorInputStream(buffStr); + FileOutputStream outStr = new FileOutputStream(unarchived); - FileOutputStream outStr = new FileOutputStream(unarchived); + while (true) { + byte[] compressedBytes = new byte[DECOMPRESSION_CACHE]; - while (true) { - byte[] compressedBytes = new byte[DECOMPRESSION_CACHE]; + int byteRead = input.read(compressedBytes); - int byteRead = input.read(compressedBytes); + outStr.write(compressedBytes, 0, byteRead); + if (byteRead != DECOMPRESSION_CACHE) { + break; + } + } - outStr.write(compressedBytes, 0, byteRead); - if (byteRead != DECOMPRESSION_CACHE) { - break; - } + input.close(); + buffStr.close(); + inputStr.close(); + outStr.close(); } - input.close(); - buffStr.close(); - inputStr.close(); - outStr.close(); - } - } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ArticleReaderException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ArticleReaderException.java index 153fb23d..3b0da848 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ArticleReaderException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ArticleReaderException.java @@ -18,37 +18,45 @@ package org.dkpro.jwpl.revisionmachine.common.exceptions; /** - * ArticleReaderException Describes an exception that occurred while reading the - * articles. + * ArticleReaderException Describes an exception that occurred while reading the articles. */ @SuppressWarnings("serial") -public class ArticleReaderException extends Exception { +public class ArticleReaderException + extends Exception +{ - /** - * Creates a new ArticleReaderException. - * - * @param description message - */ - public ArticleReaderException(final String description) { - super(description); - } + /** + * Creates a new ArticleReaderException. + * + * @param description + * message + */ + public ArticleReaderException(final String description) + { + super(description); + } - /** - * Creates a new ArticleReaderException. - * - * @param e inner exception - */ - public ArticleReaderException(final Exception e) { - super(e); - } + /** + * Creates a new ArticleReaderException. 
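A short usage sketch for the Bzip2Archiver shown above, assuming the package name from the diff header; the file name dump.sql and the UTF-8 encoding are placeholders. It shows the two typical paths: compressing a file to disk and reading a .bz2 archive as a character stream without unpacking it.

    import java.io.BufferedReader;
    import java.io.IOException;

    import org.dkpro.jwpl.revisionmachine.archivers.Bzip2Archiver;

    final class Bzip2ArchiverSketch {

        public static void main(String[] args) throws IOException {
            Bzip2Archiver archiver = new Bzip2Archiver();

            // Writes dump.sql.bz2 into the current working directory.
            archiver.compress("dump.sql");

            // Stream the archive back line by line instead of unpacking it to disk.
            try (BufferedReader reader = new BufferedReader(
                    archiver.getDecompressionStream("dump.sql.bz2", "UTF-8"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    // process the decompressed line
                }
            }
        }
    }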
+ * + * @param e + * inner exception + */ + public ArticleReaderException(final Exception e) + { + super(e); + } - /** - * Creates a new ArticleReaderException. - * - * @param description message - * @param e inner exception - */ - public ArticleReaderException(final String description, final Exception e) { - super(description, e); - } + /** + * Creates a new ArticleReaderException. + * + * @param description + * message + * @param e + * inner exception + */ + public ArticleReaderException(final String description, final Exception e) + { + super(description, e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ConfigurationException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ConfigurationException.java index a9626bbe..8958a73f 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ConfigurationException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ConfigurationException.java @@ -18,62 +18,74 @@ package org.dkpro.jwpl.revisionmachine.common.exceptions; /** - * ConfigurationException Describes an exception that occurred while accessing - * the configuration. + * ConfigurationException Describes an exception that occurred while accessing the configuration. */ @SuppressWarnings("serial") -public class ConfigurationException extends Exception { +public class ConfigurationException + extends Exception +{ - /** - * Reference to the error key - */ - private ErrorKeys key; + /** + * Reference to the error key + */ + private ErrorKeys key; - /** - * Creates a new ConfigurationException. - * - * @param description message - */ - public ConfigurationException(final String description) { - super(description); - } + /** + * Creates a new ConfigurationException. + * + * @param description + * message + */ + public ConfigurationException(final String description) + { + super(description); + } - /** - * Creates a new ConfigurationException. - * - * @param e inner exception - */ - public ConfigurationException(final Exception e) { - super(e); - } + /** + * Creates a new ConfigurationException. + * + * @param e + * inner exception + */ + public ConfigurationException(final Exception e) + { + super(e); + } - /** - * Creates a new ConfigurationException. - * - * @param description message - * @param e inner exception - */ - public ConfigurationException(final String description, final Exception e) { - super(description, e); - } + /** + * Creates a new ConfigurationException. + * + * @param description + * message + * @param e + * inner exception + */ + public ConfigurationException(final String description, final Exception e) + { + super(description, e); + } - /** - * Creates a new ConfigurationException. - * - * @param key error key - * @param description message - */ - public ConfigurationException(final ErrorKeys key, final String description) { - super(description); - this.key = key; - } + /** + * Creates a new ConfigurationException. + * + * @param key + * error key + * @param description + * message + */ + public ConfigurationException(final ErrorKeys key, final String description) + { + super(description); + this.key = key; + } - /** - * Returns the error key. - * - * @return error key - */ - public ErrorKeys getKey() { - return this.key; - } + /** + * Returns the error key. 
+ * + * @return error key + */ + public ErrorKeys getKey() + { + return this.key; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DecodingException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DecodingException.java index c756daff..3f911de9 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DecodingException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DecodingException.java @@ -18,37 +18,45 @@ package org.dkpro.jwpl.revisionmachine.common.exceptions; /** - * DecodingException Describes an exception that occurred while decoding the - * diff information. + * DecodingException Describes an exception that occurred while decoding the diff information. */ @SuppressWarnings("serial") -public class DecodingException extends Exception { +public class DecodingException + extends Exception +{ - /** - * Creates a new DecodingException. - * - * @param description message - */ - public DecodingException(final String description) { - super(description); - } + /** + * Creates a new DecodingException. + * + * @param description + * message + */ + public DecodingException(final String description) + { + super(description); + } - /** - * Creates a new DecodingException. - * - * @param e inner exception - */ - public DecodingException(final Exception e) { - super(e); - } + /** + * Creates a new DecodingException. + * + * @param e + * inner exception + */ + public DecodingException(final Exception e) + { + super(e); + } - /** - * Creates a new DecodingException. - * - * @param description message - * @param e inner exception - */ - public DecodingException(final String description, final Exception e) { - super(description, e); - } + /** + * Creates a new DecodingException. + * + * @param description + * message + * @param e + * inner exception + */ + public DecodingException(final String description, final Exception e) + { + super(description, e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DiffException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DiffException.java index 145c44db..641e9dda 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DiffException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/DiffException.java @@ -18,37 +18,45 @@ package org.dkpro.jwpl.revisionmachine.common.exceptions; /** - * DiffException Describes an exception that occurred while calculating the - * diff. + * DiffException Describes an exception that occurred while calculating the diff. */ @SuppressWarnings("serial") -public class DiffException extends Exception { +public class DiffException + extends Exception +{ - /** - * Creates a new DiffException. - * - * @param description message - */ - public DiffException(final String description) { - super(description); - } + /** + * Creates a new DiffException. + * + * @param description + * message + */ + public DiffException(final String description) + { + super(description); + } - /** - * Creates a new DiffException. - * - * @param e inner exception - */ - public DiffException(final Exception e) { - super(e); - } + /** + * Creates a new DiffException. 
+ * + * @param e + * inner exception + */ + public DiffException(final Exception e) + { + super(e); + } - /** - * Creates a new DiffException. - * - * @param description message - * @param e inner exception - */ - public DiffException(final String description, final Exception e) { - super(description, e); - } + /** + * Creates a new DiffException. + * + * @param description + * message + * @param e + * inner exception + */ + public DiffException(final String description, final Exception e) + { + super(description, e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/EncodingException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/EncodingException.java index 4fdab73b..47322178 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/EncodingException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/EncodingException.java @@ -18,37 +18,45 @@ package org.dkpro.jwpl.revisionmachine.common.exceptions; /** - * DecodingException Describes an exception that occurred while encoding the - * diff information. + * DecodingException Describes an exception that occurred while encoding the diff information. */ @SuppressWarnings("serial") -public class EncodingException extends Exception { +public class EncodingException + extends Exception +{ - /** - * Creates a new EncodingException. - * - * @param description message - */ - public EncodingException(final String description) { - super(description); - } + /** + * Creates a new EncodingException. + * + * @param description + * message + */ + public EncodingException(final String description) + { + super(description); + } - /** - * Creates a new EncodingException. - * - * @param e inner exception - */ - public EncodingException(final Exception e) { - super(e); - } + /** + * Creates a new EncodingException. + * + * @param e + * inner exception + */ + public EncodingException(final Exception e) + { + super(e); + } - /** - * Creates a new EncodingException. - * - * @param description message - * @param e inner exception - */ - public EncodingException(final String description, final Exception e) { - super(description, e); - } + /** + * Creates a new EncodingException. + * + * @param description + * message + * @param e + * inner exception + */ + public EncodingException(final String description, final Exception e) + { + super(description, e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorFactory.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorFactory.java index 064213af..b059d4ea 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorFactory.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorFactory.java @@ -20,251 +20,313 @@ /** * This utility class contains method two create exceptions. */ -public final class ErrorFactory { - - /** - * No object - Utility class - */ - private ErrorFactory() { - } - - /* - * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a RuntimeException object. 
- * - * @param errorId reference to the error identifier - * @return RuntimeException - */ - public static RuntimeException createRuntimeException(final ErrorKeys errorId) { - return new RuntimeException(errorId.toString()); - } - - /* - * +ARTICLE+READER+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a ArticleReaderException object. - * - * @param errorId reference to the error identifier - * @return ArticleReaderException - */ - public static ArticleReaderException createArticleReaderException(final ErrorKeys errorId) { - return new ArticleReaderException(errorId.toString()); - } - - /* - * +CONFIGURATION+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a ConfigurationException object. - * - * @param errorId reference to the error identifier - * @return ConfigurationException - */ - public static ConfigurationException createConfigurationException(final ErrorKeys errorId) { - return new ConfigurationException(errorId.toString()); - } - - /** - * Creates a ConfigurationException object. - * - * @param errorId reference to the error identifier - * @param message additional error message - * @return ConfigurationException - */ - public static ConfigurationException createConfigurationException(final ErrorKeys errorId, final String message) { - return new ConfigurationException(errorId.toString() + ":\r\n" + message); - } - - /* - * +TIMEOUT+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a TimeoutException object. - * - * @param errorId reference to the error identifier - * @param sleepPeriod time value - * @return TimeoutException - */ - public static TimeoutException createTimeoutException(final ErrorKeys errorId, final long sleepPeriod) { - - return new TimeoutException(errorId.toString() + "\r\n" + "Timeout after " + sleepPeriod + " miliseconds."); - } - - /* - * +LOGGING+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a LoggingException object. - * - * @param errorId reference to the error identifier - * @return LoggingException - */ - public static LoggingException createLoggingException(final ErrorKeys errorId) { - return new LoggingException(errorId.toString()); - } - - /** - * Creates a LoggingException object. - * - * @param errorId reference to the error identifier - * @param e inner exception - * @return LoggingException - */ - public static LoggingException createLoggingException(final ErrorKeys errorId, final Exception e) { - return new LoggingException(errorId.toString(), e); - } - - /* - * +DIFF+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a DiffException object. - * - * @param errorId reference to the error identifier - * @param message additional message - * @return DiffException - */ - public static DiffException createDiffException(final ErrorKeys errorId, final String message) { - return new DiffException(errorId.toString() + ":\r\n" + message); - } - - /** - * Creates a DiffException object. 
- * - * @param errorId reference to the error identifier - * @param message additional message - * @param e inner exception - * @return DiffException - */ - public static DiffException createDiffException(final ErrorKeys errorId, final String message, final Exception e) { - return new DiffException(errorId.toString() + ":\r\n" + message, e); - } - - /* - * +ENCODING+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates an EncodingException object. - * - * @param errorId reference to the error identifier - * @return EncodingException - */ - public static EncodingException createEncodingException(final ErrorKeys errorId) { - return new EncodingException(errorId.toString()); - } - - /** - * Creates an EncodingException object. - * - * @param errorId reference to the error identifier - * @param message additional message - * @return EncodingException - */ - public static EncodingException createEncodingException(final ErrorKeys errorId, final String message) { - return new EncodingException(errorId.toString() + ":\r\n" + message); - } - - /** - * Creates an EncodingException object. - * - * @param errorId reference to the error identifier - * @param message additional message - * @param e inner exception - * @return EncodingException - */ - public static EncodingException createEncodingException( - final ErrorKeys errorId, final String message, final Exception e) { - - return new EncodingException(errorId.toString() + ":\r\n" + message, e); - } - - /* - * +DECODING+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a DecodingException object. - * - * @param errorId reference to the error identifier - * @return DecodingException - */ - public static DecodingException createDecodingException(final ErrorKeys errorId) { - - return new DecodingException(errorId.toString()); - } - - /** - * Creates a DecodingException object. - * - * @param errorId reference to the error identifier - * @param message additional message - * @return DecodingException - */ - public static DecodingException createDecodingException(final ErrorKeys errorId, final String message) { - - return new DecodingException(errorId.toString() + ":\r\n" + message); - } - - /** - * Creates a DecodingException object. - * - * @param errorId reference to the error identifier - * @param message additional message - * @param e inner exception - * @return DecodingException - */ - public static DecodingException createDecodingException(final ErrorKeys errorId, final String message, - final Exception e) { - return new DecodingException(errorId.toString() + ":\r\n" + message, e); - } - - /* - * +UNCOMPRESSED+CONSUMER+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++++ - * + - */ - - /** - * Creates a SQLConsumerException object. - * - * @param errorId reference to the error identifier - * @param e inner exception - * @return SQLConsumerException - */ - public static SQLConsumerException createSQLConsumerException(final ErrorKeys errorId, final Exception e) { - return new SQLConsumerException(errorId.toString(), e); - } - - /** - * Creates a SQLConsumerException object. - * - * @param errorId reference to the error identifier - * @param message additional message - * @return SQLConsumerException - */ - public static SQLConsumerException createSQLConsumerException(final ErrorKeys errorId, final String message) { - return new SQLConsumerException(errorId.toString() + ":\r\n" + message); - } - - /** - * Creates a SQLConsumerException object. 
- * - * @param errorId reference to the error identifier - * @param message additional message - * @param e inner exception - * @return SQLConsumerException - */ - public static SQLConsumerException createSQLConsumerException(final ErrorKeys errorId, final String message, - final Exception e) { - return new SQLConsumerException(errorId.toString() + ":\r\n" + message, e); - } +public final class ErrorFactory +{ + + /** + * No object - Utility class + */ + private ErrorFactory() + { + } + + /* + * ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a RuntimeException object. + * + * @param errorId + * reference to the error identifier + * @return RuntimeException + */ + public static RuntimeException createRuntimeException(final ErrorKeys errorId) + { + return new RuntimeException(errorId.toString()); + } + + /* + * +ARTICLE+READER+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a ArticleReaderException object. + * + * @param errorId + * reference to the error identifier + * @return ArticleReaderException + */ + public static ArticleReaderException createArticleReaderException(final ErrorKeys errorId) + { + return new ArticleReaderException(errorId.toString()); + } + + /* + * +CONFIGURATION+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a ConfigurationException object. + * + * @param errorId + * reference to the error identifier + * @return ConfigurationException + */ + public static ConfigurationException createConfigurationException(final ErrorKeys errorId) + { + return new ConfigurationException(errorId.toString()); + } + + /** + * Creates a ConfigurationException object. + * + * @param errorId + * reference to the error identifier + * @param message + * additional error message + * @return ConfigurationException + */ + public static ConfigurationException createConfigurationException(final ErrorKeys errorId, + final String message) + { + return new ConfigurationException(errorId.toString() + ":\r\n" + message); + } + + /* + * +TIMEOUT+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a TimeoutException object. + * + * @param errorId + * reference to the error identifier + * @param sleepPeriod + * time value + * @return TimeoutException + */ + public static TimeoutException createTimeoutException(final ErrorKeys errorId, + final long sleepPeriod) + { + + return new TimeoutException( + errorId.toString() + "\r\n" + "Timeout after " + sleepPeriod + " miliseconds."); + } + + /* + * +LOGGING+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a LoggingException object. + * + * @param errorId + * reference to the error identifier + * @return LoggingException + */ + public static LoggingException createLoggingException(final ErrorKeys errorId) + { + return new LoggingException(errorId.toString()); + } + + /** + * Creates a LoggingException object. + * + * @param errorId + * reference to the error identifier + * @param e + * inner exception + * @return LoggingException + */ + public static LoggingException createLoggingException(final ErrorKeys errorId, + final Exception e) + { + return new LoggingException(errorId.toString(), e); + } + + /* + * +DIFF+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a DiffException object. 
+ * + * @param errorId + * reference to the error identifier + * @param message + * additional message + * @return DiffException + */ + public static DiffException createDiffException(final ErrorKeys errorId, final String message) + { + return new DiffException(errorId.toString() + ":\r\n" + message); + } + + /** + * Creates a DiffException object. + * + * @param errorId + * reference to the error identifier + * @param message + * additional message + * @param e + * inner exception + * @return DiffException + */ + public static DiffException createDiffException(final ErrorKeys errorId, final String message, + final Exception e) + { + return new DiffException(errorId.toString() + ":\r\n" + message, e); + } + + /* + * +ENCODING+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates an EncodingException object. + * + * @param errorId + * reference to the error identifier + * @return EncodingException + */ + public static EncodingException createEncodingException(final ErrorKeys errorId) + { + return new EncodingException(errorId.toString()); + } + + /** + * Creates an EncodingException object. + * + * @param errorId + * reference to the error identifier + * @param message + * additional message + * @return EncodingException + */ + public static EncodingException createEncodingException(final ErrorKeys errorId, + final String message) + { + return new EncodingException(errorId.toString() + ":\r\n" + message); + } + + /** + * Creates an EncodingException object. + * + * @param errorId + * reference to the error identifier + * @param message + * additional message + * @param e + * inner exception + * @return EncodingException + */ + public static EncodingException createEncodingException(final ErrorKeys errorId, + final String message, final Exception e) + { + + return new EncodingException(errorId.toString() + ":\r\n" + message, e); + } + + /* + * +DECODING+EXCEPTION+++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a DecodingException object. + * + * @param errorId + * reference to the error identifier + * @return DecodingException + */ + public static DecodingException createDecodingException(final ErrorKeys errorId) + { + + return new DecodingException(errorId.toString()); + } + + /** + * Creates a DecodingException object. + * + * @param errorId + * reference to the error identifier + * @param message + * additional message + * @return DecodingException + */ + public static DecodingException createDecodingException(final ErrorKeys errorId, + final String message) + { + + return new DecodingException(errorId.toString() + ":\r\n" + message); + } + + /** + * Creates a DecodingException object. + * + * @param errorId + * reference to the error identifier + * @param message + * additional message + * @param e + * inner exception + * @return DecodingException + */ + public static DecodingException createDecodingException(final ErrorKeys errorId, + final String message, final Exception e) + { + return new DecodingException(errorId.toString() + ":\r\n" + message, e); + } + + /* + * +UNCOMPRESSED+CONSUMER+EXCEPTION++++++++++++++++++++++++++++++++++++++++++++++++++++ + + */ + + /** + * Creates a SQLConsumerException object. 
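To illustrate how the factory above is used at a call site, a minimal sketch: the verify() method and its arguments are invented for the example, while the factory method, the ErrorKeys constant, and the package name are taken from the surrounding hunks.

    import org.dkpro.jwpl.revisionmachine.common.exceptions.DiffException;
    import org.dkpro.jwpl.revisionmachine.common.exceptions.ErrorFactory;
    import org.dkpro.jwpl.revisionmachine.common.exceptions.ErrorKeys;

    final class ErrorFactorySketch {

        // Raises a diff-verification failure through the central factory,
        // which prefixes the message with the ErrorKeys constant.
        static void verify(boolean diffMatches, String articleInfo) throws DiffException {
            if (!diffMatches) {
                throw ErrorFactory.createDiffException(
                        ErrorKeys.DIFFTOOL_DIFFCONSUMER_DIFF_VERIFICATION_FAILED, articleInfo);
            }
        }
    }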
+ * + * @param errorId + * reference to the error identifier + * @param e + * inner exception + * @return SQLConsumerException + */ + public static SQLConsumerException createSQLConsumerException(final ErrorKeys errorId, + final Exception e) + { + return new SQLConsumerException(errorId.toString(), e); + } + + /** + * Creates a SQLConsumerException object. + * + * @param errorId + * reference to the error identifier + * @param message + * additional message + * @return SQLConsumerException + */ + public static SQLConsumerException createSQLConsumerException(final ErrorKeys errorId, + final String message) + { + return new SQLConsumerException(errorId.toString() + ":\r\n" + message); + } + + /** + * Creates a SQLConsumerException object. + * + * @param errorId + * reference to the error identifier + * @param message + * additional message + * @param e + * inner exception + * @return SQLConsumerException + */ + public static SQLConsumerException createSQLConsumerException(final ErrorKeys errorId, + final String message, final Exception e) + { + return new SQLConsumerException(errorId.toString() + ":\r\n" + message, e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorKeys.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorKeys.java index b9360a0d..d3e50832 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorKeys.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/ErrorKeys.java @@ -20,55 +20,61 @@ /** * This class contains an enumeration of the possible error sources. */ -public enum ErrorKeys { +public enum ErrorKeys +{ - /** - * The configuration manager has not been created - */ - CONFIGURATION_CONFIGURATIONMANAGER_NOT_INITIALIZED, + /** + * The configuration manager has not been created + */ + CONFIGURATION_CONFIGURATIONMANAGER_NOT_INITIALIZED, - /** - * An unknown configuration parameter was requested - */ - CONFIGURATION_CONFIGURATIONMANAGER_UNKNOWN_CONFIG_PARAMETER, + /** + * An unknown configuration parameter was requested + */ + CONFIGURATION_CONFIGURATIONMANAGER_UNKNOWN_CONFIG_PARAMETER, - /** - * An undefined parameter was requested - */ - CONFIGURATION_PARAMETER_UNDEFINED, + /** + * An undefined parameter was requested + */ + CONFIGURATION_PARAMETER_UNDEFINED, - /** - * An IOException occurred while parsing the xml input - */ - DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_IOEXCEPTION, + /** + * An IOException occurred while parsing the xml input + */ + DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_IOEXCEPTION, - /** - * An keyword was found were it was not supposed to be - */ - DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_KEYWORD, + /** + * An keyword was found were it was not supposed to be + */ + DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_KEYWORD, - /** - * The end of the file was reached, but the parsing process was not finished - */ - DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE, + /** + * The end of the file was reached, but the parsing process was not finished + */ + DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE, - DELTA_CONSUMERS_TASK_READER_INPUTFACTORY_ILLEGAL_INPUTMODE_VALUE, + DELTA_CONSUMERS_TASK_READER_INPUTFACTORY_ILLEGAL_INPUTMODE_VALUE, - DELTA_CONSUMERS_SQL_CODEC_BITREADER_READ_OPERATION_OUT_OF_RANGE, 
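For context, a minimal caller-side sketch of how ErrorFactory and ErrorKeys combine. It is not part of this patch; the requireParameter helper and the import paths are assumptions for illustration only.

    import org.dkpro.jwpl.revisionmachine.common.exceptions.ConfigurationException;
    import org.dkpro.jwpl.revisionmachine.common.exceptions.ErrorFactory;
    import org.dkpro.jwpl.revisionmachine.common.exceptions.ErrorKeys;

    public class ErrorFactoryUsageSketch
    {
        // Hypothetical helper: reject a missing configuration value with a keyed exception.
        public static String requireParameter(final String name, final String value)
            throws ConfigurationException
        {
            if (value == null) {
                // The ErrorKeys constant names the error source; ErrorFactory appends
                // the free-form message after the key.
                throw ErrorFactory.createConfigurationException(
                        ErrorKeys.CONFIGURATION_PARAMETER_UNDEFINED,
                        "missing configuration parameter: " + name);
            }
            return value;
        }
    }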
DELTA_CONSUMERS_SQL_CODEC_BITREADER_READ_OPERATION_AFTER_END_OF_STREAM, + DELTA_CONSUMERS_SQL_CODEC_BITREADER_READ_OPERATION_OUT_OF_RANGE, + DELTA_CONSUMERS_SQL_CODEC_BITREADER_READ_OPERATION_AFTER_END_OF_STREAM, - DELTA_CONSUMERS_SQL_CODEC_BITWRITER_WRITE_OPERATOR_OUT_OF_RANGE, DELTA_CONSUMERS_SQL_CODEC_BITWRITER_INVALID_WRITE_OPERATION, + DELTA_CONSUMERS_SQL_CODEC_BITWRITER_WRITE_OPERATOR_OUT_OF_RANGE, + DELTA_CONSUMERS_SQL_CODEC_BITWRITER_INVALID_WRITE_OPERATION, - DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE, + DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE, - DIFFTOOL_DIFFCONSUMER_DIFF_VERIFICATION_FAILED, + DIFFTOOL_DIFFCONSUMER_DIFF_VERIFICATION_FAILED, - DIFFTOOL_SQLCONSUMER_ENCODING_VERIFICATION_FAILED, DIFFTOOL_SQLCONSUMER_DATABASEWRITER_EXCEPTION, DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, + DIFFTOOL_SQLCONSUMER_ENCODING_VERIFICATION_FAILED, + DIFFTOOL_SQLCONSUMER_DATABASEWRITER_EXCEPTION, DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, - DIFFTOOL_ENCODING_INVALID_VALUE, DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, + DIFFTOOL_ENCODING_INVALID_VALUE, DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, - DIFFTOOL_DECODING_INVALID_VALUE, DIFFTOOL_DECODING_VALUE_OUT_OF_RANGE, DIFFTOOL_DECODING_UNEXPECTED_END_OF_STREAM, + DIFFTOOL_DECODING_INVALID_VALUE, DIFFTOOL_DECODING_VALUE_OUT_OF_RANGE, + DIFFTOOL_DECODING_UNEXPECTED_END_OF_STREAM, - LOGGING_LOGGER_INITIALIZISATION_FAILED, LOGGING_LOGGINGFACTORY_NO_SUCH_LOGGER, LOGGING_LOGGINGFACTORY_LOGGER_ALREADY_EXIST, + LOGGING_LOGGER_INITIALIZISATION_FAILED, LOGGING_LOGGINGFACTORY_NO_SUCH_LOGGER, + LOGGING_LOGGINGFACTORY_LOGGER_ALREADY_EXIST, - ABSTRACT_CONSUMER_TIMEOUT + ABSTRACT_CONSUMER_TIMEOUT } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/LoggingException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/LoggingException.java index 01cd6d6f..ad7eba0d 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/LoggingException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/LoggingException.java @@ -21,33 +21,42 @@ * LoggingException Describes an exception that occurred during the logging. */ @SuppressWarnings("serial") -public class LoggingException extends Exception { +public class LoggingException + extends Exception +{ - /** - * Creates a new LoggingException. - * - * @param description message - */ - public LoggingException(final String description) { - super(description); - } + /** + * Creates a new LoggingException. + * + * @param description + * message + */ + public LoggingException(final String description) + { + super(description); + } - /** - * Creates a new LoggingException. - * - * @param e inner exception - */ - public LoggingException(final Exception e) { - super(e); - } + /** + * Creates a new LoggingException. + * + * @param e + * inner exception + */ + public LoggingException(final Exception e) + { + super(e); + } - /** - * Creates a new LoggingException. - * - * @param description message - * @param e inner exception - */ - public LoggingException(final String description, final Exception e) { - super(description, e); - } + /** + * Creates a new LoggingException. 
+ * + * @param description + * message + * @param e + * inner exception + */ + public LoggingException(final String description, final Exception e) + { + super(description, e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/SQLConsumerException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/SQLConsumerException.java index 6a0de717..5d121bd8 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/SQLConsumerException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/SQLConsumerException.java @@ -18,38 +18,45 @@ package org.dkpro.jwpl.revisionmachine.common.exceptions; /** - * SQLConsumerException Describes an exception that occurred while accessing the - * database. + * SQLConsumerException Describes an exception that occurred while accessing the database. */ @SuppressWarnings("serial") public class SQLConsumerException - extends Exception { + extends Exception +{ - /** - * Creates a new SQLConsumerException. - * - * @param description message - */ - public SQLConsumerException(final String description) { - super(description); - } + /** + * Creates a new SQLConsumerException. + * + * @param description + * message + */ + public SQLConsumerException(final String description) + { + super(description); + } - /** - * Creates a new SQLConsumerException. - * - * @param e inner exception - */ - public SQLConsumerException(final Exception e) { - super(e); - } + /** + * Creates a new SQLConsumerException. + * + * @param e + * inner exception + */ + public SQLConsumerException(final Exception e) + { + super(e); + } - /** - * Creates a new SQLConsumerException. - * - * @param description message - * @param e inner exception - */ - public SQLConsumerException(final String description, final Exception e) { - super(description, e); - } + /** + * Creates a new SQLConsumerException. + * + * @param description + * message + * @param e + * inner exception + */ + public SQLConsumerException(final String description, final Exception e) + { + super(description, e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/TimeoutException.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/TimeoutException.java index 12097f50..c057ecce 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/TimeoutException.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/exceptions/TimeoutException.java @@ -18,38 +18,45 @@ package org.dkpro.jwpl.revisionmachine.common.exceptions; /** - * TimeoutException Describes an exception that occurred because of a timeout - * event. + * TimeoutException Describes an exception that occurred because of a timeout event. */ @SuppressWarnings("serial") public class TimeoutException - extends Exception { + extends Exception +{ - /** - * Creates a new TimeoutException. - * - * @param description message - */ - public TimeoutException(final String description) { - super(description); - } + /** + * Creates a new TimeoutException. + * + * @param description + * message + */ + public TimeoutException(final String description) + { + super(description); + } - /** - * Creates a new TimeoutException. 
- * - * @param e inner exception - */ - public TimeoutException(final Exception e) { - super(e); - } + /** + * Creates a new TimeoutException. + * + * @param e + * inner exception + */ + public TimeoutException(final Exception e) + { + super(e); + } - /** - * Creates a new TimeoutException. - * - * @param description message - * @param e inner exception - */ - public TimeoutException(final String description, final Exception e) { - super(description, e); - } + /** + * Creates a new TimeoutException. + * + * @param description + * message + * @param e + * inner exception + */ + public TimeoutException(final String description, final Exception e) + { + super(description, e); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/Logger.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/Logger.java index b056d1a5..e5249488 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/Logger.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/Logger.java @@ -30,230 +30,255 @@ /** * DiffTool Logger class */ -public class Logger { - - /** - * Name of the logger - */ - private final String consumerName; - - /** - * Reference to level of the logging - */ - private final Level logLevel; - - /** - * Type of the logger - */ - private final LoggerType type; - - /** - * Reference to the output writer - */ - private final FileWriter writer; - - /** - * Creates a new logger. - * - * @param type type - * @param consumerName name - * @throws LoggingException if an error occurred - */ - public Logger(final LoggerType type, final String consumerName) - throws LoggingException { - - try { - this.type = type; - this.consumerName = consumerName; - - ConfigurationManager config = ConfigurationManager.getInstance(); - String path = (String) config - .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); - - switch (type) { - case ARTICLE_OUTPUT: - logLevel = Level.INFO; - break; - case DIFF_TOOL_ERROR: - logLevel = Level.ERROR; - break; - case DIFF_TOOL: - logLevel = (Level) config - .getConfigParameter(ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL); - break; - default: - throw ErrorFactory - .createLoggingException(ErrorKeys.LOGGING_LOGGER_INITIALIZISATION_FAILED); - } - - this.writer = new FileWriter(path + consumerName + ".log"); - - } catch (Exception e) { - throw ErrorFactory.createLoggingException( - ErrorKeys.LOGGING_LOGGER_INITIALIZISATION_FAILED, e); - } - } - - /** - * Closes the output writer. - */ - public synchronized void close() { - try { - writer.close(); - } catch (IOException ioe) { - ioe.printStackTrace(); - } - } - - /** - * Flushes the buffered output of the writer to the file. - */ - public synchronized void flush() { - try { - writer.flush(); - } catch (IOException ioe) { - ioe.printStackTrace(); - } - } - - /** - * Returns the log level. - * - * @return log level - */ - public Level getLogLevel() { - return this.logLevel; - } - - /** - * Writes the given text to the output file. 
- * - * @param text log message - */ - private synchronized void log(final String text) { - - try { - this.writer.write(text); - } catch (IOException ioe) { - ioe.printStackTrace(); +public class Logger +{ + + /** + * Name of the logger + */ + private final String consumerName; + + /** + * Reference to level of the logging + */ + private final Level logLevel; + + /** + * Type of the logger + */ + private final LoggerType type; + + /** + * Reference to the output writer + */ + private final FileWriter writer; + + /** + * Creates a new logger. + * + * @param type + * type + * @param consumerName + * name + * @throws LoggingException + * if an error occurred + */ + public Logger(final LoggerType type, final String consumerName) throws LoggingException + { + + try { + this.type = type; + this.consumerName = consumerName; + + ConfigurationManager config = ConfigurationManager.getInstance(); + String path = (String) config + .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); + + switch (type) { + case ARTICLE_OUTPUT: + logLevel = Level.INFO; + break; + case DIFF_TOOL_ERROR: + logLevel = Level.ERROR; + break; + case DIFF_TOOL: + logLevel = (Level) config + .getConfigParameter(ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL); + break; + default: + throw ErrorFactory + .createLoggingException(ErrorKeys.LOGGING_LOGGER_INITIALIZISATION_FAILED); + } + + this.writer = new FileWriter(path + consumerName + ".log"); + + } + catch (Exception e) { + throw ErrorFactory + .createLoggingException(ErrorKeys.LOGGING_LOGGER_INITIALIZISATION_FAILED, e); + } } - } - - /** - * The occurred error with the related log level and message has to be given - * to this method. - * <p> - * This method will verify if the message should be logged or not. - * - * @param level log level - * @param message message - * @param e Error - */ - public void logError(final Level level, final String message, final Error e) { - try { - Logger errors = LoggingFactory - .getLogger(LoggingFactory.NAME_ERROR_LOGGER); - - errors.logThrowable(level, message, e); - - } catch (LoggingException ex) { - ex.printStackTrace(); + + /** + * Closes the output writer. + */ + public synchronized void close() + { + try { + writer.close(); + } + catch (IOException ioe) { + ioe.printStackTrace(); + } } - if (logLevel.toInt() > level.toInt()) { - return; + /** + * Flushes the buffered output of the writer to the file. + */ + public synchronized void flush() + { + try { + writer.flush(); + } + catch (IOException ioe) { + ioe.printStackTrace(); + } } - logThrowable(level, message, e); - } - - /** - * The occurred exception with the related log level and message has to be - * given to this method. - * <p> - * This method will verify if the message should be logged or not. - * - * @param level log level - * @param message message - * @param e Exception - */ - public void logException(final Level level, final String message, - final Exception e) { - - try { - Logger errors = LoggingFactory - .getLogger(LoggingFactory.NAME_ERROR_LOGGER); - - errors.logThrowable(level, message, e); - - } catch (LoggingException ex) { - ex.printStackTrace(); + /** + * Returns the log level. + * + * @return log level + */ + public Level getLogLevel() + { + return this.logLevel; } - if (logLevel.toInt() > level.toInt()) { - return; + /** + * Writes the given text to the output file. 
+ * + * @param text + * log message + */ + private synchronized void log(final String text) + { + + try { + this.writer.write(text); + } + catch (IOException ioe) { + ioe.printStackTrace(); + } } - logThrowable(level, message, e); - } - - /** - * This method will be called with a message and the related log level. It - * be verified if the message should be logged or not. - * <p> - * The format of the logged message is: \t consumerName [ Type of Logger ] - * \t message \r\n - * - * @param level level - * @param message message - */ - public synchronized void logMessage(final Level level, final String message) { - - if (logLevel.toInt() > level.toInt()) { - return; + /** + * The occurred error with the related log level and message has to be given to this method. + * <p> + * This method will verify if the message should be logged or not. + * + * @param level + * log level + * @param message + * message + * @param e + * Error + */ + public void logError(final Level level, final String message, final Error e) + { + try { + Logger errors = LoggingFactory.getLogger(LoggingFactory.NAME_ERROR_LOGGER); + + errors.logThrowable(level, message, e); + + } + catch (LoggingException ex) { + ex.printStackTrace(); + } + + if (logLevel.toInt() > level.toInt()) { + return; + } + + logThrowable(level, message, e); } - try { - this.writer.write(System.currentTimeMillis() + "\t" + consumerName - + " [" + type.toString() + "] " + "\t" + message + "\r\n"); - this.writer.flush(); - } catch (IOException ioe) { - ioe.printStackTrace(); + /** + * The occurred exception with the related log level and message has to be given to this method. + * <p> + * This method will verify if the message should be logged or not. + * + * @param level + * log level + * @param message + * message + * @param e + * Exception + */ + public void logException(final Level level, final String message, final Exception e) + { + + try { + Logger errors = LoggingFactory.getLogger(LoggingFactory.NAME_ERROR_LOGGER); + + errors.logThrowable(level, message, e); + + } + catch (LoggingException ex) { + ex.printStackTrace(); + } + + if (logLevel.toInt() > level.toInt()) { + return; + } + + logThrowable(level, message, e); } - } - - /** - * The occurred error or exception with the related log level and message - * will be logged by this method. - * - * @param level log level - * @param message message - * @param t Throwable - */ - private synchronized void logThrowable(final Level level, - final String message, final Throwable t) { - - if (t != null) { - log("\r\n[" + System.currentTimeMillis() + "]\t" + message); - log("\r\n" + t); - log("\r\n"); - - for (StackTraceElement st : t.getStackTrace()) { - log("\t" + st.toString() + "\r\n"); - } - - Throwable c = t.getCause(); - if (c != null) { - - log("Caused by:\t" + c + "\r\n"); - - for (StackTraceElement st : c.getStackTrace()) { - log("\t" + st.toString() + "\r\n"); + + /** + * This method will be called with a message and the related log level. It be verified if the + * message should be logged or not. 
+ * <p> + * The format of the logged message is: \t consumerName [ Type of Logger ] \t message \r\n + * + * @param level + * level + * @param message + * message + */ + public synchronized void logMessage(final Level level, final String message) + { + + if (logLevel.toInt() > level.toInt()) { + return; } - } - log("\r\n"); - this.flush(); + try { + this.writer.write(System.currentTimeMillis() + "\t" + consumerName + " [" + + type.toString() + "] " + "\t" + message + "\r\n"); + this.writer.flush(); + } + catch (IOException ioe) { + ioe.printStackTrace(); + } + } + + /** + * The occurred error or exception with the related log level and message will be logged by this + * method. + * + * @param level + * log level + * @param message + * message + * @param t + * Throwable + */ + private synchronized void logThrowable(final Level level, final String message, + final Throwable t) + { + + if (t != null) { + log("\r\n[" + System.currentTimeMillis() + "]\t" + message); + log("\r\n" + t); + log("\r\n"); + + for (StackTraceElement st : t.getStackTrace()) { + log("\t" + st.toString() + "\r\n"); + } + + Throwable c = t.getCause(); + if (c != null) { + + log("Caused by:\t" + c + "\r\n"); + + for (StackTraceElement st : c.getStackTrace()) { + log("\t" + st.toString() + "\r\n"); + } + } + + log("\r\n"); + this.flush(); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggerType.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggerType.java index e62dea34..4cb8d98c 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggerType.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggerType.java @@ -20,55 +20,56 @@ /** * This class contains all keys for diff tool loggers. 
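To make the format string above concrete: logMessage writes the current time in milliseconds, the consumer name, the logger type in brackets and the message, separated by tabs and terminated by CRLF, and only when the level check passes (the method returns early when logLevel.toInt() > level.toInt()). A record in the per-consumer log file therefore looks roughly as follows; the timestamp and consumer name are illustrative, and <TAB>/<CRLF> mark the literal separators.

    1698753296000<TAB>DiffToolMain [DIFF_TOOL] <TAB>DiffTool initialized [LogLevel: INFO]<CRLF>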
*/ -public enum LoggerType { +public enum LoggerType +{ - /** - * DiffTool Error Logger - */ - DIFF_TOOL_ERROR, + /** + * DiffTool Error Logger + */ + DIFF_TOOL_ERROR, - /** - * DiffTool Logger - */ - DIFF_TOOL, + /** + * DiffTool Logger + */ + DIFF_TOOL, - /** - * Article Output Logger - */ - ARTICLE_OUTPUT, + /** + * Article Output Logger + */ + ARTICLE_OUTPUT, - /** - * UNCOMPRESSED Consumer Logger - */ - CONSUMER_SQL, + /** + * UNCOMPRESSED Consumer Logger + */ + CONSUMER_SQL, - /** - * Diff Consumer Logger - */ - CONSUMER_DIFF, + /** + * Diff Consumer Logger + */ + CONSUMER_DIFF, - /** - * Task Consumer Logger - */ - CONSUMER_TASK, + /** + * Task Consumer Logger + */ + CONSUMER_TASK, - /** - * Artcile Producer Logger - */ - PRODUCER_ARTICLES, + /** + * Artcile Producer Logger + */ + PRODUCER_ARTICLES, - /** - * Producer Archives Logger - */ - PRODUCER_ARCHIVES, + /** + * Producer Archives Logger + */ + PRODUCER_ARCHIVES, - /** - * Diff Producer Logger - */ - PRODUCER_DIFFS, + /** + * Diff Producer Logger + */ + PRODUCER_DIFFS, - /** - * Consumer Producer Logger - */ - PRODUCER_CONSUMERS + /** + * Consumer Producer Logger + */ + PRODUCER_CONSUMERS } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggingFactory.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggingFactory.java index 1568b1ac..a48e288f 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggingFactory.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/LoggingFactory.java @@ -26,77 +26,83 @@ /** * The static references in this 'class' creates and controls all loggers. */ -public class LoggingFactory { +public class LoggingFactory +{ - /** - * Reference Map Consumer(-Name) -> Logger - */ - private static final HashMap<String, Logger> consumerLoggingIndex; + /** + * Reference Map Consumer(-Name) -> Logger + */ + private static final HashMap<String, Logger> consumerLoggingIndex; - /** - * Name for the DiffTool Output Logger - */ - public final static String NAME_ARTICLE_OUTPUT_LOGGER = "DiffToolOutput"; + /** + * Name for the DiffTool Output Logger + */ + public final static String NAME_ARTICLE_OUTPUT_LOGGER = "DiffToolOutput"; - /** - * Name for the DiffTool Error Logger - */ - public final static String NAME_ERROR_LOGGER = "DiffToolErrors"; + /** + * Name for the DiffTool Error Logger + */ + public final static String NAME_ERROR_LOGGER = "DiffToolErrors"; - /* Creates the static logging factory components */ - static { - consumerLoggingIndex = new HashMap<>(); + /* Creates the static logging factory components */ + static { + consumerLoggingIndex = new HashMap<>(); - try { - createLogger(LoggerType.DIFF_TOOL_ERROR, NAME_ERROR_LOGGER); - createLogger(LoggerType.ARTICLE_OUTPUT, NAME_ARTICLE_OUTPUT_LOGGER); - } catch (LoggingException e) { - e.printStackTrace(); - System.exit(-1); + try { + createLogger(LoggerType.DIFF_TOOL_ERROR, NAME_ERROR_LOGGER); + createLogger(LoggerType.ARTICLE_OUTPUT, NAME_ARTICLE_OUTPUT_LOGGER); + } + catch (LoggingException e) { + e.printStackTrace(); + System.exit(-1); + } } - } - /** - * No class - */ - private LoggingFactory() { - } + /** + * No class + */ + private LoggingFactory() + { + } + + /** + * Creates a new Logger. 
+ * + * @param consumerName + * Consumer Name + * @return The referenced Logger + * @throws LoggingException + */ + public static Logger createLogger(final LoggerType type, final String consumerName) + throws LoggingException + { - /** - * Creates a new Logger. - * - * @param consumerName Consumer Name - * @return The referenced Logger - * @throws LoggingException - */ - public static Logger createLogger(final LoggerType type, final String consumerName) - throws LoggingException { + Logger log = new Logger(type, consumerName); + if (consumerLoggingIndex.put(consumerName, log) != null) { + throw ErrorFactory + .createLoggingException(ErrorKeys.LOGGING_LOGGINGFACTORY_LOGGER_ALREADY_EXIST); + } - Logger log = new Logger(type, consumerName); - if (consumerLoggingIndex.put(consumerName, log) != null) { - throw ErrorFactory - .createLoggingException(ErrorKeys.LOGGING_LOGGINGFACTORY_LOGGER_ALREADY_EXIST); + return log; } - return log; - } + /** + * Returns an already created Logger. + * + * @param consumerName + * Consumer Name + * @return The referenced Logger + * @throws LoggingException + */ + public static Logger getLogger(final String consumerName) throws LoggingException + { - /** - * Returns an already created Logger. - * - * @param consumerName Consumer Name - * @return The referenced Logger - * @throws LoggingException - */ - public static Logger getLogger(final String consumerName) - throws LoggingException { + Logger log = consumerLoggingIndex.get(consumerName); + if (log == null) { + throw ErrorFactory + .createLoggingException(ErrorKeys.LOGGING_LOGGINGFACTORY_NO_SUCH_LOGGER); + } - Logger log = consumerLoggingIndex.get(consumerName); - if (log == null) { - throw ErrorFactory - .createLoggingException(ErrorKeys.LOGGING_LOGGINGFACTORY_NO_SUCH_LOGGER); + return log; } - - return log; - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/DiffToolLogMessages.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/DiffToolLogMessages.java index 40f2dfa7..c97c4f89 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/DiffToolLogMessages.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/DiffToolLogMessages.java @@ -24,82 +24,102 @@ /** * This class contains the english localized log messages for DiffTool. * <p> + * * @deprecated To be removed without replacement. */ // TODO: This file should be replaced with resource files. @Deprecated(since = "1.1", forRemoval = true) -public class DiffToolLogMessages { +public class DiffToolLogMessages +{ - /** - * No object - utility class - */ - private DiffToolLogMessages() { - } + /** + * No object - utility class + */ + private DiffToolLogMessages() + { + } - /** - * Logs the start of the diff tool. - * - * @param logger reference to the logger - */ - public static void logInitialization(final Logger logger) { - logger.logMessage(Level.INFO, "DiffTool initialized [LogLevel: " - + logger.getLogLevel() + "]"); - } + /** + * Logs the start of the diff tool. + * + * @param logger + * reference to the logger + */ + public static void logInitialization(final Logger logger) + { + logger.logMessage(Level.INFO, + "DiffTool initialized [LogLevel: " + logger.getLogLevel() + "]"); + } - /** - * Logs the status of the diff tool. 
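A short sketch of the factory contract above: a logger is registered once under a consumer name and later looked up by that name; registering the same name twice or looking up an unknown name fails with a keyed LoggingException. Not part of this patch, and it presupposes a configured DiffTool run, since the output path and DIFF_TOOL log level come from ConfigurationManager.

    import org.dkpro.jwpl.revisionmachine.common.exceptions.LoggingException;
    import org.dkpro.jwpl.revisionmachine.common.logging.Logger;
    import org.dkpro.jwpl.revisionmachine.common.logging.LoggerType;
    import org.dkpro.jwpl.revisionmachine.common.logging.LoggingFactory;

    public class LoggingFactoryUsageSketch
    {
        public static Logger obtainConsumerLogger(final String consumerName)
            throws LoggingException
        {
            // Registers a per-consumer logger; LOGGING_PATH_DIFFTOOL and
            // LOGGING_LOGLEVEL_DIFFTOOL must already be configured.
            LoggingFactory.createLogger(LoggerType.DIFF_TOOL, consumerName);

            // Later lookups by the same name return the registered instance;
            // an unknown name raises LOGGING_LOGGINGFACTORY_NO_SUCH_LOGGER,
            // a duplicate registration LOGGING_LOGGINGFACTORY_LOGGER_ALREADY_EXIST.
            return LoggingFactory.getLogger(consumerName);
        }
    }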
- * - * @param logger reference to the logger - * @param time time since start - * @param articleConsumer number of active article consumers - * @param diffConsumer number of active diff consumers - * @param sqlConsumer number of active sql consumers - * @param archiveState state of the arcive producer - * @param articleState state of the article producer - * @param diffState state of the diff producer - */ - public static void logStatus(final Logger logger, final long time, - final int articleConsumer, final int diffConsumer, - final int sqlConsumer, final boolean archiveState, - final boolean articleState, final boolean diffState) { + /** + * Logs the status of the diff tool. + * + * @param logger + * reference to the logger + * @param time + * time since start + * @param articleConsumer + * number of active article consumers + * @param diffConsumer + * number of active diff consumers + * @param sqlConsumer + * number of active sql consumers + * @param archiveState + * state of the arcive producer + * @param articleState + * state of the article producer + * @param diffState + * state of the diff producer + */ + public static void logStatus(final Logger logger, final long time, final int articleConsumer, + final int diffConsumer, final int sqlConsumer, final boolean archiveState, + final boolean articleState, final boolean diffState) + { - logger.logMessage(Level.INFO, - "\r\nDiffTool-Status-Report [" + Time.toClock(time) + "]" - + "\r\nConsumerProducer \t[" + articleConsumer + " | " - + diffConsumer + " | " + sqlConsumer + "]" - + "\r\nArchiveProducer\t" + archiveState - + "\r\nArticleProducer\t" + articleState - + "\r\nDiffProducer \t" + diffState + "\r\n"); - } + logger.logMessage(Level.INFO, + "\r\nDiffTool-Status-Report [" + Time.toClock(time) + "]" + + "\r\nConsumerProducer \t[" + articleConsumer + " | " + diffConsumer + + " | " + sqlConsumer + "]" + "\r\nArchiveProducer\t" + archiveState + + "\r\nArticleProducer\t" + articleState + "\r\nDiffProducer \t" + + diffState + "\r\n"); + } - /** - * Logs an exception. - * - * @param logger reference to the logger - * @param e reference to the exception - */ - public static void logException(final Logger logger, final Exception e) { - logger.logException(Level.ERROR, "Unexpected Exception", e); - } + /** + * Logs an exception. + * + * @param logger + * reference to the logger + * @param e + * reference to the exception + */ + public static void logException(final Logger logger, final Exception e) + { + logger.logException(Level.ERROR, "Unexpected Exception", e); + } - /** - * Logs an error. - * - * @param logger reference to the logger - * @param e reference to the error - */ - public static void logError(final Logger logger, final Error e) { - logger.logError(Level.ERROR, "Unexpected Error", e); - } + /** + * Logs an error. + * + * @param logger + * reference to the logger + * @param e + * reference to the error + */ + public static void logError(final Logger logger, final Error e) + { + logger.logError(Level.ERROR, "Unexpected Error", e); + } - /** - * Logs the shutdown of the logger. - * - * @param logger reference to the logger - * @param endTime time since start - */ - public static void logShutdown(final Logger logger, final long endTime) { - logger.logMessage(Level.INFO, - "DiffTool initiates SHUTDOWN\t" + Time.toClock(endTime)); - } + /** + * Logs the shutdown of the logger. 
+ * + * @param logger + * reference to the logger + * @param endTime + * time since start + */ + public static void logShutdown(final Logger logger, final long endTime) + { + logger.logMessage(Level.INFO, "DiffTool initiates SHUTDOWN\t" + Time.toClock(endTime)); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ArticleConsumerLogMessages.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ArticleConsumerLogMessages.java index 9c16a255..da6d8aac 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ArticleConsumerLogMessages.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ArticleConsumerLogMessages.java @@ -31,194 +31,248 @@ /** * This class contains the english localized log messages for ArticleConsumers. * <p> + * * @deprecated To be removed without replacement. */ // TODO: This file should be replaced with resource files. @Deprecated(since = "1.1", forRemoval = true) -public final class ArticleConsumerLogMessages { - - /** - * Logs the retrieval of an archive descriptor. - * - * @param logger reference to the logger - * @param archive reference to the archive descriptor - */ - public static void logArchiveRetrieved(final Logger logger, final ArchiveDescription archive) { - - logger.logMessage(Level.INFO, "Retrieved archive " + archive.toString() + " successfully"); - } - - /** - * Logs the reading of an revision task. - * - * @param logger reference to the logger - * @param article reference to the revision task - * @param time time needed for the operation - */ - public static void logArticleRead(final Logger logger, final Task<Revision> article, final long time) { - - logger.logMessage(Level.INFO, "Read article\t" + Time.toClock(time) - + "\t" + article.toString()); - } - - /** - * Logs the reading of an revision task. - * - * @param logger reference to the logger - * @param article reference to the revision task - * @param time time needed for the operation - * @param position input file position - */ - public static void logArticleRead(final Logger logger, final Task<Revision> article, - final long time, final long position) { - - logger.logMessage(Level.INFO, "Read article\t" + Time.toClock(time) - + "\t" + article.toString() + "\t" + position); - } - - /** - * Logs the occurrence of an error while retrieving the input file. - * - * @param logger reference to the logger - * @param archive reference to the archive - * @param e reference to the error - */ - public static void logErrorRetrieveArchive(final Logger logger, final ArchiveDescription archive, final Error e) { - - logger.logError(Level.ERROR, "Error while accessing archive " + archive.toString(), e); - } - - /** - * Logs the occurrence of an exception while retrieving the input file. - * - * @param logger reference to the logger - * @param archive reference to the archive - * @param e reference to the exception - */ - public static void logExceptionRetrieveArchive(final Logger logger, final ArchiveDescription archive, - final Exception e) { - - logger.logException(Level.ERROR, "Exception while accessing archive " + archive.toString(), e); - } - - /** - * Logs the occurrence of an invalid task type. 
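Since the deprecated helpers above are thin wrappers around Logger, here is a compact sketch of how the DiffTool main loop is expected to drive them; the class and consumer names are illustrative and not taken from this patch.

    import org.dkpro.jwpl.revisionmachine.common.exceptions.LoggingException;
    import org.dkpro.jwpl.revisionmachine.common.logging.Logger;
    import org.dkpro.jwpl.revisionmachine.common.logging.LoggerType;
    import org.dkpro.jwpl.revisionmachine.common.logging.LoggingFactory;
    import org.dkpro.jwpl.revisionmachine.common.logging.messages.DiffToolLogMessages;

    public class DiffToolLoggingSketch
    {
        public static void run()
            throws LoggingException
        {
            long start = System.currentTimeMillis();
            Logger logger = LoggingFactory.createLogger(LoggerType.DIFF_TOOL, "DiffTool");

            DiffToolLogMessages.logInitialization(logger);
            try {
                // ... produce and consume revision tasks ...
            }
            catch (Exception e) {
                DiffToolLogMessages.logException(logger, e);
            }
            finally {
                DiffToolLogMessages.logShutdown(logger, System.currentTimeMillis() - start);
                logger.close();
            }
        }
    }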
- * - * @param logger reference to the logger - * @param type type of task - */ - public static void logInvalidTaskType(final Logger logger, final TaskTypes type) { - - logger.logMessage(Level.INFO, "Invalid TaskType: " + type); - } - - /** - * Logs that no more archives are available. - * - * @param logger reference to the logger - */ - public static void logNoMoreArchives(final Logger logger) { - - logger.logMessage(Level.INFO, "Consumer initiates SHUTDOWN: no more archives available."); - } - - /** - * Logs that no more articles are available. - * - * @param logger reference to the logger - * @param archive reference to the archive descriptor - */ - public static void logNoMoreArticles(final Logger logger, final ArchiveDescription archive) { - - logger.logMessage(Level.INFO, "Archive " + archive.toString() + " contains no more articles"); - } - - /** - * Logs an occurrence of an exception while reading a task. - * - * @param logger reference to the logger - * @param task reference to the task - * @param e reference to the exception - */ - public static void logReadTaskException(final Logger logger, final Task<Revision> task, final Exception e) { - - if (task != null) { - logger.logException(Level.ERROR, "Error while reading a task: " + task, e); - } else { - logger.logException(Level.ERROR, "Error while reading an unknown task", e); +public final class ArticleConsumerLogMessages +{ + + /** + * Logs the retrieval of an archive descriptor. + * + * @param logger + * reference to the logger + * @param archive + * reference to the archive descriptor + */ + public static void logArchiveRetrieved(final Logger logger, final ArchiveDescription archive) + { + + logger.logMessage(Level.INFO, "Retrieved archive " + archive.toString() + " successfully"); } - } - - /** - * Logs an occurrence of an OutOfMemoryError while reading a task. - * - * @param logger reference to the logger - * @param task reference to the task - * @param e reference to the error - */ - public static void logReadTaskOutOfMemoryError(final Logger logger, - final Task<Revision> task, final OutOfMemoryError e) { - - if (task != null) { - logger.logError(Level.WARN, "Error while reading a task: " + task, e); - } else { - logger.logError(Level.WARN, "Error while reading an unknown task", e); + + /** + * Logs the reading of an revision task. + * + * @param logger + * reference to the logger + * @param article + * reference to the revision task + * @param time + * time needed for the operation + */ + public static void logArticleRead(final Logger logger, final Task<Revision> article, + final long time) + { + + logger.logMessage(Level.INFO, + "Read article\t" + Time.toClock(time) + "\t" + article.toString()); + } + + /** + * Logs the reading of an revision task. + * + * @param logger + * reference to the logger + * @param article + * reference to the revision task + * @param time + * time needed for the operation + * @param position + * input file position + */ + public static void logArticleRead(final Logger logger, final Task<Revision> article, + final long time, final long position) + { + + logger.logMessage(Level.INFO, "Read article\t" + Time.toClock(time) + "\t" + + article.toString() + "\t" + position); + } + + /** + * Logs the occurrence of an error while retrieving the input file. 
+ * + * @param logger + * reference to the logger + * @param archive + * reference to the archive + * @param e + * reference to the error + */ + public static void logErrorRetrieveArchive(final Logger logger, + final ArchiveDescription archive, final Error e) + { + + logger.logError(Level.ERROR, "Error while accessing archive " + archive.toString(), e); + } + + /** + * Logs the occurrence of an exception while retrieving the input file. + * + * @param logger + * reference to the logger + * @param archive + * reference to the archive + * @param e + * reference to the exception + */ + public static void logExceptionRetrieveArchive(final Logger logger, + final ArchiveDescription archive, final Exception e) + { + + logger.logException(Level.ERROR, "Exception while accessing archive " + archive.toString(), + e); } - } - - /** - * Logs the failed retrieval of an archive descriptor. - * - * @param logger reference to the logger - */ - public static void logRetrieveArchiveFailed(final Logger logger) { - - logger.logMessage(Level.WARN, "Consumer failed to obtain an archive"); - } - - /** - * Logs the status of the article consumer. - * - * @param logger reference to the logger - * @param articleReader reference to the ArticleReader - * @param startTime start time - * @param sleepingTime time the consumer has slept - * @param workingTime time the consumer was working - */ - public static void logStatus(final Logger logger, - final ArticleReaderInterface articleReader, final long startTime, - final long sleepingTime, final long workingTime) { - - String message = "Consumer-Status-Report [" - + Time.toClock(System.currentTimeMillis() - startTime) + "]"; - - if (articleReader != null) { - message += "\tPOSITION <" + articleReader.getBytePosition() + ">"; + + /** + * Logs the occurrence of an invalid task type. + * + * @param logger + * reference to the logger + * @param type + * type of task + */ + public static void logInvalidTaskType(final Logger logger, final TaskTypes type) + { + + logger.logMessage(Level.INFO, "Invalid TaskType: " + type); + } + + /** + * Logs that no more archives are available. + * + * @param logger + * reference to the logger + */ + public static void logNoMoreArchives(final Logger logger) + { + + logger.logMessage(Level.INFO, "Consumer initiates SHUTDOWN: no more archives available."); + } + + /** + * Logs that no more articles are available. + * + * @param logger + * reference to the logger + * @param archive + * reference to the archive descriptor + */ + public static void logNoMoreArticles(final Logger logger, final ArchiveDescription archive) + { + + logger.logMessage(Level.INFO, + "Archive " + archive.toString() + " contains no more articles"); } - message += "\tEFFICIENCY\t " - + MathUtilities.percentPlus(workingTime, sleepingTime) - + "\tWORK [" + Time.toClock(workingTime) + "]" + "\tSLEEP [" - + Time.toClock(sleepingTime) + "]"; - - logger.logMessage(Level.DEBUG, message); - } - - /** - * Logs the occurrence of an ArticleReaderException. - * - * @param logger reference to the logger - * @param e reference to the exception - */ - public static void logTaskReaderException(final Logger logger, - final ArticleReaderException e) { - - logger.logException(Level.ERROR, "TaskReaderException", e); - } - - /** - * No object - utility class - */ - private ArticleConsumerLogMessages() { - } + /** + * Logs an occurrence of an exception while reading a task. 
+ * + * @param logger + * reference to the logger + * @param task + * reference to the task + * @param e + * reference to the exception + */ + public static void logReadTaskException(final Logger logger, final Task<Revision> task, + final Exception e) + { + + if (task != null) { + logger.logException(Level.ERROR, "Error while reading a task: " + task, e); + } + else { + logger.logException(Level.ERROR, "Error while reading an unknown task", e); + } + } + + /** + * Logs an occurrence of an OutOfMemoryError while reading a task. + * + * @param logger + * reference to the logger + * @param task + * reference to the task + * @param e + * reference to the error + */ + public static void logReadTaskOutOfMemoryError(final Logger logger, final Task<Revision> task, + final OutOfMemoryError e) + { + + if (task != null) { + logger.logError(Level.WARN, "Error while reading a task: " + task, e); + } + else { + logger.logError(Level.WARN, "Error while reading an unknown task", e); + } + } + + /** + * Logs the failed retrieval of an archive descriptor. + * + * @param logger + * reference to the logger + */ + public static void logRetrieveArchiveFailed(final Logger logger) + { + + logger.logMessage(Level.WARN, "Consumer failed to obtain an archive"); + } + + /** + * Logs the status of the article consumer. + * + * @param logger + * reference to the logger + * @param articleReader + * reference to the ArticleReader + * @param startTime + * start time + * @param sleepingTime + * time the consumer has slept + * @param workingTime + * time the consumer was working + */ + public static void logStatus(final Logger logger, final ArticleReaderInterface articleReader, + final long startTime, final long sleepingTime, final long workingTime) + { + + String message = "Consumer-Status-Report [" + + Time.toClock(System.currentTimeMillis() - startTime) + "]"; + + if (articleReader != null) { + message += "\tPOSITION <" + articleReader.getBytePosition() + ">"; + } + + message += "\tEFFICIENCY\t " + MathUtilities.percentPlus(workingTime, sleepingTime) + + "\tWORK [" + Time.toClock(workingTime) + "]" + "\tSLEEP [" + + Time.toClock(sleepingTime) + "]"; + + logger.logMessage(Level.DEBUG, message); + } + + /** + * Logs the occurrence of an ArticleReaderException. + * + * @param logger + * reference to the logger + * @param e + * reference to the exception + */ + public static void logTaskReaderException(final Logger logger, final ArticleReaderException e) + { + + logger.logException(Level.ERROR, "TaskReaderException", e); + } + + /** + * No object - utility class + */ + private ArticleConsumerLogMessages() + { + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ConsumerLogMessages.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ConsumerLogMessages.java index 8c2b1fc3..7a284d92 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ConsumerLogMessages.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/ConsumerLogMessages.java @@ -26,142 +26,171 @@ /** * This class contains the english localized log messages for Consumers. * <p> + * * @deprecated To be removed without replacement. */ // TODO: This file should be replaced with resource files. @Deprecated(since = "1.1", forRemoval = true) -public final class ConsumerLogMessages { - - /** - * Logs the start of a consumer. 
- * - * @param logger reference to the logger - */ - public static void logConsumerRunning(final Logger logger) { - logger.logMessage(Level.INFO, "Consumer is up and running"); - } - - /** - * Logs an error. - * - * @param logger reference to the logger - * @param e reference to the error - */ - public static void logError(final Logger logger, final Error e) { - logger.logError(Level.ERROR, "Unexpected Error", e); - } - - /** - * Logs an exception. - * - * @param logger reference to the logger - * @param e reference to the exception - */ - public static void logException(final Logger logger, final Exception e) { - logger.logException(Level.ERROR, "Unexpected Exception", e); - } - - /** - * Logs the initialization of a consumer. - * - * @param logger reference to the logger - */ - public static void logInitialization(final Logger logger) { - logger.logMessage(Level.INFO, "Consumer initialized [LogLevel: " - + logger.getLogLevel() + "]"); - } - - /** - * Logs the receival of the kill signal. - * - * @param logger reference to the logger - */ - public static void logKillSignalMessage(final Logger logger) { - logger.logMessage(Level.INFO, "Consumer received KILL Signal"); - } - - /** - * Logs the receival of the ping signal. - * - * @param logger reference to the logger - */ - public static void logPingSignal(final Logger logger) { - logger.logMessage(Level.INFO, "Consumer received PING Signal"); - } - - /** - * Logs the shutdown of the consumer. - * - * @param logger reference to the logger - * @param endTime time - */ - public static void logShutdown(final Logger logger, final long endTime) { - logger.logMessage(Level.INFO, - "Consumer initiates SHUTDOWN\t" + Time.toClock(endTime)); - } - - /** - * Logs that the consumer is sleeping. - * - * @param logger reference to the logger - */ - public static void logSleep(final Logger logger) { - logger.logMessage(Level.DEBUG, "Consumer is sleeping"); - } - - /** - * Logs the receival of the start signal. - * - * @param logger reference to the logger - */ - public static void logStartSignalMessage(final Logger logger) { - logger.logMessage(Level.INFO, "Consumer received START Signal"); - } - - /** - * Logs the status of the consumer. - * - * @param logger reference to the logger - * @param startTime start time - * @param sleepingTime time the consumer has slept - * @param workingTime time the consumer was working - */ - public static void logStatus(final Logger logger, final long startTime, - final long sleepingTime, final long workingTime) { - - logger.logMessage(Level.DEBUG, - "Consumer-Status-Report [" - + Time.toClock(System.currentTimeMillis() - startTime) - + "]" + "\tEFFICIENCY\t " - + MathUtilities.percentPlus(workingTime, sleepingTime) - + "\tWORK [" + Time.toClock(workingTime) + "]" - + "\tSLEEP [" + Time.toClock(sleepingTime) + "]"); - } - - /** - * Logs the receival of the stop signal. - * - * @param logger reference to the logger - */ - public static void logStopSignal(final Logger logger) { - logger.logMessage(Level.INFO, "Consumer received STOP Signal"); - } - - /** - * Logs the occurrence of a TimeoutException. - * - * @param logger reference to the logger - * @param e reference to the exception - */ - public static void logTimeoutException(final Logger logger, - final TimeoutException e) { - - logger.logException(Level.WARN, "TimeoutException", e); - } - - /** - * No object - utility class - */ - private ConsumerLogMessages() { - } +public final class ConsumerLogMessages +{ + + /** + * Logs the start of a consumer. 
+ * + * @param logger + * reference to the logger + */ + public static void logConsumerRunning(final Logger logger) + { + logger.logMessage(Level.INFO, "Consumer is up and running"); + } + + /** + * Logs an error. + * + * @param logger + * reference to the logger + * @param e + * reference to the error + */ + public static void logError(final Logger logger, final Error e) + { + logger.logError(Level.ERROR, "Unexpected Error", e); + } + + /** + * Logs an exception. + * + * @param logger + * reference to the logger + * @param e + * reference to the exception + */ + public static void logException(final Logger logger, final Exception e) + { + logger.logException(Level.ERROR, "Unexpected Exception", e); + } + + /** + * Logs the initialization of a consumer. + * + * @param logger + * reference to the logger + */ + public static void logInitialization(final Logger logger) + { + logger.logMessage(Level.INFO, + "Consumer initialized [LogLevel: " + logger.getLogLevel() + "]"); + } + + /** + * Logs the receival of the kill signal. + * + * @param logger + * reference to the logger + */ + public static void logKillSignalMessage(final Logger logger) + { + logger.logMessage(Level.INFO, "Consumer received KILL Signal"); + } + + /** + * Logs the receival of the ping signal. + * + * @param logger + * reference to the logger + */ + public static void logPingSignal(final Logger logger) + { + logger.logMessage(Level.INFO, "Consumer received PING Signal"); + } + + /** + * Logs the shutdown of the consumer. + * + * @param logger + * reference to the logger + * @param endTime + * time + */ + public static void logShutdown(final Logger logger, final long endTime) + { + logger.logMessage(Level.INFO, "Consumer initiates SHUTDOWN\t" + Time.toClock(endTime)); + } + + /** + * Logs that the consumer is sleeping. + * + * @param logger + * reference to the logger + */ + public static void logSleep(final Logger logger) + { + logger.logMessage(Level.DEBUG, "Consumer is sleeping"); + } + + /** + * Logs the receival of the start signal. + * + * @param logger + * reference to the logger + */ + public static void logStartSignalMessage(final Logger logger) + { + logger.logMessage(Level.INFO, "Consumer received START Signal"); + } + + /** + * Logs the status of the consumer. + * + * @param logger + * reference to the logger + * @param startTime + * start time + * @param sleepingTime + * time the consumer has slept + * @param workingTime + * time the consumer was working + */ + public static void logStatus(final Logger logger, final long startTime, final long sleepingTime, + final long workingTime) + { + + logger.logMessage(Level.DEBUG, "Consumer-Status-Report [" + + Time.toClock(System.currentTimeMillis() - startTime) + "]" + "\tEFFICIENCY\t " + + MathUtilities.percentPlus(workingTime, sleepingTime) + "\tWORK [" + + Time.toClock(workingTime) + "]" + "\tSLEEP [" + Time.toClock(sleepingTime) + "]"); + } + + /** + * Logs the receival of the stop signal. + * + * @param logger + * reference to the logger + */ + public static void logStopSignal(final Logger logger) + { + logger.logMessage(Level.INFO, "Consumer received STOP Signal"); + } + + /** + * Logs the occurrence of a TimeoutException. 
+ * + * @param logger + * reference to the logger + * @param e + * reference to the exception + */ + public static void logTimeoutException(final Logger logger, final TimeoutException e) + { + + logger.logException(Level.WARN, "TimeoutException", e); + } + + /** + * No object - utility class + */ + private ConsumerLogMessages() + { + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/DiffConsumerLogMessages.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/DiffConsumerLogMessages.java index 410a609a..60e4b425 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/DiffConsumerLogMessages.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/DiffConsumerLogMessages.java @@ -28,116 +28,137 @@ /** * This class contains the english localized log messages for DiffConsumers. * <p> + * * @deprecated To be removed without replacement. */ // TODO: This file should be replaced with resource files. @Deprecated(since = "1.1", forRemoval = true) -public class DiffConsumerLogMessages { - - /** - * Logs the processing of a revision task. - * - * @param logger reference to the logger - * @param article reference to the revision task - * @param time time - */ - public static void logArticleProcessed(final Logger logger, - final Task<Revision> article, long time) { - - logger.logMessage(Level.INFO, "Generated Diff\t" + Time.toClock(time) - + "\t" + article.toString()); - } - - /** - * Logs the processing of a revision task. - * - * @param logger reference to the logger - * @param article reference to the revision task - * @param time time - * @param transmittingTime time that the transfer of data to the producer needed - */ - public static void logArticleProcessed(final Logger logger, - final Task<Revision> article, long time, long transmittingTime) { - - logger.logMessage(Level.INFO, - "Generated Diff\t" + Time.toClock(time) + "\t" - + Time.toClock(transmittingTime) + "\t" - + article.toString()); - } - - /** - * Logs the occurrence of a DiffException. - * - * @param logger reference to the logger - * @param e reference to the exception - */ - public static void logDiffException(final Logger logger, - final DiffException e) { - - logger.logException(Level.ERROR, "DiffException", e); - } - - /** - * Logs the receival of an end task. - * - * @param logger reference to the logger - */ - public static void logEndTaskReceived(final Logger logger) { - - logger.logMessage(Level.INFO, - "Consumer initiates SHUTDOWN: EndTask received"); - } - - /** - * Logs the occurrence of an invalid task type. - * - * @param logger reference to the logger - * @param type type of task - */ - public static void logInvalidTaskType(final Logger logger, - final TaskTypes type) { - - logger.logMessage(Level.INFO, "Invalid TaskType: " + type); - } - - /** - * Logs the occurrence of an TaskOutOfMemoryError while reading a revision - * task. 
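The per-consumer message helpers follow the same pattern; a rough sketch of how a consumer loop reports its efficiency through ConsumerLogMessages. The loop bound and the work/sleep placeholders are illustrative only.

    import org.dkpro.jwpl.revisionmachine.common.logging.Logger;
    import org.dkpro.jwpl.revisionmachine.common.logging.messages.consumer.ConsumerLogMessages;

    public class ConsumerLoopSketch
    {
        public static void consume(final Logger logger)
        {
            long start = System.currentTimeMillis();
            long sleeping = 0;
            long working = 0;

            ConsumerLogMessages.logConsumerRunning(logger);

            for (int i = 0; i < 3; i++) {
                long t0 = System.currentTimeMillis();
                // ... process one task ...
                working += System.currentTimeMillis() - t0;

                long s0 = System.currentTimeMillis();
                ConsumerLogMessages.logSleep(logger);
                // ... wait for the next task ...
                sleeping += System.currentTimeMillis() - s0;

                ConsumerLogMessages.logStatus(logger, start, sleeping, working);
            }

            ConsumerLogMessages.logShutdown(logger, System.currentTimeMillis() - start);
        }
    }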
- * - * @param logger reference to the logger - * @param task reference to the revision task - * @param e reference to the error - */ - public static void logReadTaskOutOfMemoryError(final Logger logger, - final Task<Revision> task, final OutOfMemoryError e) { - - if (task != null) { - logger.logError(Level.WARN, "Error while reading a task: " - + task, e); - } else { - logger.logError(Level.WARN, - "Error while reading an unknown task", e); +public class DiffConsumerLogMessages +{ + + /** + * Logs the processing of a revision task. + * + * @param logger + * reference to the logger + * @param article + * reference to the revision task + * @param time + * time + */ + public static void logArticleProcessed(final Logger logger, final Task<Revision> article, + long time) + { + + logger.logMessage(Level.INFO, + "Generated Diff\t" + Time.toClock(time) + "\t" + article.toString()); + } + + /** + * Logs the processing of a revision task. + * + * @param logger + * reference to the logger + * @param article + * reference to the revision task + * @param time + * time + * @param transmittingTime + * time that the transfer of data to the producer needed + */ + public static void logArticleProcessed(final Logger logger, final Task<Revision> article, + long time, long transmittingTime) + { + + logger.logMessage(Level.INFO, "Generated Diff\t" + Time.toClock(time) + "\t" + + Time.toClock(transmittingTime) + "\t" + article.toString()); + } + + /** + * Logs the occurrence of a DiffException. + * + * @param logger + * reference to the logger + * @param e + * reference to the exception + */ + public static void logDiffException(final Logger logger, final DiffException e) + { + + logger.logException(Level.ERROR, "DiffException", e); + } + + /** + * Logs the receival of an end task. + * + * @param logger + * reference to the logger + */ + public static void logEndTaskReceived(final Logger logger) + { + + logger.logMessage(Level.INFO, "Consumer initiates SHUTDOWN: EndTask received"); + } + + /** + * Logs the occurrence of an invalid task type. + * + * @param logger + * reference to the logger + * @param type + * type of task + */ + public static void logInvalidTaskType(final Logger logger, final TaskTypes type) + { + + logger.logMessage(Level.INFO, "Invalid TaskType: " + type); + } + + /** + * Logs the occurrence of an TaskOutOfMemoryError while reading a revision task. + * + * @param logger + * reference to the logger + * @param task + * reference to the revision task + * @param e + * reference to the error + */ + public static void logReadTaskOutOfMemoryError(final Logger logger, final Task<Revision> task, + final OutOfMemoryError e) + { + + if (task != null) { + logger.logError(Level.WARN, "Error while reading a task: " + task, e); + } + else { + logger.logError(Level.WARN, "Error while reading an unknown task", e); + } + } + + /** + * Logs the start of the processing of an revision task. + * + * @param logger + * reference to the logger + * @param article + * reference to the revision task + * @param time + * time + * @param transmittingTime + * time that the transfer of data to the producer needed + */ + public static void logStartArticleProcessing(final Logger logger, final Task<Revision> article, + long time, long transmittingTime) + { + + logger.logMessage(Level.TRACE, "Start Procssing Task\t" + article.toString()); + } + + /** + * No object - utility class + */ + private DiffConsumerLogMessages() + { } - } - - /** - * Logs the start of the processing of an revision task. 
- * - * @param logger reference to the logger - * @param article reference to the revision task - * @param time time - * @param transmittingTime time that the transfer of data to the producer needed - */ - public static void logStartArticleProcessing(final Logger logger, - final Task<Revision> article, long time, long transmittingTime) { - - logger.logMessage(Level.TRACE, - "Start Procssing Task\t" + article.toString()); - } - - /** - * No object - utility class - */ - private DiffConsumerLogMessages() { - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/SQLConsumerLogMessages.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/SQLConsumerLogMessages.java index 1f50e77f..2accff79 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/SQLConsumerLogMessages.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/logging/messages/consumer/SQLConsumerLogMessages.java @@ -27,68 +27,85 @@ /** * This class contains the english localized log messages for SQLConsumers. * <p> + * * @deprecated To be removed without replacement. */ // TODO: This file should be replaced with resource files. @Deprecated(since = "1.1", forRemoval = true) -public class SQLConsumerLogMessages { +public class SQLConsumerLogMessages +{ - /** - * Logs the processing of a diff task. - * - * @param logger reference to the logger - * @param diff reference to the task - * @param time time - */ - public static void logDiffProcessed(final Logger logger, - final Task<Diff> diff, final long time) { + /** + * Logs the processing of a diff task. + * + * @param logger + * reference to the logger + * @param diff + * reference to the task + * @param time + * time + */ + public static void logDiffProcessed(final Logger logger, final Task<Diff> diff, final long time) + { - logger.logMessage(Level.INFO, "Generated Entry\t" + Time.toClock(time) + "\t" + diff.toString()); - } + logger.logMessage(Level.INFO, + "Generated Entry\t" + Time.toClock(time) + "\t" + diff.toString()); + } - /** - * Logs the creation of an output file. - * - * @param logger reference to the logger - * @param path path of the output file - */ - public static void logFileCreation(final Logger logger, final String path) { + /** + * Logs the creation of an output file. + * + * @param logger + * reference to the logger + * @param path + * path of the output file + */ + public static void logFileCreation(final Logger logger, final String path) + { - logger.logMessage(Level.INFO, "New File created:\t" + path); - } + logger.logMessage(Level.INFO, "New File created:\t" + path); + } - /** - * Logs the occurrence of an OutOfMemoryError while reading a task. - * - * @param logger reference to the logger - * @param task reference to the revision task - * @param e reference to the error - */ - public static void logReadTaskOutOfMemoryError(final Logger logger, - final Task<Diff> task, final OutOfMemoryError e) { + /** + * Logs the occurrence of an OutOfMemoryError while reading a task. 
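// Illustrative usage sketch (not part of this patch): how a consumer might report an
// OutOfMemoryError through this helper. The 'logger' and 'diffTask' variables are
// assumed to exist in the calling SQLConsumer; they are not defined here.
try {
    // ... read and process the Task<Diff> ...
}
catch (OutOfMemoryError oom) {
    SQLConsumerLogMessages.logReadTaskOutOfMemoryError(logger, diffTask, oom);
}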
+ * + * @param logger + * reference to the logger + * @param task + * reference to the revision task + * @param e + * reference to the error + */ + public static void logReadTaskOutOfMemoryError(final Logger logger, final Task<Diff> task, + final OutOfMemoryError e) + { - if (task != null) { - logger.logError(Level.WARN, "Error while reading a task: " + task, e); - } else { - logger.logError(Level.WARN, "Error while reading an unknown task", e); + if (task != null) { + logger.logError(Level.WARN, "Error while reading a task: " + task, e); + } + else { + logger.logError(Level.WARN, "Error while reading an unknown task", e); + } } - } - /** - * Logs the occurrence of an SqlConsumerException. - * - * @param logger reference to the logger - * @param e reference to the exception - */ - public static void logSQLConsumerException(final Logger logger, - final SQLConsumerException e) { + /** + * Logs the occurrence of an SqlConsumerException. + * + * @param logger + * reference to the logger + * @param e + * reference to the exception + */ + public static void logSQLConsumerException(final Logger logger, final SQLConsumerException e) + { - logger.logException(Level.ERROR, "SQLConsumerException", e); - } + logger.logException(Level.ERROR, "SQLConsumerException", e); + } - /** - * No object - utility class - */ - private SQLConsumerLogMessages() { - } + /** + * No object - utility class + */ + private SQLConsumerLogMessages() + { + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/LetterNode.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/LetterNode.java index 7949d0a1..2bc69451 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/LetterNode.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/LetterNode.java @@ -22,119 +22,134 @@ /** * LetterNode This node represents a node of the keyword tree. * - * @param <V> contained value + * @param <V> + * contained value */ -public class LetterNode<V> { - - /** - * Alphabetic index of successor nodes - */ - private final HashMap<Character, LetterNode<V>> nodes; - - /** - * Flag, whether this node contains a valid key or not - */ - private boolean isKeyword; - - /** - * Contained keyword - */ - private final String word; - - /** - * Contained value - related to the keyword - */ - private V value; - - /** - * (Constructor) Creates a empty LetterNode. - */ - public LetterNode() { - this.nodes = new HashMap<>(); - this.isKeyword = false; - this.word = ""; - } - - /** - * (Constructor) Creates a LetterNode with a keyword. - * - * @param word keyword - */ - public LetterNode(final String word) { - this.nodes = new HashMap<>(); - this.isKeyword = false; - this.word = word; - } - - /** - * Adds a word and its related value. - * - * @param word keyword - * @param value related value - */ - public void add(final String word, final V value) { - - char c = word.charAt(0); - - LetterNode<V> node = get(c); - if (node == null) { - node = new LetterNode<>(this.word + c); +public class LetterNode<V> +{ + + /** + * Alphabetic index of successor nodes + */ + private final HashMap<Character, LetterNode<V>> nodes; + + /** + * Flag, whether this node contains a valid key or not + */ + private boolean isKeyword; + + /** + * Contained keyword + */ + private final String word; + + /** + * Contained value - related to the keyword + */ + private V value; + + /** + * (Constructor) Creates a empty LetterNode. 
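// Illustrative usage sketch (not part of this patch), based only on the public API of
// this class: building a tiny keyword trie and walking it character by character.
LetterNode<Integer> root = new LetterNode<>();
root.add("cat", 1);
root.add("car", 2);

LetterNode<Integer> node = root;
for (char c : "car".toCharArray()) {
    node = node.get(c); // returns null if no successor node exists for c
}
// here: node.isKeyword() == true, node.getValue() == 2, node.getWord() == "car"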
+ */ + public LetterNode() + { + this.nodes = new HashMap<>(); + this.isKeyword = false; + this.word = ""; } - this.nodes.put(c, node); - if (word.length() == 1) { - node.isKeyword = true; - node.value = value; - return; + /** + * (Constructor) Creates a LetterNode with a keyword. + * + * @param word + * keyword + */ + public LetterNode(final String word) + { + this.nodes = new HashMap<>(); + this.isKeyword = false; + this.word = word; } - node.add(word.substring(1), value); - } - - /** - * Returns the keyword. - * - * @return keyword - */ - public String getWord() { - return this.word; - } - - /** - * Returns the related value. - * - * @return related value - */ - public V getValue() { - return this.value; - } - - /** - * Returns the specified successor node. - * - * @param c character - * @return successor node or NULL if the specified node is not available - */ - public LetterNode<V> get(char c) { - return this.nodes.get(c); - } - - /** - * Checks whether the specified successor node is contained. - * - * @param c character - * @return TRUE | FALSE - */ - public boolean contains(char c) { - return this.nodes.containsKey(c); - } - - /** - * Returns whether this node contains a keyword or not. - * - * @return TRUE | FALSE - */ - public boolean isKeyword() { - return this.isKeyword; - } + /** + * Adds a word and its related value. + * + * @param word + * keyword + * @param value + * related value + */ + public void add(final String word, final V value) + { + + char c = word.charAt(0); + + LetterNode<V> node = get(c); + if (node == null) { + node = new LetterNode<>(this.word + c); + } + this.nodes.put(c, node); + + if (word.length() == 1) { + node.isKeyword = true; + node.value = value; + return; + } + + node.add(word.substring(1), value); + } + + /** + * Returns the keyword. + * + * @return keyword + */ + public String getWord() + { + return this.word; + } + + /** + * Returns the related value. + * + * @return related value + */ + public V getValue() + { + return this.value; + } + + /** + * Returns the specified successor node. + * + * @param c + * character + * @return successor node or NULL if the specified node is not available + */ + public LetterNode<V> get(char c) + { + return this.nodes.get(c); + } + + /** + * Checks whether the specified successor node is contained. + * + * @param c + * character + * @return TRUE | FALSE + */ + public boolean contains(char c) + { + return this.nodes.containsKey(c); + } + + /** + * Returns whether this node contains a keyword or not. + * + * @return TRUE | FALSE + */ + public boolean isKeyword() + { + return this.isKeyword; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MathUtilities.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MathUtilities.java index d8193bce..bdc4075d 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MathUtilities.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MathUtilities.java @@ -20,85 +20,99 @@ /** * Mathematical functions */ -public class MathUtilities { +public class MathUtilities +{ - /** - * No object - utility class - */ - private MathUtilities() { - } + /** + * No object - utility class + */ + private MathUtilities() + { + } - /** - * Rounds the given number to a precision of two after digit numbers. 
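// Illustrative usage sketch (not part of this patch): note that these helpers truncate
// to two decimal places rather than rounding half up.
double r = MathUtilities.round(3.149);          // 3.14 (truncated, not 3.15)
double p = MathUtilities.percentDiv(1.0, 3.0);  // 33.33
String s = MathUtilities.percentFrom(1.0, 3.0); // "33.33%"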
- * - * @param v number - * @return rounded number - */ - public static double round(final double v) { - return ((long) (v * 100.)) / 100.; - } + /** + * Rounds the given number to a precision of two after digit numbers. + * + * @param v + * number + * @return rounded number + */ + public static double round(final double v) + { + return ((long) (v * 100.)) / 100.; + } - /** - * Rounds the result of a / (a + b) to a precision of two after digit - * numbers. - * - * @param a value a - * @param b value b - * @return xx.xx - */ - public static double percentPlus(final double a, final double b) { - return round((double) a / (double) (a + b)); - } + /** + * Rounds the result of a / (a + b) to a precision of two after digit numbers. + * + * @param a + * value a + * @param b + * value b + * @return xx.xx + */ + public static double percentPlus(final double a, final double b) + { + return round((double) a / (double) (a + b)); + } - /** - * Rounds the result of a / (a + b) to a precision of two after digit - * numbers. - * - * @param a value a - * @param b value b - * @return xx.xx - */ - public static double percRoundPlus(final double a, final double b) { - return ((long) ((a / (a + b)) * 10000) / 100.); - } + /** + * Rounds the result of a / (a + b) to a precision of two after digit numbers. + * + * @param a + * value a + * @param b + * value b + * @return xx.xx + */ + public static double percRoundPlus(final double a, final double b) + { + return ((long) ((a / (a + b)) * 10000) / 100.); + } - /** - * Rounds the result of a / b to a precision of two after digit numbers. - * - * @param a value a - * @param b value b - * @return xx.xx - */ - public static double percentDiv(final double a, final double b) { - return ((long) ((a / b) * 10000) / 100.); - } + /** + * Rounds the result of a / b to a precision of two after digit numbers. + * + * @param a + * value a + * @param b + * value b + * @return xx.xx + */ + public static double percentDiv(final double a, final double b) + { + return ((long) ((a / b) * 10000) / 100.); + } - /** - * Returns the result of (a / b) as a percentage string - * - * @param a value a - * @param b value b - * @return xx.xx% - */ - public static String percentFrom(final double a, final double b) { + /** + * Returns the result of (a / b) as a percentage string + * + * @param a + * value a + * @param b + * value b + * @return xx.xx% + */ + public static String percentFrom(final double a, final double b) + { - double bVal = b; + double bVal = b; - if (bVal == 0.) { - bVal = 1.; - } + if (bVal == 0.) 
{ + bVal = 1.; + } - StringBuilder rep = new StringBuilder(); - double d = ((long) ((a / bVal) * 10000) / 100.); - if (d < 10.0) { - rep.append('0'); - } + StringBuilder rep = new StringBuilder(); + double d = ((long) ((a / bVal) * 10000) / 100.); + if (d < 10.0) { + rep.append('0'); + } - rep.append(d); - while (rep.length() < 5) { - rep.append('0'); - } + rep.append(d); + while (rep.length() < 5) { + rep.append('0'); + } - return rep + "%"; - } + return rep + "%"; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MultipleKeywordTree.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MultipleKeywordTree.java index 7662b1d1..a6ba400a 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MultipleKeywordTree.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/MultipleKeywordTree.java @@ -21,100 +21,109 @@ import java.util.List; /** - * This class represents a keyword tree and is used to process or to search a - * character sequence. + * This class represents a keyword tree and is used to process or to search a character sequence. * <p> * This keyword tree can be used for overlapping keywords. * - * @param <V> related value + * @param <V> + * related value */ -public class MultipleKeywordTree<V> { - - /** - * Reference to the root - */ - private final LetterNode<V> root; - - /** - * List of current nodes - */ - private List<LetterNode<V>> currentList; - - /** - * List of successor nodes - */ - private final List<LetterNode<V>> hits; - - /** - * Creates an empty MultipleKeywordTree object. - */ - public MultipleKeywordTree() { - root = new LetterNode<>(); - this.currentList = new ArrayList<>(); - this.hits = new ArrayList<>(); - this.currentList.add(root); - reset(); - } - - /** - * Adds a keyword and its related value. - * - * @param s keyword - * @param value related value - */ - public void addKeyword(final String s, final V value) { - root.add(s, value); - } - - /** - * Checks whether the character is related to one of the current nodes (the - * root node is always a current node). - * <p> - * After the comparison the list of current nodes will be replaced. - * - * @param c character - * @return TRUE if successor nodes could be identified FALSE otherwise - */ - public boolean check(final char c) { - - List<LetterNode<V>> newList = new ArrayList<>(); - newList.add(root); - - LetterNode<V> current; - hits.clear(); - - int size = this.currentList.size(); - for (int i = 0; i < size; i++) { - current = this.currentList.get(i); - - current = current.get(c); - if (current != null) { - newList.add(current); - - if (current.isKeyword()) { - hits.add(current); +public class MultipleKeywordTree<V> +{ + + /** + * Reference to the root + */ + private final LetterNode<V> root; + + /** + * List of current nodes + */ + private List<LetterNode<V>> currentList; + + /** + * List of successor nodes + */ + private final List<LetterNode<V>> hits; + + /** + * Creates an empty MultipleKeywordTree object. + */ + public MultipleKeywordTree() + { + root = new LetterNode<>(); + this.currentList = new ArrayList<>(); + this.hits = new ArrayList<>(); + this.currentList.add(root); + reset(); + } + + /** + * Adds a keyword and its related value. 
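// Illustrative usage sketch (not part of this patch): scanning a character stream for
// keywords that may overlap, using only the API shown in this class.
MultipleKeywordTree<String> tree = new MultipleKeywordTree<>();
tree.addKeyword("he", "HE");
tree.addKeyword("hello", "HELLO");

String input = "say hello";
for (int i = 0; i < input.length(); i++) {
    if (tree.check(input.charAt(i))) {
        for (LetterNode<String> hit : tree.getHits()) {
            System.out.println(hit.getWord() + " -> " + hit.getValue()); // "he", then "hello"
        }
    }
}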
+ * + * @param s + * keyword + * @param value + * related value + */ + public void addKeyword(final String s, final V value) + { + root.add(s, value); + } + + /** + * Checks whether the character is related to one of the current nodes (the root node is always + * a current node). + * <p> + * After the comparison the list of current nodes will be replaced. + * + * @param c + * character + * @return TRUE if successor nodes could be identified FALSE otherwise + */ + public boolean check(final char c) + { + + List<LetterNode<V>> newList = new ArrayList<>(); + newList.add(root); + + LetterNode<V> current; + hits.clear(); + + int size = this.currentList.size(); + for (int i = 0; i < size; i++) { + current = this.currentList.get(i); + + current = current.get(c); + if (current != null) { + newList.add(current); + + if (current.isKeyword()) { + hits.add(current); + } + } } - } + + this.currentList = newList; + return !hits.isEmpty(); + } + + /** + * Resets the list of current node to only contain the root node. + */ + public void reset() + { + this.currentList.clear(); + this.currentList.add(root); } - this.currentList = newList; - return !hits.isEmpty(); - } - - /** - * Resets the list of current node to only contain the root node. - */ - public void reset() { - this.currentList.clear(); - this.currentList.add(root); - } - - /** - * Returns the list of successor nodes. - * - * @return list of successor nodes - */ - public List<LetterNode<V>> getHits() { - return this.hits; - } + /** + * Returns the list of successor nodes. + * + * @return list of successor nodes + */ + public List<LetterNode<V>> getHits() + { + return this.hits; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/SingleKeywordTree.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/SingleKeywordTree.java index 89b6143a..3bfa8287 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/SingleKeywordTree.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/SingleKeywordTree.java @@ -18,81 +18,91 @@ package org.dkpro.jwpl.revisionmachine.common.util; /** - * This class represents a keyword tree and is used to process or to search a - * character sequence. + * This class represents a keyword tree and is used to process or to search a character sequence. * <p> * This keyword tree can only be used for non overlapping keywords. * - * @param <V> related value + * @param <V> + * related value */ -public class SingleKeywordTree<V> { +public class SingleKeywordTree<V> +{ - /** - * Reference to the root - */ - private final LetterNode<V> root; + /** + * Reference to the root + */ + private final LetterNode<V> root; - /** - * Reference to the current node - */ - private LetterNode<V> current; + /** + * Reference to the current node + */ + private LetterNode<V> current; - /** - * Creates an empty SingleKeywordTree object. - */ - public SingleKeywordTree() { - root = new LetterNode<>(); - reset(); - } + /** + * Creates an empty SingleKeywordTree object. + */ + public SingleKeywordTree() + { + root = new LetterNode<>(); + reset(); + } - /** - * Adds a keyword and its related value. - * - * @param s keyword - * @param value related value - */ - public void addKeyword(final String s, final V value) { - root.add(s, value); - } + /** + * Adds a keyword and its related value. 
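// Illustrative usage sketch (not part of this patch): matching non-overlapping keywords
// in a character stream; the keyword and input are made up for the example.
SingleKeywordTree<Integer> tree = new SingleKeywordTree<>();
tree.addKeyword("<page>", 1);

String input = "...<page>...";
for (int i = 0; i < input.length(); i++) {
    if (tree.check(input.charAt(i))) {
        System.out.println("matched " + tree.getWord() + " at " + i); // getValue() == 1
        tree.reset();
    }
}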
+ * + * @param s + * keyword + * @param value + * related value + */ + public void addKeyword(final String s, final V value) + { + root.add(s, value); + } - /** - * Checks whether the character is related to the currently used node. If - * the comparison fails the keyword tree will be reseted to its root node, - * otherwise the related node will replace the current node. - * - * @param c character - * @return TRUE if the current node contains a keyword FALSE otherwise - */ - public boolean check(final char c) { - current = current.get(c); - if (current == null) { - reset(); + /** + * Checks whether the character is related to the currently used node. If the comparison fails + * the keyword tree will be reseted to its root node, otherwise the related node will replace + * the current node. + * + * @param c + * character + * @return TRUE if the current node contains a keyword FALSE otherwise + */ + public boolean check(final char c) + { + current = current.get(c); + if (current == null) { + reset(); + } + return current.isKeyword(); } - return current.isKeyword(); - } - /** - * Resets the current node with the root node. - */ - public void reset() { - this.current = root; - } + /** + * Resets the current node with the root node. + */ + public void reset() + { + this.current = root; + } - /** - * Returns the keyword of the current node. - * - * @return keyword - */ - public String getWord() { - return this.current.getWord(); - } + /** + * Returns the keyword of the current node. + * + * @return keyword + */ + public String getWord() + { + return this.current.getWord(); + } - /** - * Returns the related value of the current node. - * - * @return related value - */ - public V getValue() { - return this.current.getValue(); - } + /** + * Returns the related value of the current node. + * + * @return related value + */ + public V getValue() + { + return this.current.getValue(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Surrogates.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Surrogates.java index a30e6bc0..54df2645 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Surrogates.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Surrogates.java @@ -20,58 +20,64 @@ /** * This utility class contains some surrogate related methods. */ -public class Surrogates { +public class Surrogates +{ - /** - * No object - utility class - */ - private Surrogates() { - } + /** + * No object - utility class + */ + private Surrogates() + { + } + + /** + * Returns whether a surrogate character was contained in the specified input. + * + * @param input + * input + * @return if a surrogate character was contained or not + */ + public static boolean scan(final char[] input) + { - /** - * Returns whether a surrogate character was contained in the specified - * input. 
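// Illustrative usage sketch (not part of this patch): both halves of a surrogate pair
// lie in the 0xD800-0xDFFF range, so an emoji is replaced by two '?' characters.
char[] text = "A\uD83D\uDE00B".toCharArray();   // 'A', high surrogate, low surrogate, 'B'
boolean dirty = Surrogates.scan(text);          // true
char[] cleaned = Surrogates.replace(text);      // ['A', '?', '?', 'B']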
- * - * @param input input - * @return if a surrogate character was contained or not - */ - public static boolean scan(final char[] input) { + int surLow = 0xD800; + int surHgh = 0xDFFF; - int surLow = 0xD800; - int surHgh = 0xDFFF; + int end = input.length; + for (int i = 0; i < end; i++) { + if ((int) input[i] >= surLow && input[i] <= surHgh) { + return true; + } + } - int end = input.length; - for (int i = 0; i < end; i++) { - if ((int) input[i] >= surLow && input[i] <= surHgh) { - return true; - } + return false; } - return false; - } + /** + * Replaces all surrogates characters with '?'. + * + * @param input + * input + * @return input with '?' instead of surrogates characters + */ + public static char[] replace(final char[] input) + { - /** - * Replaces all surrogates characters with '?'. - * - * @param input input - * @return input with '?' instead of surrogates characters - */ - public static char[] replace(final char[] input) { + int surLow = 0xD800; + int surHgh = 0xDFFF; - int surLow = 0xD800; - int surHgh = 0xDFFF; + int end = input.length; + char[] output = new char[end]; - int end = input.length; - char[] output = new char[end]; + for (int i = 0; i < end; i++) { + if ((int) input[i] >= surLow && input[i] <= surHgh) { + output[i] = '?'; + } + else { + output[i] = input[i]; + } + } - for (int i = 0; i < end; i++) { - if ((int) input[i] >= surLow && input[i] <= surHgh) { - output[i] = '?'; - } else { - output[i] = input[i]; - } + return output; } - - return output; - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Time.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Time.java index 5517844f..35a0dbd9 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Time.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/Time.java @@ -20,174 +20,181 @@ /** * This class transform milliseconds to a clock representation. * <p> - * A clock representation describes the time (HH:MM:SS:sss) and is used for - * measuring the processing times. + * A clock representation describes the time (HH:MM:SS:sss) and is used for measuring the processing + * times. */ -public class Time { - - /** - * Weeks - */ - private final short weeks; - - /** - * Days - */ - private final short days; - - /** - * Hours - */ - private final short hours; - - /** - * Minutes - */ - private final short minutes; - - /** - * Seconds - */ - private final short seconds; - - /** - * Milliseconds - */ - private final short milliseconds; - - /** - * (Constructor) Creates a new time information transforming the millisecond - * value into a clock representation. - * - * @param time milliseconds - */ - public Time(final long time) { - - long ttime = time; - - this.milliseconds = (short) (ttime % 1000); - ttime = ttime / 1000; - - this.seconds = (short) (ttime % 60); - ttime = ttime / 60; - - this.minutes = (short) (ttime % 60); - ttime = ttime / 60; - - this.hours = (short) (ttime % 24); - ttime = ttime / 24; - - this.days = (short) (ttime % 7); - this.weeks = (short) (ttime / 7); - } - - /** - * Returns the textual description of the time value. 
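// Illustrative usage sketch (not part of this patch): 1 h, 2 min, 3 s and 4 ms expressed
// in milliseconds; toClock() yields HH:MM:SS.sss, toString() a (German) text form.
String clock = Time.toClock(3723004L);           // "1:02:03.004"
String sameClock = new Time(3723004L).toClock(); // identical output via the instance API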
- */ - public String toString() { - StringBuilder s = new StringBuilder(); - - boolean appended = false; - if (this.weeks != 0 || appended) { - appended = true; - s.append(this.weeks + " Wochen "); +public class Time +{ + + /** + * Weeks + */ + private final short weeks; + + /** + * Days + */ + private final short days; + + /** + * Hours + */ + private final short hours; + + /** + * Minutes + */ + private final short minutes; + + /** + * Seconds + */ + private final short seconds; + + /** + * Milliseconds + */ + private final short milliseconds; + + /** + * (Constructor) Creates a new time information transforming the millisecond value into a clock + * representation. + * + * @param time + * milliseconds + */ + public Time(final long time) + { + + long ttime = time; + + this.milliseconds = (short) (ttime % 1000); + ttime = ttime / 1000; + + this.seconds = (short) (ttime % 60); + ttime = ttime / 60; + + this.minutes = (short) (ttime % 60); + ttime = ttime / 60; + + this.hours = (short) (ttime % 24); + ttime = ttime / 24; + + this.days = (short) (ttime % 7); + this.weeks = (short) (ttime / 7); } - if (this.days != 0 || appended) { - appended = true; - s.append(this.days + " Tage "); - } - if (this.hours != 0 || appended) { - appended = true; - s.append(this.hours + " Stunden "); - } - if (this.minutes != 0 || appended) { - appended = true; - s.append(this.minutes + " Minuten "); - } - if (this.seconds != 0 || appended) { - appended = true; - s.append(this.seconds + " Sekunden "); - } - if (this.milliseconds != 0 || appended) { - s.append(this.milliseconds + " Milisekunden"); - } - - return s.toString(); - } - - /** - * Returns the clock description of the time value. - */ - public String toClock() { - StringBuilder s = new StringBuilder(); - s.append(((this.weeks * 7 + this.days) * 24 + this.hours) + ":"); - if (this.minutes < 10) { - s.append('0'); + /** + * Returns the textual description of the time value. + */ + public String toString() + { + StringBuilder s = new StringBuilder(); + + boolean appended = false; + if (this.weeks != 0 || appended) { + appended = true; + s.append(this.weeks + " Wochen "); + } + if (this.days != 0 || appended) { + appended = true; + s.append(this.days + " Tage "); + } + if (this.hours != 0 || appended) { + appended = true; + s.append(this.hours + " Stunden "); + } + if (this.minutes != 0 || appended) { + appended = true; + s.append(this.minutes + " Minuten "); + } + if (this.seconds != 0 || appended) { + appended = true; + s.append(this.seconds + " Sekunden "); + } + if (this.milliseconds != 0 || appended) { + s.append(this.milliseconds + " Milisekunden"); + } + + return s.toString(); } - s.append(this.minutes + ":"); - if (this.seconds < 10) { - s.append('0'); - } - s.append(this.seconds + "."); - if (this.milliseconds < 100) { - s.append('0'); - } - if (this.milliseconds < 10) { - s.append('0'); + + /** + * Returns the clock description of the time value. + */ + public String toClock() + { + StringBuilder s = new StringBuilder(); + + s.append(((this.weeks * 7 + this.days) * 24 + this.hours) + ":"); + if (this.minutes < 10) { + s.append('0'); + } + s.append(this.minutes + ":"); + if (this.seconds < 10) { + s.append('0'); + } + s.append(this.seconds + "."); + if (this.milliseconds < 100) { + s.append('0'); + } + if (this.milliseconds < 10) { + s.append('0'); + } + s.append(this.milliseconds); + + return s.toString(); } - s.append(this.milliseconds); - return s.toString(); - } + /** + * Transforms a millisecond value to the clock representation. 
+ * + * @param time + * milliseconds + * @return clock representation + */ + public static String toClock(long time) + { - /** - * Transforms a millisecond value to the clock representation. - * - * @param time milliseconds - * @return clock representation - */ - public static String toClock(long time) { + long ttime = time; - long ttime = time; + short miliseconds = (short) (ttime % 1000); + ttime = ttime / 1000; - short miliseconds = (short) (ttime % 1000); - ttime = ttime / 1000; + short seconds = (short) (ttime % 60); + ttime = ttime / 60; - short seconds = (short) (ttime % 60); - ttime = ttime / 60; + short minutes = (short) (ttime % 60); + ttime = ttime / 60; - short minutes = (short) (ttime % 60); - ttime = ttime / 60; + short hours = (short) (ttime % 24); + ttime = ttime / 24; - short hours = (short) (ttime % 24); - ttime = ttime / 24; + short days = (short) (ttime % 7); + short weeks = (short) (ttime / 7); - short days = (short) (ttime % 7); - short weeks = (short) (ttime / 7); + StringBuilder s = new StringBuilder(); - StringBuilder s = new StringBuilder(); + s.append(((weeks * 7 + days) * 24 + hours) + ":"); - s.append(((weeks * 7 + days) * 24 + hours) + ":"); + if (minutes < 10) { + s.append('0'); + } + s.append(minutes + ":"); - if (minutes < 10) { - s.append('0'); - } - s.append(minutes + ":"); + if (seconds < 10) { + s.append('0'); + } + s.append(seconds + "."); - if (seconds < 10) { - s.append('0'); - } - s.append(seconds + "."); + if (miliseconds < 100) { + s.append('0'); + } + if (miliseconds < 10) { + s.append('0'); + } + s.append(miliseconds); - if (miliseconds < 100) { - s.append('0'); - } - if (miliseconds < 10) { - s.append('0'); + return s.toString(); } - s.append(miliseconds); - - return s.toString(); - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLKeys.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLKeys.java index baa84cf8..ca130d96 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLKeys.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLKeys.java @@ -20,144 +20,147 @@ /** * This class contains all keys for wikipedia dump files. 
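// Illustrative usage sketch (not part of this patch): these keys can be loaded into one
// of the keyword trees above; whether the dump parser does exactly this is an assumption.
SingleKeywordTree<WikipediaXMLKeys> keywords = new SingleKeywordTree<>();
for (WikipediaXMLKeys key : WikipediaXMLKeys.values()) {
    keywords.addKeyword(key.getKeyword(), key);
}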
*/ -public enum WikipediaXMLKeys { - - /** - * Indicates the start of a page - */ - KEY_START_PAGE("<page>"), - - /** - * Indicates the end of a page - */ - KEY_END_PAGE("</page>"), - - /** - * Indicates the start of a title - */ - KEY_START_TITLE("<title>"), - - /** - * Indicates the end of a title - */ - KEY_END_TITLE(""), - - /** - * Indicates the start of an id - */ - KEY_START_ID(""), - - /** - * Indicates the end of an id - */ - KEY_END_ID(""), - - /** - * Indicates the start of a revision - */ - KEY_START_REVISION(""), - - /** - * Indicates the end of a revision - */ - KEY_END_REVISION(""), - - /** - * Indicates the start of a comment - */ - KEY_START_COMMENT(""), - - /** - * Indicates the end of a comment - */ - KEY_END_COMMENT(""), - - /** - * Indicates the start of the contributor ip - */ - KEY_START_IP(""), - - /** - * Indicates the end of the contributor ip - */ - KEY_END_IP(""), - - /** - * Indicates the start of the the contributor username - */ - KEY_START_USERNAME(""), - - /** - * Indicates the end of the contributor username - */ - KEY_END_USERNAME(""), - - /** - * Indicates the start of a timestamp - */ - KEY_START_TIMESTAMP(""), - - /** - * Indicates the end of a timestamp - */ - KEY_END_TIMESTAMP(""), - - /** - * Indicates the start of the contributor info - */ - KEY_START_CONTRIBUTOR(""), - - /** - * Indicates the end of the contributor info - */ - KEY_END_CONTRIBUTOR(""), - - /** - * Indicates the start of the namespace block - */ - KEY_START_NAMESPACES(""), - - /** - * Indicates the end of the namespace block - */ - KEY_END_NAMESPACES(""), - - /** - * Indicates the start of a text segment - */ - KEY_START_TEXT(""), - - /** - * Indicates the end of a text segment - */ - KEY_END_TEXT(""), - - /** - * Indicates that the revision is a minor revision - */ - KEY_MINOR_FLAG(""); - - - /** - * Keyword related to the key - */ - private final String keyword; - - /** - * Creates an enumerator with the speciefied keyword - * - * @param keyword keyword - */ - WikipediaXMLKeys(final String keyword) { - this.keyword = keyword; - } - - /** - * Returns the keyword - * - * @return keyword - */ - public String getKeyword() { - return this.keyword; - } +public enum WikipediaXMLKeys +{ + + /** + * Indicates the start of a page + */ + KEY_START_PAGE(""), + + /** + * Indicates the end of a page + */ + KEY_END_PAGE(""), + + /** + * Indicates the start of a title + */ + KEY_START_TITLE(""), + + /** + * Indicates the end of a title + */ + KEY_END_TITLE(""), + + /** + * Indicates the start of an id + */ + KEY_START_ID(""), + + /** + * Indicates the end of an id + */ + KEY_END_ID(""), + + /** + * Indicates the start of a revision + */ + KEY_START_REVISION(""), + + /** + * Indicates the end of a revision + */ + KEY_END_REVISION(""), + + /** + * Indicates the start of a comment + */ + KEY_START_COMMENT(""), + + /** + * Indicates the end of a comment + */ + KEY_END_COMMENT(""), + + /** + * Indicates the start of the contributor ip + */ + KEY_START_IP(""), + + /** + * Indicates the end of the contributor ip + */ + KEY_END_IP(""), + + /** + * Indicates the start of the the contributor username + */ + KEY_START_USERNAME(""), + + /** + * Indicates the end of the contributor username + */ + KEY_END_USERNAME(""), + + /** + * Indicates the start of a timestamp + */ + KEY_START_TIMESTAMP(""), + + /** + * Indicates the end of a timestamp + */ + KEY_END_TIMESTAMP(""), + + /** + * Indicates the start of the contributor info + */ + KEY_START_CONTRIBUTOR(""), + + /** + * Indicates the end of the contributor 
info + */ + KEY_END_CONTRIBUTOR(""), + + /** + * Indicates the start of the namespace block + */ + KEY_START_NAMESPACES(""), + + /** + * Indicates the end of the namespace block + */ + KEY_END_NAMESPACES(""), + + /** + * Indicates the start of a text segment + */ + KEY_START_TEXT(""), + + /** + * Indicates the end of a text segment + */ + KEY_END_TEXT(""), + + /** + * Indicates that the revision is a minor revision + */ + KEY_MINOR_FLAG(""); + + /** + * Keyword related to the key + */ + private final String keyword; + + /** + * Creates an enumerator with the speciefied keyword + * + * @param keyword + * keyword + */ + WikipediaXMLKeys(final String keyword) + { + this.keyword = keyword; + } + + /** + * Returns the keyword + * + * @return keyword + */ + public String getKeyword() + { + return this.keyword; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLWriter.java index f14042ac..126c43a3 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/common/util/WikipediaXMLWriter.java @@ -33,279 +33,297 @@ import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.info.ArticleInformation; /** - * The WikipediaXMLWriter writes xml representations of task objects to an - * output file. + * The WikipediaXMLWriter writes xml representations of task objects to an output file. *

* This class is used for debug purposes. */ -public class WikipediaXMLWriter { - - /** - * Reference to the writer - */ - private final OutputStreamWriter writer; - - /** - * Creates a WikipediaXMLWriter object. - * - * @param path path of the output file - * @throws IOException if an error occurs while writing the output - */ - public WikipediaXMLWriter(final String path) throws IOException { - this.writer = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(path)), - StandardCharsets.UTF_8); - } - - /** - * Writes the diff task to the output using wikipedia xml notation. - * - * @param diff Reference to a diff task - * @throws IOException if an error occurs while writing the output - */ - public void writeDiff(final Task diff) throws IOException { - writeDiff(diff, 0); - } - - /** - * Writes a part of the diff task, starting with the given element, to the - * output using wikipedia xml notation. - * - * @param diff Reference to a diff task - * @param start Position of the start element - * @throws IOException if an error occurs while writing the output - */ - public void writeDiff(final Task diff, final int start) throws IOException { - - int size = diff.size(); - Diff d; - String previousRevision = null, currentRevision; - - this.writer.write(WikipediaXMLKeys.KEY_START_PAGE.getKeyword() + "\r\n"); - - ArticleInformation header = diff.getHeader(); - - this.writer.write("\t" + WikipediaXMLKeys.KEY_START_TITLE.getKeyword()); - this.writer.write(header.getArticleName()); - this.writer.write(WikipediaXMLKeys.KEY_END_TITLE.getKeyword() + "\r\n"); - - this.writer.write("\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); - this.writer.write(Integer.toString(header.getArticleId())); - this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); - - this.writer.write("\t"); - this.writer.write(Integer.toString(diff.getPartCounter())); - this.writer.write("\r\n"); - - for (int i = start; i < size; i++) { - d = diff.get(i); - currentRevision = d.buildRevision(previousRevision); - - this.writer.write("\t" + WikipediaXMLKeys.KEY_START_REVISION.getKeyword() + "\r\n"); - this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); - this.writer.write(Integer.toString(d.getRevisionID())); - this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); - - this.writer.write("\t\t"); - this.writer.write(Integer.toString(d.getRevisionCounter())); - this.writer.write("\r\n"); - - this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword()); - this.writer.write(d.getTimeStamp().toString()); - this.writer.write(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword() + "\r\n"); - - this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_TEXT.getKeyword()); - if (currentRevision != null) { - this.writer.write(currentRevision); - previousRevision = currentRevision; - } - this.writer.write(WikipediaXMLKeys.KEY_END_TEXT.getKeyword() + "\r\n"); - this.writer.write("\t" + WikipediaXMLKeys.KEY_END_REVISION.getKeyword() + "\r\n"); - +public class WikipediaXMLWriter +{ + + /** + * Reference to the writer + */ + private final OutputStreamWriter writer; + + /** + * Creates a WikipediaXMLWriter object. 
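// Illustrative usage sketch (not part of this patch): writing a diff task to a debug
// file; the output path is made up, and the Task<Diff> is assumed to come from the
// DiffTool pipeline.
void dumpForDebugging(Task<Diff> diffTask) throws IOException {
    WikipediaXMLWriter debugWriter = new WikipediaXMLWriter("debug/article-12.xml");
    try {
        debugWriter.writeDiff(diffTask);
    }
    finally {
        debugWriter.close();
    }
}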
+ * + * @param path + * path of the output file + * @throws IOException + * if an error occurs while writing the output + */ + public WikipediaXMLWriter(final String path) throws IOException + { + this.writer = new OutputStreamWriter(new BufferedOutputStream(new FileOutputStream(path)), + StandardCharsets.UTF_8); } - this.writer.write(WikipediaXMLKeys.KEY_END_PAGE.getKeyword() + "\r\n"); - this.writer.flush(); - } - - /** - * Writes the diff task to the output using an xml representation of the - * diff information. - * - * @param diff Reference to a diff task - * @throws IOException if an error occurs while writing the output - */ - public void writeDiffFile(final Task diff) throws IOException { - - int partsCount; - int size = diff.size(); - Diff d; - DiffPart p; - RevisionCodecData codec; + /** + * Writes the diff task to the output using wikipedia xml notation. + * + * @param diff + * Reference to a diff task + * @throws IOException + * if an error occurs while writing the output + */ + public void writeDiff(final Task diff) throws IOException + { + writeDiff(diff, 0); + } - this.writer.write(WikipediaXMLKeys.KEY_START_PAGE.getKeyword() + "\r\n"); + /** + * Writes a part of the diff task, starting with the given element, to the output using + * wikipedia xml notation. + * + * @param diff + * Reference to a diff task + * @param start + * Position of the start element + * @throws IOException + * if an error occurs while writing the output + */ + public void writeDiff(final Task diff, final int start) throws IOException + { + + int size = diff.size(); + Diff d; + String previousRevision = null, currentRevision; + + this.writer.write(WikipediaXMLKeys.KEY_START_PAGE.getKeyword() + "\r\n"); + + ArticleInformation header = diff.getHeader(); + + this.writer.write("\t" + WikipediaXMLKeys.KEY_START_TITLE.getKeyword()); + this.writer.write(header.getArticleName()); + this.writer.write(WikipediaXMLKeys.KEY_END_TITLE.getKeyword() + "\r\n"); + + this.writer.write("\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); + this.writer.write(Integer.toString(header.getArticleId())); + this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); - ArticleInformation header = diff.getHeader(); + this.writer.write("\t"); + this.writer.write(Integer.toString(diff.getPartCounter())); + this.writer.write("\r\n"); - this.writer.write("\t" + WikipediaXMLKeys.KEY_START_TITLE.getKeyword()); - this.writer.write(header.getArticleName()); - this.writer.write(WikipediaXMLKeys.KEY_END_TITLE.getKeyword() + "\r\n"); + for (int i = start; i < size; i++) { + d = diff.get(i); + currentRevision = d.buildRevision(previousRevision); - this.writer.write("\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); - this.writer.write(Integer.toString(header.getArticleId())); - this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); + this.writer.write("\t" + WikipediaXMLKeys.KEY_START_REVISION.getKeyword() + "\r\n"); + this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); + this.writer.write(Integer.toString(d.getRevisionID())); + this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); - this.writer.write("\t"); - this.writer.write(Integer.toString(diff.getPartCounter())); - this.writer.write("\r\n"); + this.writer.write("\t\t"); + this.writer.write(Integer.toString(d.getRevisionCounter())); + this.writer.write("\r\n"); - for (int i = 0; i < size; i++) { - d = diff.get(i); + this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword()); + 
this.writer.write(d.getTimeStamp().toString()); + this.writer.write(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword() + "\r\n"); - this.writer.write("\t" + WikipediaXMLKeys.KEY_START_REVISION.getKeyword() + "\r\n"); + this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_TEXT.getKeyword()); + if (currentRevision != null) { + this.writer.write(currentRevision); + previousRevision = currentRevision; + } + this.writer.write(WikipediaXMLKeys.KEY_END_TEXT.getKeyword() + "\r\n"); + this.writer.write("\t" + WikipediaXMLKeys.KEY_END_REVISION.getKeyword() + "\r\n"); - codec = d.getCodecData(); - if (!codec.isConverted()) { - codec.totalSizeInBits(); - } + } - this.writer.write("\t\t\r\n"); + this.writer.write(WikipediaXMLKeys.KEY_END_PAGE.getKeyword() + "\r\n"); + this.writer.flush(); + } - this.writer.write("\t\t\t" + codec.getBlocksizeS() + "\r\n"); - this.writer.write("\t\t\t" + codec.getBlocksizeE() + "\r\n"); - this.writer.write("\t\t\t" + codec.getBlocksizeB() + "\r\n"); - this.writer.write("\t\t\t" + codec.getBlocksizeL() + "\r\n"); + /** + * Writes the diff task to the output using an xml representation of the diff information. + * + * @param diff + * Reference to a diff task + * @throws IOException + * if an error occurs while writing the output + */ + public void writeDiffFile(final Task diff) throws IOException + { + + int partsCount; + int size = diff.size(); + Diff d; + DiffPart p; + RevisionCodecData codec; + + this.writer.write(WikipediaXMLKeys.KEY_START_PAGE.getKeyword() + "\r\n"); + + ArticleInformation header = diff.getHeader(); + + this.writer.write("\t" + WikipediaXMLKeys.KEY_START_TITLE.getKeyword()); + this.writer.write(header.getArticleName()); + this.writer.write(WikipediaXMLKeys.KEY_END_TITLE.getKeyword() + "\r\n"); + + this.writer.write("\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); + this.writer.write(Integer.toString(header.getArticleId())); + this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); - this.writer.write("\t\t\r\n"); + this.writer.write("\t"); + this.writer.write(Integer.toString(diff.getPartCounter())); + this.writer.write("\r\n"); - this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); - this.writer.write(Integer.toString(d.getRevisionID())); - this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); + for (int i = 0; i < size; i++) { + d = diff.get(i); - this.writer.write("\t\t"); - this.writer.write(Integer.toString(d.getRevisionCounter())); - this.writer.write("\r\n"); + this.writer.write("\t" + WikipediaXMLKeys.KEY_START_REVISION.getKeyword() + "\r\n"); - this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword()); - this.writer.write(d.getTimeStamp().toString()); - this.writer.write(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword() + "\r\n"); + codec = d.getCodecData(); + if (!codec.isConverted()) { + codec.totalSizeInBits(); + } - this.writer.write("\t\t\r\n"); - partsCount = d.size(); - for (int j = 0; j < partsCount; j++) { + this.writer.write("\t\t\r\n"); - p = d.get(j); - this.writer.write("\t\t\t\r\n"); + this.writer.write("\t\t\t" + codec.getBlocksizeS() + "\r\n"); + this.writer.write("\t\t\t" + codec.getBlocksizeE() + "\r\n"); + this.writer.write("\t\t\t" + codec.getBlocksizeB() + "\r\n"); + this.writer.write("\t\t\t" + codec.getBlocksizeL() + "\r\n"); - this.writer.write("\t\t\t\t" + p.getAction() + "\r\n"); - this.writer.write("\t\t\t\t" + p.getStart() + "\r\n"); - this.writer.write("\t\t\t\t" + p.getEnd() + "\r\n"); - if (p.getText() != null) { - this.writer.write("\t\t\t\t" + 
p.getText()); - this.writer.write("\r\n"); - } + this.writer.write("\t\t\r\n"); - this.writer.write("\t\t\t\r\n"); - } + this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); + this.writer.write(Integer.toString(d.getRevisionID())); + this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); - this.writer.write("\t\t\r\n"); - this.writer.write("\t" + WikipediaXMLKeys.KEY_END_REVISION.getKeyword() + "\r\n"); - } + this.writer.write("\t\t"); + this.writer.write(Integer.toString(d.getRevisionCounter())); + this.writer.write("\r\n"); - this.writer.write(WikipediaXMLKeys.KEY_END_PAGE.getKeyword() + "\r\n"); - this.writer.flush(); - } + this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword()); + this.writer.write(d.getTimeStamp().toString()); + this.writer.write(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword() + "\r\n"); - /** - * Writes the revision task to the output using wikipedia xml notation. - * - * @param task Reference to a revision task - * @throws IOException if an error occurs while writing the output - */ - public void writeRevision(final Task task) throws IOException { + this.writer.write("\t\t\r\n"); + partsCount = d.size(); + for (int j = 0; j < partsCount; j++) { - if (task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST - || task.getTaskType() == TaskTypes.TASK_FULL) { + p = d.get(j); + this.writer.write("\t\t\t\r\n"); - this.writer.write(WikipediaXMLKeys.KEY_START_PAGE.getKeyword() + "\r\n"); + this.writer.write("\t\t\t\t" + p.getAction() + "\r\n"); + this.writer.write("\t\t\t\t" + p.getStart() + "\r\n"); + this.writer.write("\t\t\t\t" + p.getEnd() + "\r\n"); + if (p.getText() != null) { + this.writer.write("\t\t\t\t" + p.getText()); + this.writer.write("\r\n"); + } - ArticleInformation header = task.getHeader(); + this.writer.write("\t\t\t\r\n"); + } - this.writer.write("\t" + WikipediaXMLKeys.KEY_START_TITLE.getKeyword()); - this.writer.write(header.getArticleName()); - this.writer.write(WikipediaXMLKeys.KEY_END_TITLE.getKeyword() + "\r\n"); + this.writer.write("\t\t\r\n"); + this.writer.write("\t" + WikipediaXMLKeys.KEY_END_REVISION.getKeyword() + "\r\n"); + } - this.writer.write("\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); - this.writer.write(Integer.toString(header.getArticleId())); - this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); + this.writer.write(WikipediaXMLKeys.KEY_END_PAGE.getKeyword() + "\r\n"); + this.writer.flush(); } - Revision rev; - Iterator revIt = task.iterator(); - while (revIt.hasNext()) { - - this.writer.write("\t" + WikipediaXMLKeys.KEY_START_REVISION.getKeyword() + "\r\n"); - rev = revIt.next(); - - this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); - this.writer.write(Integer.toString(rev.getRevisionID())); - this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); - - this.writer.write("\t\t"); - this.writer.write(Integer.toString(rev.getRevisionCounter())); - this.writer.write("\r\n"); - - this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword()); - this.writer.write(rev.getTimeStamp().toString()); - this.writer.write(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword() + "\r\n"); + /** + * Writes the revision task to the output using wikipedia xml notation. 
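// Illustrative usage sketch (not part of this patch): dumping a revision task in the
// same way; the output path is made up.
void dumpRevisions(Task<Revision> revisionTask) throws IOException {
    WikipediaXMLWriter revWriter = new WikipediaXMLWriter("debug/article-12-revisions.xml");
    try {
        revWriter.writeRevision(revisionTask);
    }
    finally {
        revWriter.close();
    }
}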
+ * + * @param task + * Reference to a revision task + * @throws IOException + * if an error occurs while writing the output + */ + public void writeRevision(final Task task) throws IOException + { - this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_CONTRIBUTOR.getKeyword()); - if (rev.contributorIsRegistered()) { - this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_USERNAME.getKeyword()); - this.writer.write(rev.getContributorName()); - this.writer.write(WikipediaXMLKeys.KEY_END_USERNAME.getKeyword() + "\r\n"); + if (task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST + || task.getTaskType() == TaskTypes.TASK_FULL) { - this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); - this.writer.write(rev.getContributorId()); - this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); - } else { - this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_IP.getKeyword()); - this.writer.write(rev.getContributorName()); - this.writer.write(WikipediaXMLKeys.KEY_END_IP.getKeyword() + "\r\n"); - } + this.writer.write(WikipediaXMLKeys.KEY_START_PAGE.getKeyword() + "\r\n"); - this.writer.write(WikipediaXMLKeys.KEY_END_CONTRIBUTOR.getKeyword() + "\r\n"); + ArticleInformation header = task.getHeader(); - if (rev.isMinor()) { - this.writer.write("\t\t" + WikipediaXMLKeys.KEY_MINOR_FLAG.getKeyword() + "\r\n"); - } + this.writer.write("\t" + WikipediaXMLKeys.KEY_START_TITLE.getKeyword()); + this.writer.write(header.getArticleName()); + this.writer.write(WikipediaXMLKeys.KEY_END_TITLE.getKeyword() + "\r\n"); - this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_COMMENT.getKeyword()); - this.writer.write(rev.getComment()); - this.writer.write(WikipediaXMLKeys.KEY_END_COMMENT.getKeyword() + "\r\n"); + this.writer.write("\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); + this.writer.write(Integer.toString(header.getArticleId())); + this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); + } + Revision rev; + Iterator revIt = task.iterator(); + while (revIt.hasNext()) { + + this.writer.write("\t" + WikipediaXMLKeys.KEY_START_REVISION.getKeyword() + "\r\n"); + rev = revIt.next(); + + this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); + this.writer.write(Integer.toString(rev.getRevisionID())); + this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); + + this.writer.write("\t\t"); + this.writer.write(Integer.toString(rev.getRevisionCounter())); + this.writer.write("\r\n"); + + this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword()); + this.writer.write(rev.getTimeStamp().toString()); + this.writer.write(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword() + "\r\n"); + + this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_CONTRIBUTOR.getKeyword()); + if (rev.contributorIsRegistered()) { + this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_USERNAME.getKeyword()); + this.writer.write(rev.getContributorName()); + this.writer.write(WikipediaXMLKeys.KEY_END_USERNAME.getKeyword() + "\r\n"); + + this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_ID.getKeyword()); + this.writer.write(rev.getContributorId()); + this.writer.write(WikipediaXMLKeys.KEY_END_ID.getKeyword() + "\r\n"); + } + else { + this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_IP.getKeyword()); + this.writer.write(rev.getContributorName()); + this.writer.write(WikipediaXMLKeys.KEY_END_IP.getKeyword() + "\r\n"); + } + + this.writer.write(WikipediaXMLKeys.KEY_END_CONTRIBUTOR.getKeyword() + "\r\n"); + + if (rev.isMinor()) { + this.writer.write("\t\t" 
+ WikipediaXMLKeys.KEY_MINOR_FLAG.getKeyword() + "\r\n"); + } + + this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_COMMENT.getKeyword()); + this.writer.write(rev.getComment()); + this.writer.write(WikipediaXMLKeys.KEY_END_COMMENT.getKeyword() + "\r\n"); + + this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_TEXT.getKeyword()); + if (rev.getRevisionText() != null) { + this.writer.write(rev.getRevisionText()); + } + this.writer.write(WikipediaXMLKeys.KEY_END_TEXT.getKeyword() + "\r\n"); + + this.writer.write("\t" + WikipediaXMLKeys.KEY_END_REVISION.getKeyword() + "\r\n"); + } - this.writer.write("\t\t" + WikipediaXMLKeys.KEY_START_TEXT.getKeyword()); - if (rev.getRevisionText() != null) { - this.writer.write(rev.getRevisionText()); - } - this.writer.write(WikipediaXMLKeys.KEY_END_TEXT.getKeyword() + "\r\n"); + if (task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST + || task.getTaskType() == TaskTypes.TASK_FULL) { - this.writer.write("\t" + WikipediaXMLKeys.KEY_END_REVISION.getKeyword() + "\r\n"); + this.writer.write(WikipediaXMLKeys.KEY_END_PAGE.getKeyword() + "\r\n"); + } + this.writer.flush(); } - if (task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST || task.getTaskType() == TaskTypes.TASK_FULL) { - - this.writer.write(WikipediaXMLKeys.KEY_END_PAGE.getKeyword() + "\r\n"); + /** + * Closes the writer. + * + * @throws IOException + * if an error occurred while closing the writer + */ + public void close() throws IOException + { + this.writer.close(); } - this.writer.flush(); - } - - /** - * Closes the writer. - * - * @throws IOException if an error occurred while closing the writer - */ - public void close() throws IOException { - this.writer.close(); - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffTool.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffTool.java index fbf05942..7986a455 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffTool.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffTool.java @@ -28,50 +28,58 @@ /** * This class contains the start method for the DiffTool application. */ -public class DiffTool { +public class DiffTool +{ - /** - * Starts the DiffTool application. - * - * @param args program arguments args[0] has to be the path to the - * configuration file - */ - public static void main(final String[] args) { + /** + * Starts the DiffTool application. + * + * @param args + * program arguments args[0] has to be the path to the configuration file + */ + public static void main(final String[] args) + { - if (args.length != 1) { - throw new IllegalArgumentException( - "Configuration File ist missing."); - } + if (args.length != 1) { + throw new IllegalArgumentException("Configuration File ist missing."); + } - try { + try { - // Reads the configuration - ConfigSettings config = readConfiguration(args[0]); - new DiffToolThread(config).run(); - } catch (Exception e) { - e.printStackTrace(); + // Reads the configuration + ConfigSettings config = readConfiguration(args[0]); + new DiffToolThread(config).run(); + } + catch (Exception e) { + e.printStackTrace(); + } } - } - /** - * Reads and parses the configuration file. 
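// Illustrative usage sketch (not part of this patch): the application is started with a
// single argument, the path to the DiffTool configuration file (the path here is made up).
// main() reads and parses that file and then runs a DiffToolThread with the resulting
// ConfigSettings.
DiffTool.main(new String[] { "config/difftool-config.xml" });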
- * - * @param path path to the configuration file - * @return ConfigurationSettings - * @throws IOException if an error occurred while reading the configuration file - * @throws SAXException if an error occurred while using the xml parser - * @throws ParserConfigurationException if the initialization of the xml parser failed - */ - private static ConfigSettings readConfiguration(final String path) - throws IOException, SAXException, ParserConfigurationException { + /** + * Reads and parses the configuration file. + * + * @param path + * path to the configuration file + * @return ConfigurationSettings + * @throws IOException + * if an error occurred while reading the configuration file + * @throws SAXException + * if an error occurred while using the xml parser + * @throws ParserConfigurationException + * if the initialization of the xml parser failed + */ + private static ConfigSettings readConfiguration(final String path) + throws IOException, SAXException, ParserConfigurationException + { - ConfigurationReader reader = new ConfigurationReader(path); - return reader.read(); - } + ConfigurationReader reader = new ConfigurationReader(path); + return reader.read(); + } - /** - * No object - Utility class - */ - private DiffTool() { - } + /** + * No object - Utility class + */ + private DiffTool() + { + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffToolThread.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffToolThread.java index be169225..e8c80a19 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffToolThread.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/DiffToolThread.java @@ -63,284 +63,308 @@ /** * This class represents the main method for the DiffTool application */ -public class DiffToolThread extends Thread { - - /** - * Reference to the DiffTool Logger - */ - private static Logger logger; - - /** - * Reference to the Configuration - */ - private final ConfigurationManager cconfig; - - /** - * Configuration Parameter - Statistical output flag - */ - private boolean MODE_STATISTICAL_OUTPUT; - - /** - * (Constructor) Creates a DiffToolThread object. - * - * @param config Reference to the configuration - * @throws LoggingException if an error occurs while logging - */ - public DiffToolThread(final ConfigSettings config) - throws LoggingException { - - this.cconfig = new ConfigurationManager(config); - - try { - MODE_STATISTICAL_OUTPUT = (Boolean) cconfig - .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - } catch (ConfigurationException e) { - MODE_STATISTICAL_OUTPUT = false; - } - - logger = LoggingFactory.createLogger(LoggerType.DIFF_TOOL, "DiffTool"); - } - - /** - * This class is used to receive tasks from the diff modules and transmits - * them to the sql modules. 
- */ - private class TaskTransmitter implements TaskTransmitterInterface { +public class DiffToolThread + extends Thread +{ /** - * Reference to the (dump) output writer + * Reference to the DiffTool Logger */ - private final WriterInterface dumpWriter; + private static Logger logger; /** - * Configuration Parameter - Output mode + * Reference to the Configuration */ - private final OutputType MODE_OUTPUT; + private final ConfigurationManager cconfig; /** * Configuration Parameter - Statistical output flag */ - private final boolean MODE_STATISTICAL_OUTPUT; + private boolean MODE_STATISTICAL_OUTPUT; /** - * Configuration Parameter - Datafile output flasg + * (Constructor) Creates a DiffToolThread object. + * + * @param config + * Reference to the configuration + * @throws LoggingException + * if an error occurs while logging */ - private final boolean MODE_DATAFILE_OUTPUT; + public DiffToolThread(final ConfigSettings config) throws LoggingException + { + + this.cconfig = new ConfigurationManager(config); + + try { + MODE_STATISTICAL_OUTPUT = (Boolean) cconfig + .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + } + catch (ConfigurationException e) { + MODE_STATISTICAL_OUTPUT = false; + } + + logger = LoggingFactory.createLogger(LoggerType.DIFF_TOOL, "DiffTool"); + } /** - * (Constructor) Creates a TaskTransmitter object. - * - * @throws ConfigurationException if an error occurs while accessing the configuration - * @throws IOException if an error occurs while writing the output - * @throws LoggingException if an error occurs while logging + * This class is used to receive tasks from the diff modules and transmits them to the sql + * modules. */ - public TaskTransmitter() throws ConfigurationException, IOException, LoggingException { + private class TaskTransmitter + implements TaskTransmitterInterface + { + + /** + * Reference to the (dump) output writer + */ + private final WriterInterface dumpWriter; + + /** + * Configuration Parameter - Output mode + */ + private final OutputType MODE_OUTPUT; + + /** + * Configuration Parameter - Statistical output flag + */ + private final boolean MODE_STATISTICAL_OUTPUT; + + /** + * Configuration Parameter - Datafile output flasg + */ + private final boolean MODE_DATAFILE_OUTPUT; + + /** + * (Constructor) Creates a TaskTransmitter object. 
+ * + * @throws ConfigurationException + * if an error occurs while accessing the configuration + * @throws IOException + * if an error occurs while writing the output + * @throws LoggingException + * if an error occurs while logging + */ + public TaskTransmitter() throws ConfigurationException, IOException, LoggingException + { + + ConfigurationManager config = ConfigurationManager.getInstance(); + + MODE_OUTPUT = (OutputType) config.getConfigParameter(ConfigurationKeys.MODE_OUTPUT); + MODE_STATISTICAL_OUTPUT = (Boolean) cconfig + .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + MODE_DATAFILE_OUTPUT = (Boolean) cconfig + .getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT); + + switch (MODE_OUTPUT) { + + case UNCOMPRESSED: + if (MODE_DATAFILE_OUTPUT) { + this.dumpWriter = new DataFileWriter("output"); + } + else { + if (MODE_STATISTICAL_OUTPUT) { + this.dumpWriter = new TimedSQLFileWriter("output", logger); + } + else { + this.dumpWriter = new SQLFileWriter("output", logger); + } + } + break; + + case SEVENZIP: + case BZIP2: + case ALTERNATE: + if (MODE_DATAFILE_OUTPUT) { + this.dumpWriter = new DataFileArchiveWriter("output"); + } + else { + if (MODE_STATISTICAL_OUTPUT) { + this.dumpWriter = new TimedSQLArchiveWriter("output", logger); + } + else { + this.dumpWriter = new SQLArchiveWriter("output", logger); + } + } + break; + + case DATABASE: + if (MODE_DATAFILE_OUTPUT) { + throw ErrorFactory.createConfigurationException( + ErrorKeys.DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE); + } + else { + if (MODE_STATISTICAL_OUTPUT) { + this.dumpWriter = new TimedSQLDatabaseWriter(logger); + } + else { + this.dumpWriter = new SQLDatabaseWriter(logger); + } + } + break; + + default: + throw ErrorFactory.createConfigurationException( + ErrorKeys.DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE); + } + } - ConfigurationManager config = ConfigurationManager.getInstance(); + /** + * Receives a DiffTask Transmission. + */ + @Override + public void transmitDiff(final Task result) + { + writeOutput(result); + } - MODE_OUTPUT = (OutputType) config.getConfigParameter(ConfigurationKeys.MODE_OUTPUT); - MODE_STATISTICAL_OUTPUT = (Boolean) cconfig.getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - MODE_DATAFILE_OUTPUT = (Boolean) cconfig.getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT); + /** + * Receives a partial DiffTask Transmission. + */ + @Override + public void transmitPartialDiff(final Task result) + { + writeOutput(result); + } - switch (MODE_OUTPUT) { + @Override + public void close() throws IOException, SQLException + { + dumpWriter.close(); + } - case UNCOMPRESSED: - if (MODE_DATAFILE_OUTPUT) { - this.dumpWriter = new DataFileWriter("output"); - } else { - if (MODE_STATISTICAL_OUTPUT) { - this.dumpWriter = new TimedSQLFileWriter("output", logger); - } else { - this.dumpWriter = new SQLFileWriter("output", logger); + /** + * Forwards the DiffTask to the encoding modules. 
+ * + * @param result + * Reference to a DiffTask + */ + private void writeOutput(final Task result) + { + + try { + long time, start = System.currentTimeMillis(); + dumpWriter.process(result); + + time = System.currentTimeMillis() - start; + + SQLConsumerLogMessages.logDiffProcessed(logger, result, time); + + // Output Encoding Error } - } - break; - - case SEVENZIP: - case BZIP2: - case ALTERNATE: - if (MODE_DATAFILE_OUTPUT) { - this.dumpWriter = new DataFileArchiveWriter("output"); - } else { - if (MODE_STATISTICAL_OUTPUT) { - this.dumpWriter = new TimedSQLArchiveWriter("output", logger); - } else { - this.dumpWriter = new SQLArchiveWriter("output", logger); + catch (SQLConsumerException e) { + + SQLConsumerLogMessages.logSQLConsumerException(logger, e); + e.printStackTrace(); + + // Critical Exceptions } - } - break; - - case DATABASE: - if (MODE_DATAFILE_OUTPUT) { - throw ErrorFactory - .createConfigurationException(ErrorKeys.DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE); - } else { - if (MODE_STATISTICAL_OUTPUT) { - this.dumpWriter = new TimedSQLDatabaseWriter(logger); - } else { - this.dumpWriter = new SQLDatabaseWriter(logger); + catch (ConfigurationException | IOException e) { + throw new RuntimeException(e); } - } - break; + } - default: - throw ErrorFactory - .createConfigurationException(ErrorKeys.DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE); - } } /** - * Receives a DiffTask Transmission. + * Runs the diff creation process */ @Override - public void transmitDiff(final Task result) { - writeOutput(result); - } - - /** - * Receives a partial DiffTask Transmission. - */ - @Override - public void transmitPartialDiff(final Task result) { - writeOutput(result); - } - - @Override - public void close() throws IOException, SQLException { - dumpWriter.close(); - } - - /** - * Forwards the DiffTask to the encoding modules. 
- * - * @param result Reference to a DiffTask - */ - private void writeOutput(final Task result) { + public void run() + { - try { - long time, start = System.currentTimeMillis(); - dumpWriter.process(result); + try { + ArchiveManager archives = new ArchiveManager(); + ArticleReaderInterface articleReader; + ArchiveDescription description = null; + Task task; + DiffCalculatorInterface diffCalc; - time = System.currentTimeMillis() - start; + if (MODE_STATISTICAL_OUTPUT) { + diffCalc = new TimedDiffCalculator(new TaskTransmitter()); + } + else { + diffCalc = new DiffCalculator(new TaskTransmitter()); + } - SQLConsumerLogMessages.logDiffProcessed(logger, result, time); + long start, time; - // Output Encoding Error - } catch (SQLConsumerException e) { + while (archives.hasArchive()) { - SQLConsumerLogMessages.logSQLConsumerException(logger, e); - e.printStackTrace(); + // Retrieve Archive + try { + description = archives.getArchive(); - // Critical Exceptions - } catch (ConfigurationException | IOException e) { - throw new RuntimeException(e); - } - } + // initialize filter + ArticleFilter nameFilter = new ArticleFilter(); + articleReader = InputFactory.getTaskReader(description, nameFilter); + ArticleConsumerLogMessages.logArchiveRetrieved(logger, description); - } + // Exception while accessing the archive + } + catch (ArticleReaderException e) { - /** - * Runs the diff creation process - */ - @Override - public void run() { + articleReader = null; + ArticleConsumerLogMessages.logExceptionRetrieveArchive(logger, description, e); + } - try { - ArchiveManager archives = new ArchiveManager(); - ArticleReaderInterface articleReader; - ArchiveDescription description = null; - Task task; - DiffCalculatorInterface diffCalc; + // Process Archive + while (articleReader != null) { + try { + if (articleReader.hasNext()) { - if (MODE_STATISTICAL_OUTPUT) { - diffCalc = new TimedDiffCalculator(new TaskTransmitter()); - } else { - diffCalc = new DiffCalculator(new TaskTransmitter()); - } + start = System.currentTimeMillis(); + // read the next article (may be null if filtered) + task = articleReader.next(); + time = System.currentTimeMillis() - start; - long start, time; + // task will be null if the name filter removed that + // article + if (task == null) { + continue; + } - while (archives.hasArchive()) { + ArticleConsumerLogMessages.logArticleRead(logger, task, time, + articleReader.getBytePosition()); + start = System.currentTimeMillis(); + // calculate the diff for this article version + diffCalc.process(task); + time = System.currentTimeMillis() - start; - // Retrieve Archive - try { - description = archives.getArchive(); + DiffConsumerLogMessages.logArticleProcessed(logger, task, time); - // initialize filter - ArticleFilter nameFilter = new ArticleFilter(); + } + else { + ArticleConsumerLogMessages.logNoMoreArticles(logger, description); + articleReader = null; + } - articleReader = InputFactory.getTaskReader(description, - nameFilter); - ArticleConsumerLogMessages.logArchiveRetrieved(logger, - description); + // Reset current article + } + catch (ArticleReaderException e) { - // Exception while accessing the archive - } catch (ArticleReaderException e) { + ArticleConsumerLogMessages.logTaskReaderException(logger, e); + articleReader.resetTaskCompleted(); - articleReader = null; - ArticleConsumerLogMessages.logExceptionRetrieveArchive( - logger, description, e); - } + } + catch (DiffException e) { - // Process Archive - while (articleReader != null) { - try { - if (articleReader.hasNext()) { - - 
start = System.currentTimeMillis(); - //read the next article (may be null if filtered) - task = articleReader.next(); - time = System.currentTimeMillis() - start; - - // task will be null if the name filter removed that - // article - if (task == null) { - continue; - } - - ArticleConsumerLogMessages - .logArticleRead(logger, task, time, - articleReader.getBytePosition()); - - start = System.currentTimeMillis(); - //calculate the diff for this article version - diffCalc.process(task); - time = System.currentTimeMillis() - start; - - DiffConsumerLogMessages.logArticleProcessed(logger, - task, time); - - } else { - ArticleConsumerLogMessages.logNoMoreArticles( - logger, description); - articleReader = null; + DiffConsumerLogMessages.logDiffException(logger, e); + articleReader.resetTaskCompleted(); + diffCalc.reset(); + } + } } + diffCalc.closeTransmitter(); - // Reset current article - } catch (ArticleReaderException e) { - - ArticleConsumerLogMessages.logTaskReaderException( - logger, e); - articleReader.resetTaskCompleted(); - - } catch (DiffException e) { + ArticleConsumerLogMessages.logNoMoreArchives(logger); - DiffConsumerLogMessages.logDiffException(logger, e); - articleReader.resetTaskCompleted(); - diffCalc.reset(); - } + // Critical Exceptions + } + catch (Exception e) { + DiffToolLogMessages.logException(logger, e); + throw new RuntimeException(e); } - } - diffCalc.closeTransmitter(); - - ArticleConsumerLogMessages.logNoMoreArchives(logger); - - // Critical Exceptions - } catch (Exception e) { - DiffToolLogMessages.logException(logger, e); - throw new RuntimeException(e); } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationKeys.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationKeys.java index e2d0f1e9..8cb8c7e3 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationKeys.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationKeys.java @@ -20,276 +20,269 @@ /** * Contains all applicable keys for the configuration file. */ -public enum ConfigurationKeys { - - /* - * +DIVERSES+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - - /** - * Type: SurrogateModes Used by: DiffCalculator, RevisionApi - *
- * Description: Surrogate Mode - */ - MODE_SURROGATES, - - /** - * Type: Integer Used by: SQLEncoder - *
- * Description: MaxAllowedPacket variable of the MySQL Server - */ - LIMIT_SQLSERVER_MAX_ALLOWED_PACKET, - - /** - * Type: OutputMode Used by: SQLConsumer - *
- * Description: Output Mode - */ - MODE_OUTPUT, - - /** - * Type: boolean Used by: RevisionApi - *
- * Description: Enables the zip compression - */ - MODE_ZIP_COMPRESSION_ENABLED, - - /** - * Type: boolean Used by: RevisionApi - *
- * Description: Enables the binary output - */ - MODE_BINARY_OUTPUT_ENABLED, - - /** - * Type: boolean Used by: All Consumers and the processing components - *
- * Description: Enables the statistical output - */ - MODE_STATISTICAL_OUTPUT, - - /** - * Type: boolean - *
- * Description: Write datafiles instead of SQL dumps - */ - MODE_DATAFILE_OUTPUT, - - /** - * Type: boolean Used by: All Consumers and the processing components - *
- * Description: Enables the debug output - */ - MODE_DEBUG_OUTPUT, - - /** - * Type: String Used by: everybody - *
- * Description: Charset name of the input data - *
- * Recommendation / Default: "UTF-8" - */ - WIKIPEDIA_ENCODING, - - /** - * Type: Integer Range: > 1 Used by: DiffConsumers - Diff Generation - *
- * Description: This number indicates which revisions should be full - * revisions. - *
- * A full revision is generated if the result of the revisionCounter of the - * revision modulo COUNTER_FULL_REVISION is 0. - *
- * Recommendation / Default: Currenty a value of 1000 is used. - *
- * Example: COUNTER_FULL_REVISION = 100 - *
- * FullRevisions are all revisions with a revisionCounter % 100 == 0 0, 100, - * 200, 300, 400, ... - */ - COUNTER_FULL_REVISION, - - /** - * Type: Integer Range: > 1 Used by: DiffConsumers - Common Longest - * Substring Search - *
- * Description: This number indicates when a matching sequence between two - * revisions is considered as sequence. - *
- * Recommendation / Default: Currently 12 Value should be greater than the - * encoded size of an operation. - */ - VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING, - - - /* - * +OUTPUT+VERIFICATION++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Type: String Used by: SQLConsumer - SQLFileWriter - *
- * Description: Output-Directory for the sql files - *
- * Recommendation / Default: No default value - has to be configured! - *
- * More consumers should lead to a speed-up - */ - PATH_OUTPUT_SQL_FILES, - - /** - * Type: Long Used by: SQLConsumer - SQLFileWriter - *
- * Description: Maximum size of an sql file (in bytes) - *
- * Recommendation / Default: Currently 100 MB - */ - LIMIT_SQL_FILE_SIZE, - - /** - * Type: Long Used by: SQLConsumer - SevenZipSQLWriter - *
- * Description: Maximum size of an sql archive file (in bytes) - *
- * Recommendation / Default: Currently not implemented - */ - LIMIT_SQL_ARCHIVE_SIZE, - - /** - * Type: Boolean Used by: DiffConsumer - DiffCalculator - *
- * Description: Enabels the verification of the diff generation - *
- * Recommendation / Default: Should only be used for debug purposes - */ - VERIFICATION_DIFF, - - /** - * Type: Boolean Used by: SQLConsumer - SQLFileWriter - *
- * Description: Enables the verification of the encoded revision data - *
- * Recommendation / Default: Should only be used for debug purposes - */ - VERIFICATION_ENCODING, - - - /* - * +RESOURCE+LIMITATIONS+++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Type: Long Used by: ArticleConsumers - *
- * Description: This value indicates the maximum size of an article task (in - * bytes). If the limit is reached the task will be splitted into parts. - *
- * Recommendation / Default: Currently 10 MB - *
- * USE WITH CAUTION! Large value could lead to a memory overflow - */ - LIMIT_TASK_SIZE_REVISIONS, - - /** - * Type: Long Used by: DiffConsumers - *
- * Description: This value indicates the maximum size of a diff task (in - * bytes). If the limit is reached the task will be splitted into parts. - *
- * Recommendation / Default: Currently 10 MB - *
- * USE WITH CAUTION! Large value could lead to a memory overflow - */ - LIMIT_TASK_SIZE_DIFFS, - - - /* - * +EXTERNAL+PROGRAMS++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Type: String Used by: ArticleConsumers - ArticleReader - InputFactory - *
- * Description: If you want to use 7Zip to decompress your 7z or bz2 - * archives set the corresponding path in the config file. - *
- * Recommendation / Default: not set, faster than bzip2 - */ - PATH_PROGRAM_7ZIP, - - - /* - * +UNCOMPRESSED+SERVER+SETTINGS++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - SQL_HOST, - - /** - * Type: String Used by: SQLConsumers - SQLDatabaseWriter - *
- * Description: Name of the sql database - *
- * Recommendation / Default: currently not used - */ - SQL_DATABASE, - - /** - * Type: String Used by: SQLConsumers - SQLDatabaseWriter - *
- * Description: Username of your sql producer - *
- * Recommendation / Default: currently not used - */ - SQL_USERNAME, - - /** - * Type: String Used by: SQLConsumers - SQLDatabaseWriter - *
- * Description: Password for the corresponding username - *
- * Recommendation / Default: currently not used - */ - SQL_PASSWORD, - - /* - * +LOGGING++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Type: String Used by: All Loggers - *
- * Description: Root-Directory for all logger - *
- * Recommendation / Default: "logs/" - */ - LOGGING_PATH_DIFFTOOL, - - /** - * Type: String Used by: DiffConsumer, SQLConsumer - *
- * Description: Output directory for articles with failed verifications - *
- * Recommendation / Default: "logs/" + "debug/" - */ - LOGGING_PATH_DEBUG, - - /** - * Type: {@link org.slf4j.event.Level} Used by: DiffTool Logger - *
- * Description: Log level for the diff tool logger - *
- * Recommendation / Default: Log.INFO - *
- * Note that the corresponding output directory for the logger has to exist - * when the LogLevel is not Level.OFF - */ - LOGGING_LOGLEVEL_DIFFTOOL, - - /** - * Type: java.util.Set Used by: ArticleFilter - *
- * Description: The Set of namespaces to keep in output - */ - NAMESPACES_TO_KEEP +public enum ConfigurationKeys +{ + + /* + * +DIVERSES+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Type: SurrogateModes Used by: DiffCalculator, RevisionApi + *
+ * Description: Surrogate Mode + */ + MODE_SURROGATES, + + /** + * Type: Integer Used by: SQLEncoder + *
+ * Description: MaxAllowedPacket variable of the MySQL Server + */ + LIMIT_SQLSERVER_MAX_ALLOWED_PACKET, + + /** + * Type: OutputMode Used by: SQLConsumer + *
+ * Description: Output Mode + */ + MODE_OUTPUT, + + /** + * Type: boolean Used by: RevisionApi + *
+ * Description: Enables the zip compression + */ + MODE_ZIP_COMPRESSION_ENABLED, + + /** + * Type: boolean Used by: RevisionApi + *
+ * Description: Enables the binary output + */ + MODE_BINARY_OUTPUT_ENABLED, + + /** + * Type: boolean Used by: All Consumers and the processing components + *
+ * Description: Enables the statistical output + */ + MODE_STATISTICAL_OUTPUT, + + /** + * Type: boolean + *
+ * Description: Write datafiles instead of SQL dumps + */ + MODE_DATAFILE_OUTPUT, + + /** + * Type: boolean Used by: All Consumers and the processing components + *
+ * Description: Enables the debug output + */ + MODE_DEBUG_OUTPUT, + + /** + * Type: String Used by: everybody + *
+ * Description: Charset name of the input data + *
+ * Recommendation / Default: "UTF-8" + */ + WIKIPEDIA_ENCODING, + + /** + * Type: Integer Range: > 1 Used by: DiffConsumers - Diff Generation + *
+ * Description: This number indicates which revisions should be full revisions. + *
+ * A full revision is generated if the result of the revisionCounter of the revision modulo + * COUNTER_FULL_REVISION is 0. + *
+ * Recommendation / Default: Currently a value of 1000 is used. + *
+ * Example: COUNTER_FULL_REVISION = 100 + *
+ * FullRevisions are all revisions with a revisionCounter % 100 == 0: 0, 100, 200, 300, 400, ... + */ + COUNTER_FULL_REVISION, + + /** + * Type: Integer Range: > 1 Used by: DiffConsumers - Common Longest Substring Search + *
+ * Description: This number indicates when a matching sequence between two revisions is + * considered as a sequence. + *
+ * Recommendation / Default: Currently 12 Value should be greater than the encoded size of an + * operation. + */ + VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING, + + /* + * +OUTPUT+VERIFICATION++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Type: String Used by: SQLConsumer - SQLFileWriter + *
+ * Description: Output-Directory for the sql files + *
+ * Recommendation / Default: No default value - has to be configured! + *
+ * More consumers should lead to a speed-up + */ + PATH_OUTPUT_SQL_FILES, + + /** + * Type: Long Used by: SQLConsumer - SQLFileWriter + *
+ * Description: Maximum size of an sql file (in bytes) + *
+ * Recommendation / Default: Currently 100 MB + */ + LIMIT_SQL_FILE_SIZE, + + /** + * Type: Long Used by: SQLConsumer - SevenZipSQLWriter + *
+ * Description: Maximum size of an sql archive file (in bytes) + *
+ * Recommendation / Default: Currently not implemented + */ + LIMIT_SQL_ARCHIVE_SIZE, + + /** + * Type: Boolean Used by: DiffConsumer - DiffCalculator + *
+ * Description: Enables the verification of the diff generation + *
+ * Recommendation / Default: Should only be used for debug purposes + */ + VERIFICATION_DIFF, + + /** + * Type: Boolean Used by: SQLConsumer - SQLFileWriter + *
+ * Description: Enables the verification of the encoded revision data + *
+ * Recommendation / Default: Should only be used for debug purposes + */ + VERIFICATION_ENCODING, + + /* + * +RESOURCE+LIMITATIONS+++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Type: Long Used by: ArticleConsumers + *
+ * Description: This value indicates the maximum size of an article task (in bytes). If the + * limit is reached, the task will be split into parts. + *
+ * Recommendation / Default: Currently 10 MB + *
+ * USE WITH CAUTION! Large value could lead to a memory overflow + */ + LIMIT_TASK_SIZE_REVISIONS, + + /** + * Type: Long Used by: DiffConsumers + *
+ * Description: This value indicates the maximum size of a diff task (in bytes). If the limit is + * reached, the task will be split into parts. + *
+ * Recommendation / Default: Currently 10 MB + *
+ * USE WITH CAUTION! Large value could lead to a memory overflow + */ + LIMIT_TASK_SIZE_DIFFS, + + /* + * +EXTERNAL+PROGRAMS++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Type: String Used by: ArticleConsumers - ArticleReader - InputFactory + *
+ * Description: If you want to use 7Zip to decompress your 7z or bz2 archives, set the + * corresponding path in the config file. + *
+ * Recommendation / Default: not set, faster than bzip2 + */ + PATH_PROGRAM_7ZIP, + + /* + * +UNCOMPRESSED+SERVER+SETTINGS++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + SQL_HOST, + + /** + * Type: String Used by: SQLConsumers - SQLDatabaseWriter + *
+ * Description: Name of the sql database + *
+ * Recommendation / Default: currently not used + */ + SQL_DATABASE, + + /** + * Type: String Used by: SQLConsumers - SQLDatabaseWriter + *
+ * Description: Username of your sql producer + *
+ * Recommendation / Default: currently not used + */ + SQL_USERNAME, + + /** + * Type: String Used by: SQLConsumers - SQLDatabaseWriter + *
+ * Description: Password for the corresponding username + *
+ * Recommendation / Default: currently not used + */ + SQL_PASSWORD, + + /* + * +LOGGING++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Type: String Used by: All Loggers + *
+ * Description: Root-Directory for all loggers + *
+ * Recommendation / Default: "logs/" + */ + LOGGING_PATH_DIFFTOOL, + + /** + * Type: String Used by: DiffConsumer, SQLConsumer + *
+ * Description: Output directory for articles with failed verifications + *
+ * Recommendation / Default: "logs/" + "debug/" + */ + LOGGING_PATH_DEBUG, + + /** + * Type: {@link org.slf4j.event.Level} Used by: DiffTool Logger + *
+ * Description: Log level for the diff tool logger + *
+ * Recommendation / Default: Log.INFO + *
+ * Note that the corresponding output directory for the logger has to exist when the LogLevel is + * not Level.OFF + */ + LOGGING_LOGLEVEL_DIFFTOOL, + + /** + * Type: java.util.Set Used by: ArticleFilter + *
+ * Description: The Set of namespaces to keep in output + */ + NAMESPACES_TO_KEEP } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationManager.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationManager.java index a8b7ba05..8600563c 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationManager.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationManager.java @@ -29,92 +29,112 @@ /** * Singleton - Manages the configuration settings for the DiffTool. */ -public class ConfigurationManager { +public class ConfigurationManager +{ - /** - * Reference to the created instance - */ - private static ConfigurationManager instance; + /** + * Reference to the created instance + */ + private static ConfigurationManager instance; - /** - * Returns the reference to the instance of the ConfigurationManager. - * - * @return ConfigurationManager - * @throws ConfigurationException if the ConfigurationManager has not been created during the - * startup of the application. - */ - public static ConfigurationManager getInstance() throws ConfigurationException { + /** + * Returns the reference to the instance of the ConfigurationManager. + * + * @return ConfigurationManager + * @throws ConfigurationException + * if the ConfigurationManager has not been created during the startup of the + * application. + */ + public static ConfigurationManager getInstance() throws ConfigurationException + { - if (instance == null) { - throw ErrorFactory.createConfigurationException(ErrorKeys.CONFIGURATION_CONFIGURATIONMANAGER_NOT_INITIALIZED); + if (instance == null) { + throw ErrorFactory.createConfigurationException( + ErrorKeys.CONFIGURATION_CONFIGURATIONMANAGER_NOT_INITIALIZED); + } + return instance; } - return instance; - } - /** - * Reference to the ConfigurationSettings - */ - private final ConfigSettings config; + /** + * Reference to the ConfigurationSettings + */ + private final ConfigSettings config; - /** - * (Constructor) Creates the Configuration Manager - This constructor should - * only be called during the startup of the DiffTool Application. - * - * @param config Reference to the ConfigurationSettings - */ - public ConfigurationManager(final ConfigSettings config) { - instance = this; - this.config = config; - } + /** + * (Constructor) Creates the Configuration Manager - This constructor should only be called + * during the startup of the DiffTool Application. + * + * @param config + * Reference to the ConfigurationSettings + */ + public ConfigurationManager(final ConfigSettings config) + { + instance = this; + this.config = config; + } - /** - * Returns the list of input archives. - * - * @return list of input archives - */ - public List getArchiveList() { - return this.config.getArchiveList(); - } + /** + * Returns the list of input archives. + * + * @return list of input archives + */ + public List getArchiveList() + { + return this.config.getArchiveList(); + } - /** - * Returns the value of the configuration parameter. - * - * @param configParameter Key for the configuration parameter. - * @return Value of the configuration parameter - * @throws ConfigurationException if the configuration value was not defined or was not set. 
- */ - public Object getConfigParameter(final ConfigurationKeys configParameter) throws ConfigurationException { + /** + * Returns the value of the configuration parameter. + * + * @param configParameter + * Key for the configuration parameter. + * @return Value of the configuration parameter + * @throws ConfigurationException + * if the configuration value was not defined or was not set. + */ + public Object getConfigParameter(final ConfigurationKeys configParameter) + throws ConfigurationException + { - Object o = this.config.getConfigParameter(configParameter); - if (o != null) { - return o; - } - //return standard values for some of the parameters if they - //are not set in the configuration - //this is only done for uncritical settings, e.g. debug or logging - //For other parameters, missing settings will produce an exception - else if (configParameter == ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE) { - return Long.MAX_VALUE; - } else if (configParameter == ConfigurationKeys.LIMIT_SQL_FILE_SIZE) { - return Long.MAX_VALUE; - } else if (configParameter == ConfigurationKeys.MODE_STATISTICAL_OUTPUT) { - return false; - } else if (configParameter == ConfigurationKeys.MODE_DEBUG_OUTPUT) { - return false; - } else if (configParameter == ConfigurationKeys.VERIFICATION_ENCODING) { - return false; - } else if (configParameter == ConfigurationKeys.VERIFICATION_DIFF) { - return false; - } else if (configParameter == ConfigurationKeys.LOGGING_PATH_DEBUG) { - return ""; - } else if (configParameter == ConfigurationKeys.NAMESPACES_TO_KEEP) { - return new HashSet(); - } else if (configParameter == ConfigurationKeys.MODE_DATAFILE_OUTPUT) { - return false; - } else { - throw ErrorFactory.createConfigurationException( - ErrorKeys.CONFIGURATION_CONFIGURATIONMANAGER_UNKNOWN_CONFIG_PARAMETER, - configParameter.toString()); + Object o = this.config.getConfigParameter(configParameter); + if (o != null) { + return o; + } + // return standard values for some of the parameters if they + // are not set in the configuration + // this is only done for uncritical settings, e.g. 
debug or logging + // For other parameters, missing settings will produce an exception + else if (configParameter == ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE) { + return Long.MAX_VALUE; + } + else if (configParameter == ConfigurationKeys.LIMIT_SQL_FILE_SIZE) { + return Long.MAX_VALUE; + } + else if (configParameter == ConfigurationKeys.MODE_STATISTICAL_OUTPUT) { + return false; + } + else if (configParameter == ConfigurationKeys.MODE_DEBUG_OUTPUT) { + return false; + } + else if (configParameter == ConfigurationKeys.VERIFICATION_ENCODING) { + return false; + } + else if (configParameter == ConfigurationKeys.VERIFICATION_DIFF) { + return false; + } + else if (configParameter == ConfigurationKeys.LOGGING_PATH_DEBUG) { + return ""; + } + else if (configParameter == ConfigurationKeys.NAMESPACES_TO_KEEP) { + return new HashSet(); + } + else if (configParameter == ConfigurationKeys.MODE_DATAFILE_OUTPUT) { + return false; + } + else { + throw ErrorFactory.createConfigurationException( + ErrorKeys.CONFIGURATION_CONFIGURATIONMANAGER_UNKNOWN_CONFIG_PARAMETER, + configParameter.toString()); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationReader.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationReader.java index cd264abd..0df10060 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationReader.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/ConfigurationReader.java @@ -41,818 +41,847 @@ /** * This Reader reads the xml-configuration files for the DiffTool. */ -public class ConfigurationReader { - - /** - * XML tree root node - */ - private final Element root; - - /** - * Section identifier - Mode - */ - private final String SECTION_MODE = "VALUES"; - - /** - * Key identifier - Mode >> Minimum longest common substring - */ - private final String KEY_VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING = "VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING"; - - /** - * Key identifier - Mode >> full revision counter - */ - private final String KEY_COUNTER_FULL_REVISION = "COUNTER_FULL_REVISION"; - - /** - * Section identifier - Externals - */ - private final String SECTION_EXTERNALS = "EXTERNALS"; - - /** - * Key identifier - Externals >> SevenZip - */ - private final String KEY_SEVENZIP = "SEVENZIP"; - - /** - * Section identifier - Input - */ - private final String SECTION_INPUT = "INPUT"; - - /** - * Key identifier - Input >> Surrogates Mode - */ - private final String KEY_MODE_SURROGATES = "MODE_SURROGATES"; - - /** - * Key identifier - Input >> Wikipedia Encoding - */ - private final String KEY_WIKIPEDIA_ENCODING = "WIKIPEDIA_ENCODING"; - - /** - * Subsection identifier - Input -> Archive - */ - private final String SUBSECTION_ARCHIVE = "ARCHIVE"; - - /** - * Key identifier - Input -> Archive >> Type - */ - private final String KEY_TYPE = "TYPE"; - - /** - * Key identifier - Input -> Archive >> Path - */ - private final String KEY_PATH = "PATH"; - - /** - * Key identifier - Input -> Archive >> Start - */ - private final String KEY_START = "START"; - - /** - * Section identifier - Output - */ - private final String SECTION_OUTPUT = "OUTPUT"; - - /** - * Key identifier - Output >> MODE - */ - private final String KEY_OUTPUT_MODE = "OUTPUT_MODE"; - - /** - * Key identifier - Output >> MODE >> UNCOMPRESSED File Size - */ - private final String KEY_LIMIT_SQL_FILE_SIZE = 
"LIMIT_SQL_FILE_SIZE"; - - /** - * Key identifier - Output >> Enable Datafile - */ - private final String KEY_OUTPUT_DATAFILE = "MODE_DATAFILE_OUTPUT"; - - /** - * Key identifier - Output >> MODE >> UNCOMPRESSED Archive Size - */ - private final String KEY_LIMIT_SQL_ARCHIVE_SIZE = "LIMIT_SQL_ARCHIVE_SIZE"; - - /** - * Key identifier - Output >> MODE >> Zip-Compression enabled - */ - private final String KEY_MODE_ZIP_COMPRESSION_ENABLED = "MODE_ZIP_COMPRESSION_ENABLED"; - - /** - * Key identifier - Output >> MODE >> Binary output enabled - */ - private final String KEY_MODE_BINARY_OUTPUT_ENABLED = "MODE_BINARY_OUTPUT_ENABLED"; - - /** - * Subsection identifier - Output -> UNCOMPRESSED - */ - private final String SUBSECTION_SQL = "UNCOMPRESSED"; - - /** - * Key identifier - Output -> UNCOMPRESSED >> Host - */ - private final String KEY_HOST = "HOST"; - - /** - * Key identifier - Output -> UNCOMPRESSED >> Database - */ - private final String KEY_DATABASE = "DATABASE"; - - /** - * Key identifier - Output -> UNCOMPRESSED >> User - */ - private final String KEY_USER = "USER"; - - /** - * Key identifier - Output -> UNCOMPRESSED >> Password - */ - private final String KEY_PASSWORD = "PASSWORD"; - - /** - * Section identifier - Cache - */ - private final String SECTION_CACHE = "CACHE"; - - /** - * Key identifier - Cache >> Task Size Revisions - */ - private final String KEY_LIMIT_TASK_SIZE_REVISIONS = "LIMIT_TASK_SIZE_REVISIONS"; - - /** - * Key identifier - Cache >> Task Size Diff - */ - private final String KEY_LIMIT_TASK_SIZE_DIFFS = "LIMIT_TASK_SIZE_DIFFS"; - - /** - * Key identifier - Cache >> SQLProducer MAXALLOWEDPACKET - */ - private final String KEY_LIMIT_SQLSERVER_MAX_ALLOWED_PACKET = "LIMIT_SQLSERVER_MAX_ALLOWED_PACKET"; - - /** - * Section identifier - Logging - */ - private final String SECTION_LOGGING = "LOGGING"; - - /** - * Section identifier - Logging >> Root folder - */ - private final String KEY_ROOT_FOLDER = "ROOT_FOLDER"; - - /** - * Subsection identifier - Logging -> DiffTool - */ - private final String SUBSUBSECTION_DIFF_TOOL = "DIFF_TOOL"; - - /** - * Key identifier - Logging -> ... >> Level - */ - private final String KEY_LOG_LEVEL = "LEVEL"; - - /** - * Key identifier - Logging -> ... 
>> Path - */ - private final String KEY_LOG_PATH = "PATH"; - - /** - * Section identifier - Debug - */ - private final String SECTION_DEBUG = "DEBUG"; - - /** - * Key identifier - Debug -> Output >> Verification Diff - */ - private final String KEY_VERIFICATION_DIFF = "VERIFICATION_DIFF"; - - /** - * Key identifier - Debug -> Output >> Verification Encoding - */ - private final String KEY_VERIFICATION_ENCODING = "VERIFICATION_ENCODING"; - - /** - * Key identifier - Debug -> Output >> Statistical - */ - private final String KEY_STATISTICAL_OUTPUT = "STATISTICAL_OUTPUT"; - - /** - * Subsection identifier - Debug -> Output - */ - private final String SUBSECTION_DEBUG_OUTPUT = "DEBUG_OUTPUT"; - - /** - * Key identifier - Debug -> Output >> Enabled - */ - private final String KEY_DEBUG_ENABLED = "ENABLED"; - - /** - * Key identifier - Debug -> Output >> Path - */ - private final String KEY_DEBUG_PATH = "PATH"; - - /** - * Section identifier - filter - */ - private final String SECTION_FILTER = "FILTER"; - - /** - * Subsection identifier - filter -> namespaces - */ - private final String SUBSECTION_FILTER_NAMESPACES = "NAMESPACES"; - - /** - * Key identifier - filter -> namespaces >> ns - */ - private final String NAMESPACE_TO_KEEP = "NS"; - - /** - * (Constructor) Creates a new ConfigurationReader object. - * - * @param path - * @throws IOException if an error occurs while reading the file - * @throws SAXException if an error occurs while building the document - * @throws ParserConfigurationException if an error occurs while parsing the document - */ - public ConfigurationReader(final String path) - throws IOException, SAXException, ParserConfigurationException { - - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - - DocumentBuilder loader = factory.newDocumentBuilder(); - - Document document = loader.parse(path); - root = document.getDocumentElement(); - } - - /** - * Reads the input of the configuration file and parses the into the - * ConfigSettings object. 
- * - * @return ConfigSettings - */ - public ConfigSettings read() { - - ConfigSettings config = new ConfigSettings(ConfigEnum.IMPORT); - - String name; - Node node; - NodeList list = root.getChildNodes(); - - int length = list.getLength(); - for (int i = 0; i < length; i++) { - node = list.item(i); - - name = node.getNodeName().toUpperCase(); - - if (name.equals(SECTION_MODE)) { - parseModeConfig(node, config); - } else if (name.equals(SECTION_EXTERNALS)) { - parseExternalsConfig(node, config); - } else if (name.equals(SECTION_INPUT)) { - parseInputConfig(node, config); - } else if (name.equals(SECTION_OUTPUT)) { - parseOutputConfig(node, config); - } else if (name.equals(SECTION_CACHE)) { - parseCacheConfig(node, config); - } else if (name.equals(SECTION_LOGGING)) { - parseLoggingConfig(node, config); - } else if (name.equals(SECTION_DEBUG)) { - parseDebugConfig(node, config); - } else if (name.equals(SECTION_FILTER)) { - parseFilterConfig(node, config); - } +public class ConfigurationReader +{ + + /** + * XML tree root node + */ + private final Element root; + + /** + * Section identifier - Mode + */ + private final String SECTION_MODE = "VALUES"; + + /** + * Key identifier - Mode >> Minimum longest common substring + */ + private final String KEY_VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING = "VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING"; + + /** + * Key identifier - Mode >> full revision counter + */ + private final String KEY_COUNTER_FULL_REVISION = "COUNTER_FULL_REVISION"; + + /** + * Section identifier - Externals + */ + private final String SECTION_EXTERNALS = "EXTERNALS"; + + /** + * Key identifier - Externals >> SevenZip + */ + private final String KEY_SEVENZIP = "SEVENZIP"; + + /** + * Section identifier - Input + */ + private final String SECTION_INPUT = "INPUT"; + + /** + * Key identifier - Input >> Surrogates Mode + */ + private final String KEY_MODE_SURROGATES = "MODE_SURROGATES"; + + /** + * Key identifier - Input >> Wikipedia Encoding + */ + private final String KEY_WIKIPEDIA_ENCODING = "WIKIPEDIA_ENCODING"; + + /** + * Subsection identifier - Input -> Archive + */ + private final String SUBSECTION_ARCHIVE = "ARCHIVE"; + + /** + * Key identifier - Input -> Archive >> Type + */ + private final String KEY_TYPE = "TYPE"; + + /** + * Key identifier - Input -> Archive >> Path + */ + private final String KEY_PATH = "PATH"; + + /** + * Key identifier - Input -> Archive >> Start + */ + private final String KEY_START = "START"; + + /** + * Section identifier - Output + */ + private final String SECTION_OUTPUT = "OUTPUT"; + + /** + * Key identifier - Output >> MODE + */ + private final String KEY_OUTPUT_MODE = "OUTPUT_MODE"; + + /** + * Key identifier - Output >> MODE >> UNCOMPRESSED File Size + */ + private final String KEY_LIMIT_SQL_FILE_SIZE = "LIMIT_SQL_FILE_SIZE"; + + /** + * Key identifier - Output >> Enable Datafile + */ + private final String KEY_OUTPUT_DATAFILE = "MODE_DATAFILE_OUTPUT"; + + /** + * Key identifier - Output >> MODE >> UNCOMPRESSED Archive Size + */ + private final String KEY_LIMIT_SQL_ARCHIVE_SIZE = "LIMIT_SQL_ARCHIVE_SIZE"; + + /** + * Key identifier - Output >> MODE >> Zip-Compression enabled + */ + private final String KEY_MODE_ZIP_COMPRESSION_ENABLED = "MODE_ZIP_COMPRESSION_ENABLED"; + + /** + * Key identifier - Output >> MODE >> Binary output enabled + */ + private final String KEY_MODE_BINARY_OUTPUT_ENABLED = "MODE_BINARY_OUTPUT_ENABLED"; + + /** + * Subsection identifier - Output -> UNCOMPRESSED + */ + private final String SUBSECTION_SQL = "UNCOMPRESSED"; + + 
/** + * Key identifier - Output -> UNCOMPRESSED >> Host + */ + private final String KEY_HOST = "HOST"; + + /** + * Key identifier - Output -> UNCOMPRESSED >> Database + */ + private final String KEY_DATABASE = "DATABASE"; + + /** + * Key identifier - Output -> UNCOMPRESSED >> User + */ + private final String KEY_USER = "USER"; + + /** + * Key identifier - Output -> UNCOMPRESSED >> Password + */ + private final String KEY_PASSWORD = "PASSWORD"; + + /** + * Section identifier - Cache + */ + private final String SECTION_CACHE = "CACHE"; + + /** + * Key identifier - Cache >> Task Size Revisions + */ + private final String KEY_LIMIT_TASK_SIZE_REVISIONS = "LIMIT_TASK_SIZE_REVISIONS"; + + /** + * Key identifier - Cache >> Task Size Diff + */ + private final String KEY_LIMIT_TASK_SIZE_DIFFS = "LIMIT_TASK_SIZE_DIFFS"; + + /** + * Key identifier - Cache >> SQLProducer MAXALLOWEDPACKET + */ + private final String KEY_LIMIT_SQLSERVER_MAX_ALLOWED_PACKET = "LIMIT_SQLSERVER_MAX_ALLOWED_PACKET"; + + /** + * Section identifier - Logging + */ + private final String SECTION_LOGGING = "LOGGING"; + + /** + * Section identifier - Logging >> Root folder + */ + private final String KEY_ROOT_FOLDER = "ROOT_FOLDER"; + + /** + * Subsection identifier - Logging -> DiffTool + */ + private final String SUBSUBSECTION_DIFF_TOOL = "DIFF_TOOL"; + + /** + * Key identifier - Logging -> ... >> Level + */ + private final String KEY_LOG_LEVEL = "LEVEL"; + + /** + * Key identifier - Logging -> ... >> Path + */ + private final String KEY_LOG_PATH = "PATH"; + + /** + * Section identifier - Debug + */ + private final String SECTION_DEBUG = "DEBUG"; + + /** + * Key identifier - Debug -> Output >> Verification Diff + */ + private final String KEY_VERIFICATION_DIFF = "VERIFICATION_DIFF"; + + /** + * Key identifier - Debug -> Output >> Verification Encoding + */ + private final String KEY_VERIFICATION_ENCODING = "VERIFICATION_ENCODING"; + + /** + * Key identifier - Debug -> Output >> Statistical + */ + private final String KEY_STATISTICAL_OUTPUT = "STATISTICAL_OUTPUT"; + + /** + * Subsection identifier - Debug -> Output + */ + private final String SUBSECTION_DEBUG_OUTPUT = "DEBUG_OUTPUT"; + + /** + * Key identifier - Debug -> Output >> Enabled + */ + private final String KEY_DEBUG_ENABLED = "ENABLED"; + + /** + * Key identifier - Debug -> Output >> Path + */ + private final String KEY_DEBUG_PATH = "PATH"; + + /** + * Section identifier - filter + */ + private final String SECTION_FILTER = "FILTER"; + + /** + * Subsection identifier - filter -> namespaces + */ + private final String SUBSECTION_FILTER_NAMESPACES = "NAMESPACES"; + + /** + * Key identifier - filter -> namespaces >> ns + */ + private final String NAMESPACE_TO_KEEP = "NS"; + + /** + * (Constructor) Creates a new ConfigurationReader object. + * + * @param path + * @throws IOException + * if an error occurs while reading the file + * @throws SAXException + * if an error occurs while building the document + * @throws ParserConfigurationException + * if an error occurs while parsing the document + */ + public ConfigurationReader(final String path) + throws IOException, SAXException, ParserConfigurationException + { + + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + + DocumentBuilder loader = factory.newDocumentBuilder(); + + Document document = loader.parse(path); + root = document.getDocumentElement(); } - return config; - } - - - /** - * Parses the filter parameter section. 
- * - * @param node Reference to the current used xml node - * @param config Reference to the ConfigSettings - */ - private void parseFilterConfig(final Node node, final ConfigSettings config) { - - String name; - Node nnode; - final NodeList list = node.getChildNodes(); - final int length = list.getLength(); - - for (int i = 0; i < length; i++) { - nnode = list.item(i); - - name = nnode.getNodeName().toUpperCase(); - - if (name.equals(SUBSECTION_FILTER_NAMESPACES)) { - parseNamespaceFilterConfig(nnode, config); - } - + /** + * Reads the input of the configuration file and parses the into the ConfigSettings object. + * + * @return ConfigSettings + */ + public ConfigSettings read() + { + + ConfigSettings config = new ConfigSettings(ConfigEnum.IMPORT); + + String name; + Node node; + NodeList list = root.getChildNodes(); + + int length = list.getLength(); + for (int i = 0; i < length; i++) { + node = list.item(i); + + name = node.getNodeName().toUpperCase(); + + if (name.equals(SECTION_MODE)) { + parseModeConfig(node, config); + } + else if (name.equals(SECTION_EXTERNALS)) { + parseExternalsConfig(node, config); + } + else if (name.equals(SECTION_INPUT)) { + parseInputConfig(node, config); + } + else if (name.equals(SECTION_OUTPUT)) { + parseOutputConfig(node, config); + } + else if (name.equals(SECTION_CACHE)) { + parseCacheConfig(node, config); + } + else if (name.equals(SECTION_LOGGING)) { + parseLoggingConfig(node, config); + } + else if (name.equals(SECTION_DEBUG)) { + parseDebugConfig(node, config); + } + else if (name.equals(SECTION_FILTER)) { + parseFilterConfig(node, config); + } + } + + return config; } - } - - /** - * Parses the namespaces parameter section. This is the subsection of filter. - * - * @param node Reference to the current used xml node - * @param config Reference to the ConfigSettings - */ - private void parseNamespaceFilterConfig(final Node node, final ConfigSettings config) { - String name; - Integer value; - Node nnode; - final NodeList list = node.getChildNodes(); - final int length = list.getLength(); - final Set namespaces = new HashSet<>(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); + /** + * Parses the filter parameter section. + * + * @param node + * Reference to the current used xml node + * @param config + * Reference to the ConfigSettings + */ + private void parseFilterConfig(final Node node, final ConfigSettings config) + { - name = nnode.getNodeName().toUpperCase(); - if (name.equals(NAMESPACE_TO_KEEP)) { + String name; + Node nnode; + final NodeList list = node.getChildNodes(); + final int length = list.getLength(); - value = Integer.parseInt(nnode.getChildNodes().item(0) - .getNodeValue()); - namespaces.add(value); + for (int i = 0; i < length; i++) { + nnode = list.item(i); + name = nnode.getNodeName().toUpperCase(); - } + if (name.equals(SUBSECTION_FILTER_NAMESPACES)) { + parseNamespaceFilterConfig(nnode, config); + } + } } - config.setConfigParameter( - ConfigurationKeys.NAMESPACES_TO_KEEP, - namespaces); - - } + /** + * Parses the namespaces parameter section. This is the subsection of filter. 
+ * + * @param node + * Reference to the current used xml node + * @param config + * Reference to the ConfigSettings + */ + private void parseNamespaceFilterConfig(final Node node, final ConfigSettings config) + { + String name; + Integer value; + Node nnode; + final NodeList list = node.getChildNodes(); + final int length = list.getLength(); + final Set namespaces = new HashSet<>(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); - /** - * Parses the mode parameter section. - * - * @param node Reference to the current used xml node - * @param config Reference to the ConfigSettings - */ - private void parseModeConfig(final Node node, final ConfigSettings config) { + name = nnode.getNodeName().toUpperCase(); + if (name.equals(NAMESPACE_TO_KEEP)) { - String name; - Integer value; - Node nnode; - NodeList list = node.getChildNodes(); + value = Integer.parseInt(nnode.getChildNodes().item(0).getNodeValue()); + namespaces.add(value); - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); + } - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING)) { + } - value = Integer.parseInt(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING, - value); + config.setConfigParameter(ConfigurationKeys.NAMESPACES_TO_KEEP, namespaces); - } else if (name.equals(KEY_COUNTER_FULL_REVISION)) { - - value = Integer.parseInt(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.COUNTER_FULL_REVISION, value); + } - } + /** + * Parses the mode parameter section. + * + * @param node + * Reference to the current used xml node + * @param config + * Reference to the ConfigSettings + */ + private void parseModeConfig(final Node node, final ConfigSettings config) + { + + String name; + Integer value; + Node nnode; + NodeList list = node.getChildNodes(); + + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); + + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING)) { + + value = Integer.parseInt(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING, + value); + + } + else if (name.equals(KEY_COUNTER_FULL_REVISION)) { + + value = Integer.parseInt(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.COUNTER_FULL_REVISION, value); + + } + } } - } - /** - * Parses the externals parameter section. - * - * @param node Reference to the current used xml node - * @param config Reference to the ConfigSettings - */ - private void parseExternalsConfig(final Node node, - final ConfigSettings config) { + /** + * Parses the externals parameter section. 
+ * + * @param node + * Reference to the current used xml node + * @param config + * Reference to the ConfigSettings + */ + private void parseExternalsConfig(final Node node, final ConfigSettings config) + { - String name, value; - Node nnode; - NodeList list = node.getChildNodes(); + String name, value; + Node nnode; + NodeList list = node.getChildNodes(); - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_SEVENZIP)) { + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_SEVENZIP)) { - value = nnode.getChildNodes().item(0).getNodeValue(); - value = value.substring(1, value.length() - 1); + value = nnode.getChildNodes().item(0).getNodeValue(); + value = value.substring(1, value.length() - 1); - config.setConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP, - value); + config.setConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP, value); - } + } + } } - } - /** - * Parses the input parameter section. - * - * @param node Reference to the current used xml node - * @param config Reference to the ConfigSettings - */ - private void parseInputConfig(final Node node, final ConfigSettings config) { + /** + * Parses the input parameter section. + * + * @param node + * Reference to the current used xml node + * @param config + * Reference to the ConfigSettings + */ + private void parseInputConfig(final Node node, final ConfigSettings config) + { - String name, value; - Node nnode; - NodeList list = node.getChildNodes(); + String name, value; + Node nnode; + NodeList list = node.getChildNodes(); - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_WIKIPEDIA_ENCODING)) { + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_WIKIPEDIA_ENCODING)) { - value = nnode.getChildNodes().item(0).getNodeValue(); - config.setConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING, - value); + value = nnode.getChildNodes().item(0).getNodeValue(); + config.setConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING, value); - } else if (name.equals(KEY_MODE_SURROGATES)) { + } + else if (name.equals(KEY_MODE_SURROGATES)) { - SurrogateModes oValue = SurrogateModes.parse(nnode - .getChildNodes().item(0).getNodeValue()); - config.setConfigParameter(ConfigurationKeys.MODE_SURROGATES, - oValue); + SurrogateModes oValue = SurrogateModes + .parse(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.MODE_SURROGATES, oValue); - } else if (name.equals(SUBSECTION_ARCHIVE)) { + } + else if (name.equals(SUBSECTION_ARCHIVE)) { - parseInputArchive(nnode, config); + parseInputArchive(nnode, config); - } + } + } } - } - /** - * Parses the input archive subsection. - * - * @param node Reference to the current used xml node - * @param config Reference to the ConfigSettings - */ - private void parseInputArchive(final Node node, final ConfigSettings config) { + /** + * Parses the input archive subsection. 
+ * + * @param node + * Reference to the current used xml node + * @param config + * Reference to the ConfigSettings + */ + private void parseInputArchive(final Node node, final ConfigSettings config) + { - String name; + String name; - InputType type = null; - String path = null; - long startPosition = 0; + InputType type = null; + String path = null; + long startPosition = 0; - Node nnode; - NodeList list = node.getChildNodes(); + Node nnode; + NodeList list = node.getChildNodes(); - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_TYPE)) { + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_TYPE)) { - type = InputType.parse(nnode.getChildNodes().item(0) - .getNodeValue()); + type = InputType.parse(nnode.getChildNodes().item(0).getNodeValue()); - } else if (name.equals(KEY_PATH)) { + } + else if (name.equals(KEY_PATH)) { - path = nnode.getChildNodes().item(0).getNodeValue(); - path = path.substring(1, path.length() - 1); + path = nnode.getChildNodes().item(0).getNodeValue(); + path = path.substring(1, path.length() - 1); - } else if (name.equals(KEY_START)) { + } + else if (name.equals(KEY_START)) { - startPosition = Long.parseLong(nnode.getChildNodes().item(0) - .getNodeValue()); + startPosition = Long.parseLong(nnode.getChildNodes().item(0).getNodeValue()); - } - } + } + } - if (type == null || path == null) { - throw new IllegalArgumentException("Illegal Archive Description"); - } + if (type == null || path == null) { + throw new IllegalArgumentException("Illegal Archive Description"); + } - ArchiveDescription archive = new ArchiveDescription(type, path); - if (startPosition > 0) { - archive.setStartPosition(startPosition); - } + ArchiveDescription archive = new ArchiveDescription(type, path); + if (startPosition > 0) { + archive.setStartPosition(startPosition); + } - config.add(archive); - } + config.add(archive); + } - /** - * Parses the output parameter section. - * - * @param node Reference to the current used xml node - * @param config Reference to the ConfigSettings - */ - private void parseOutputConfig(final Node node, final ConfigSettings config) { + /** + * Parses the output parameter section. 
+ * + * @param node + * Reference to the current used xml node + * @param config + * Reference to the ConfigSettings + */ + private void parseOutputConfig(final Node node, final ConfigSettings config) + { - String name; - Long lValue; - Boolean bValue; - Node nnode; - NodeList list = node.getChildNodes(); + String name; + Long lValue; + Boolean bValue; + Node nnode; + NodeList list = node.getChildNodes(); - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_OUTPUT_MODE)) { + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_OUTPUT_MODE)) { - OutputType oValue = OutputType.parse(nnode.getChildNodes() - .item(0).getNodeValue()); - config.setConfigParameter(ConfigurationKeys.MODE_OUTPUT, oValue); + OutputType oValue = OutputType.parse(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.MODE_OUTPUT, oValue); - } else if (name.equals(KEY_PATH)) { + } + else if (name.equals(KEY_PATH)) { - String path = nnode.getChildNodes().item(0).getNodeValue(); - path = path.substring(1, path.length() - 1); + String path = nnode.getChildNodes().item(0).getNodeValue(); + path = path.substring(1, path.length() - 1); - config.setConfigParameter( - ConfigurationKeys.PATH_OUTPUT_SQL_FILES, path); + config.setConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES, path); - } else if (name.equals(KEY_OUTPUT_DATAFILE)) { - bValue = Boolean.parseBoolean(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.MODE_DATAFILE_OUTPUT, bValue); - } else if (name.equals(KEY_LIMIT_SQL_FILE_SIZE)) { + } + else if (name.equals(KEY_OUTPUT_DATAFILE)) { + bValue = Boolean.parseBoolean(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT, bValue); + } + else if (name.equals(KEY_LIMIT_SQL_FILE_SIZE)) { - lValue = Long.parseLong(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.LIMIT_SQL_FILE_SIZE, lValue); + lValue = Long.parseLong(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE, lValue); - } else if (name.equals(KEY_LIMIT_SQL_ARCHIVE_SIZE)) { + } + else if (name.equals(KEY_LIMIT_SQL_ARCHIVE_SIZE)) { - lValue = Long.parseLong(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE, lValue); + lValue = Long.parseLong(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE, lValue); - } else if (name.equals(KEY_MODE_ZIP_COMPRESSION_ENABLED)) { + } + else if (name.equals(KEY_MODE_ZIP_COMPRESSION_ENABLED)) { - bValue = Boolean.parseBoolean(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED, bValue); + bValue = Boolean.parseBoolean(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED, bValue); - } else if (name.equals(KEY_MODE_BINARY_OUTPUT_ENABLED)) { + } + else if (name.equals(KEY_MODE_BINARY_OUTPUT_ENABLED)) { - bValue = Boolean.parseBoolean(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.MODE_BINARY_OUTPUT_ENABLED, bValue); + bValue = 
Boolean.parseBoolean(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.MODE_BINARY_OUTPUT_ENABLED, bValue); - } else if (name.equals(SUBSECTION_SQL)) { + } + else if (name.equals(SUBSECTION_SQL)) { - parseSQLConfig(nnode, config); + parseSQLConfig(nnode, config); - } + } + } } - } - /** - * Parses the sql parameter section. - * - * @param node Reference to the current used xml node - * @param config Reference to the ConfigSettings - */ - private void parseSQLConfig(final Node node, final ConfigSettings config) { + /** + * Parses the sql parameter section. + * + * @param node + * Reference to the current used xml node + * @param config + * Reference to the ConfigSettings + */ + private void parseSQLConfig(final Node node, final ConfigSettings config) + { - String name, value; - Node nnode; - NodeList list = node.getChildNodes(); + String name, value; + Node nnode; + NodeList list = node.getChildNodes(); - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_HOST)) { + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_HOST)) { - value = nnode.getChildNodes().item(0).getNodeValue(); - config.setConfigParameter(ConfigurationKeys.SQL_HOST, value); + value = nnode.getChildNodes().item(0).getNodeValue(); + config.setConfigParameter(ConfigurationKeys.SQL_HOST, value); - } else if (name.equals(KEY_DATABASE)) { + } + else if (name.equals(KEY_DATABASE)) { - value = nnode.getChildNodes().item(0).getNodeValue(); - config.setConfigParameter(ConfigurationKeys.SQL_DATABASE, value); + value = nnode.getChildNodes().item(0).getNodeValue(); + config.setConfigParameter(ConfigurationKeys.SQL_DATABASE, value); - } else if (name.equals(KEY_USER)) { + } + else if (name.equals(KEY_USER)) { - value = nnode.getChildNodes().item(0).getNodeValue(); - config.setConfigParameter(ConfigurationKeys.SQL_USERNAME, value); + value = nnode.getChildNodes().item(0).getNodeValue(); + config.setConfigParameter(ConfigurationKeys.SQL_USERNAME, value); - } else if (name.equals(KEY_PASSWORD)) { + } + else if (name.equals(KEY_PASSWORD)) { - value = nnode.getChildNodes().item(0).getNodeValue(); - config.setConfigParameter(ConfigurationKeys.SQL_PASSWORD, value); - } + value = nnode.getChildNodes().item(0).getNodeValue(); + config.setConfigParameter(ConfigurationKeys.SQL_PASSWORD, value); + } + } } - } - /** - * Parses the cache parameter section. - * - * @param node Reference to the current used xml node - * @param config Reference to the ConfigSettings - */ - private void parseCacheConfig(final Node node, final ConfigSettings config) { + /** + * Parses the cache parameter section. 
+ * + * @param node + * Reference to the current used xml node + * @param config + * Reference to the ConfigSettings + */ + private void parseCacheConfig(final Node node, final ConfigSettings config) + { - String name; - Long lValue; - Node nnode; - NodeList list = node.getChildNodes(); + String name; + Long lValue; + Node nnode; + NodeList list = node.getChildNodes(); - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_LIMIT_TASK_SIZE_REVISIONS)) { + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_LIMIT_TASK_SIZE_REVISIONS)) { - lValue = Long.parseLong(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS, lValue); + lValue = Long.parseLong(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS, lValue); - } else if (name.equals(KEY_LIMIT_TASK_SIZE_DIFFS)) { + } + else if (name.equals(KEY_LIMIT_TASK_SIZE_DIFFS)) { - lValue = Long.parseLong(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS, lValue); + lValue = Long.parseLong(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS, lValue); - } else if (name.equals(KEY_LIMIT_SQLSERVER_MAX_ALLOWED_PACKET)) { + } + else if (name.equals(KEY_LIMIT_SQLSERVER_MAX_ALLOWED_PACKET)) { - lValue = Long.parseLong(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET, - lValue); + lValue = Long.parseLong(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET, + lValue); - } + } + } } - } - /** - * Parses the logging parameter section. - * - * @param node Reference to the current used xml node - * @param config Reference to the ConfigSettings - */ - private void parseLoggingConfig(final Node node, final ConfigSettings config) { + /** + * Parses the logging parameter section. 
+ * + * @param node + * Reference to the current used xml node + * @param config + * Reference to the ConfigSettings + */ + private void parseLoggingConfig(final Node node, final ConfigSettings config) + { - String name; - String value; - Node nnode; - NodeList list = node.getChildNodes(); + String name; + String value; + Node nnode; + NodeList list = node.getChildNodes(); - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_ROOT_FOLDER)) { + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_ROOT_FOLDER)) { - value = nnode.getChildNodes().item(0).getNodeValue(); - value = value.substring(1, value.length() - 1); + value = nnode.getChildNodes().item(0).getNodeValue(); + value = value.substring(1, value.length() - 1); - config.setConfigParameter( - ConfigurationKeys.LOGGING_PATH_DIFFTOOL, value); + config.setConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL, value); - } else if (name.equals(SUBSUBSECTION_DIFF_TOOL)) { + } + else if (name.equals(SUBSUBSECTION_DIFF_TOOL)) { - parseLoggerConfig(nnode, config, null, - ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL); + parseLoggerConfig(nnode, config, null, ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL); - } + } + } } - } - - /** - * Parses the information for a logger. - * - * @param node Reference to the current used xml node - * @param config Reference to the ConfigSettings - * @param logPath Key for the path of this logger. - * @param logLevel Key for the level of this logger. - */ - private void parseLoggerConfig(final Node node, - final ConfigSettings config, final ConfigurationKeys logPath, - final ConfigurationKeys logLevel) { - - String name, value; - Level level; - Node nnode; - NodeList list = node.getChildNodes(); - - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); - - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_LOG_PATH)) { - - value = nnode.getChildNodes().item(0).getNodeValue(); - value = value.substring(1, value.length() - 1); - config.setConfigParameter(logPath, value); - - } else if (name.equals(KEY_LOG_LEVEL)) { - - level = Level.valueOf(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter(logLevel, level); - } + + /** + * Parses the information for a logger. + * + * @param node + * Reference to the current used xml node + * @param config + * Reference to the ConfigSettings + * @param logPath + * Key for the path of this logger. + * @param logLevel + * Key for the level of this logger. + */ + private void parseLoggerConfig(final Node node, final ConfigSettings config, + final ConfigurationKeys logPath, final ConfigurationKeys logLevel) + { + + String name, value; + Level level; + Node nnode; + NodeList list = node.getChildNodes(); + + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); + + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_LOG_PATH)) { + + value = nnode.getChildNodes().item(0).getNodeValue(); + value = value.substring(1, value.length() - 1); + config.setConfigParameter(logPath, value); + + } + else if (name.equals(KEY_LOG_LEVEL)) { + + level = Level.valueOf(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(logLevel, level); + } + } } - } - /** - * Parses the debug parameter section. 
- * - * @param node Reference to the current used xml node - * @param config Reference to the ConfigSettings - */ - private void parseDebugConfig(final Node node, final ConfigSettings config) { + /** + * Parses the debug parameter section. + * + * @param node + * Reference to the current used xml node + * @param config + * Reference to the ConfigSettings + */ + private void parseDebugConfig(final Node node, final ConfigSettings config) + { - String name; - Boolean value; - Node nnode; - NodeList list = node.getChildNodes(); + String name; + Boolean value; + Node nnode; + NodeList list = node.getChildNodes(); - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_VERIFICATION_DIFF)) { + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_VERIFICATION_DIFF)) { - value = Boolean.parseBoolean(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter(ConfigurationKeys.VERIFICATION_DIFF, - value); + value = Boolean.parseBoolean(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.VERIFICATION_DIFF, value); - } else if (name.equals(KEY_VERIFICATION_ENCODING)) { + } + else if (name.equals(KEY_VERIFICATION_ENCODING)) { - value = Boolean.parseBoolean(nnode.getChildNodes().item(0) - .getNodeValue()); + value = Boolean.parseBoolean(nnode.getChildNodes().item(0).getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.VERIFICATION_ENCODING, value); + config.setConfigParameter(ConfigurationKeys.VERIFICATION_ENCODING, value); - } else if (name.equals(KEY_STATISTICAL_OUTPUT)) { + } + else if (name.equals(KEY_STATISTICAL_OUTPUT)) { - value = Boolean.parseBoolean(nnode.getChildNodes().item(0) - .getNodeValue()); - config.setConfigParameter( - ConfigurationKeys.MODE_STATISTICAL_OUTPUT, value); + value = Boolean.parseBoolean(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT, value); - } else if (name.equals(SUBSECTION_DEBUG_OUTPUT)) { + } + else if (name.equals(SUBSECTION_DEBUG_OUTPUT)) { - parseDebugOutputConfig(nnode, config); - } + parseDebugOutputConfig(nnode, config); + } + } } - } - - /** - * Parses the debug output parameter subsection. - * - * @param node Reference to the current used xml node - * @param config Reference to the ConfigSettings - */ - private void parseDebugOutputConfig(final Node node, - final ConfigSettings config) { - - String name, value; - Node nnode; - NodeList list = node.getChildNodes(); - - int length = list.getLength(); - for (int i = 0; i < length; i++) { - nnode = list.item(i); - - name = nnode.getNodeName().toUpperCase(); - if (name.equals(KEY_DEBUG_PATH)) { - - value = nnode.getChildNodes().item(0).getNodeValue(); - value = value.substring(1, value.length() - 1); - - config.setConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG, - value); - - } else if (name.equals(KEY_DEBUG_ENABLED)) { - Boolean enabled = Boolean.parseBoolean(nnode.getChildNodes() - .item(0).getNodeValue()); - config.setConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT, - enabled); - } + /** + * Parses the debug output parameter subsection. 
+ * + * @param node + * Reference to the current used xml node + * @param config + * Reference to the ConfigSettings + */ + private void parseDebugOutputConfig(final Node node, final ConfigSettings config) + { + + String name, value; + Node nnode; + NodeList list = node.getChildNodes(); + + int length = list.getLength(); + for (int i = 0; i < length; i++) { + nnode = list.item(i); + + name = nnode.getNodeName().toUpperCase(); + if (name.equals(KEY_DEBUG_PATH)) { + + value = nnode.getChildNodes().item(0).getNodeValue(); + value = value.substring(1, value.length() - 1); + + config.setConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG, value); + + } + else if (name.equals(KEY_DEBUG_ENABLED)) { + + Boolean enabled = Boolean + .parseBoolean(nnode.getChildNodes().item(0).getNodeValue()); + config.setConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT, enabled); + } + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/OutputTypes.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/OutputTypes.java index 516e90d4..ca09eab0 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/OutputTypes.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/OutputTypes.java @@ -20,21 +20,22 @@ /** * This class represents the enumeration of OutputTypes of the IndexGenerator. */ -public enum OutputTypes { +public enum OutputTypes +{ - /** - * Output to the Database - */ - DATABASE, + /** + * Output to the Database + */ + DATABASE, - /** - * Output as single sql file. - */ - SQL, + /** + * Output as single sql file. + */ + SQL, - /** - * Output as datafile. - */ - DATAFILE + /** + * Output as datafile. + */ + DATAFILE } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigGUI.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigGUI.java index 0a4174d6..8f0fe459 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigGUI.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigGUI.java @@ -26,65 +26,64 @@ import org.dkpro.jwpl.revisionmachine.difftool.config.gui.panels.ConfigPanel; /** - * This class represents the main class of the graphical configuration tool - * for the DiffTool. - *
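[Editorial aside - not part of the patch] The parse...Config methods reformatted in the hunk above all follow the same org.w3c.dom idiom: iterate over a node's children, match the element name case-insensitively, and read the text value of the element's first child. The sketch below illustrates that idiom in isolation; the sample XML and element names are hypothetical stand-ins, not the actual DiffTool configuration schema.

import java.io.StringReader;

import javax.xml.parsers.DocumentBuilderFactory;

import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;

public class DomConfigSketch
{
    public static void main(String[] args) throws Exception
    {
        // Hypothetical sample input; the path is quoted, mirroring the
        // substring(1, length - 1) calls in the parser above.
        String xml = "<output><PATH>\"/tmp/sql\"</PATH>"
                + "<LIMIT_SQL_FILE_SIZE>100</LIMIT_SQL_FILE_SIZE></output>";

        Node root = DocumentBuilderFactory.newInstance().newDocumentBuilder()
                .parse(new InputSource(new StringReader(xml))).getDocumentElement();

        NodeList list = root.getChildNodes();
        for (int i = 0; i < list.getLength(); i++) {
            Node nnode = list.item(i);
            String name = nnode.getNodeName().toUpperCase();
            if (name.equals("PATH")) {
                // strip the surrounding quotes, as parseOutputConfig does for its path value
                String value = nnode.getChildNodes().item(0).getNodeValue();
                System.out.println("path  = " + value.substring(1, value.length() - 1));
            }
            else if (name.equals("LIMIT_SQL_FILE_SIZE")) {
                long limit = Long.parseLong(nnode.getChildNodes().item(0).getNodeValue());
                System.out.println("limit = " + limit);
            }
        }
    }
}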
+ * This class represents the main class of the graphical configuration tool for the DiffTool.
*
- * The GUI can be used to set all necessary configuration parameters for the
- * DiffTool. However, it currently does not verify the validity of the
- * combination of the settings.
- * It only checks whether the individual setting contain valid values.
- * Consequently, it is possible to produce configurations that won't
- * work.
+ * The GUI can be used to set all necessary configuration parameters for the DiffTool. However, it
+ * currently does not verify the validity of the combination of the settings. It only checks whether
+ * the individual settings contain valid values. Consequently, it is possible to produce
+ * configurations that won't work.
+ *
*
* Example:
- * If the output mode is set to bzip2, it is currently not possible - * to split the output into several files. However, the ConfigGUI allows for - * this setting. + * If the output mode is set to bzip2, it is currently not possible to split the output into + * several files. However, the ConfigGUI allows for this setting. */ public class ConfigGUI - extends JFrame { + extends JFrame +{ - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 1L; - /** - * Reference to the ConfigController - */ - private final ConfigController controller; + /** + * Reference to the ConfigController + */ + private final ConfigController controller; - /** - * (Constructor) Creates a new ConfigGUI object. - */ - public ConfigGUI() { + /** + * (Constructor) Creates a new ConfigGUI object. + */ + public ConfigGUI() + { - this.controller = new ConfigController(); + this.controller = new ConfigController(); - this.setTitle("RevisionMachine DiffTool - Configuration"); + this.setTitle("RevisionMachine DiffTool - Configuration"); - setSize(600, 400); - setResizable(false); - setDefaultCloseOperation(EXIT_ON_CLOSE); + setSize(600, 400); + setResizable(false); + setDefaultCloseOperation(EXIT_ON_CLOSE); - Dimension d = Toolkit.getDefaultToolkit().getScreenSize(); - setLocation((d.width - getSize().width) / 2, - (d.height - getSize().height) / 2); + Dimension d = Toolkit.getDefaultToolkit().getScreenSize(); + setLocation((d.width - getSize().width) / 2, (d.height - getSize().height) / 2); - this.setJMenuBar(new ConfigMenuBar(controller)); - this.setContentPane(new ConfigPanel(controller)); + this.setJMenuBar(new ConfigMenuBar(controller)); + this.setContentPane(new ConfigPanel(controller)); - //load default parameters - this.controller.defaultConfiguration(); - } + // load default parameters + this.controller.defaultConfiguration(); + } - /** - * ConfigurationTool - Main Method - *

- * Starts the ConfigurationTool GUI
- *
- * @param args program arguments (not used)
- */
- public static void main(final String[] args) {
- new ConfigGUI().setVisible(true);
- }
+ /**
+ * ConfigurationTool - Main Method
+ *
+ * Starts the ConfigurationTool GUI + * + * @param args + * program arguments (not used) + */ + public static void main(final String[] args) + { + new ConfigGUI().setVisible(true); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigMenuBar.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigMenuBar.java index 6ca80e62..5c318406 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigMenuBar.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/ConfigMenuBar.java @@ -28,57 +28,60 @@ */ @SuppressWarnings("serial") public class ConfigMenuBar - extends JMenuBar { + extends JMenuBar +{ - /** - * Reference to the controller - */ - private final ConfigController controller; + /** + * Reference to the controller + */ + private final ConfigController controller; - /** - * (Constructor) Create the ConfigMenuBar object. - * - * @param controller reference to the controller - */ - public ConfigMenuBar(final ConfigController controller) { + /** + * (Constructor) Create the ConfigMenuBar object. + * + * @param controller + * reference to the controller + */ + public ConfigMenuBar(final ConfigController controller) + { - this.controller = controller; + this.controller = controller; - createSystemMenu(); - } + createSystemMenu(); + } - /** - * Creates the System menu and its menu items. - */ - private void createSystemMenu() { + /** + * Creates the System menu and its menu items. + */ + private void createSystemMenu() + { - JMenu system = new JMenu("System"); + JMenu system = new JMenu("System"); - JMenuItem importConfig = new JMenuItem("Import Configuration"); - importConfig.addActionListener(e -> controller.loadConfiguration()); + JMenuItem importConfig = new JMenuItem("Import Configuration"); + importConfig.addActionListener(e -> controller.loadConfiguration()); - system.add(importConfig); + system.add(importConfig); - JMenuItem exportConfig = new JMenuItem("Export Configuration"); - exportConfig.addActionListener(e -> controller.saveConfiguration()); + JMenuItem exportConfig = new JMenuItem("Export Configuration"); + exportConfig.addActionListener(e -> controller.saveConfiguration()); - system.add(exportConfig); + system.add(exportConfig); - system.addSeparator(); + system.addSeparator(); - JMenuItem defaultConfig = new JMenuItem( - "Reset to default parameters"); - defaultConfig.addActionListener(e -> controller.defaultConfiguration()); + JMenuItem defaultConfig = new JMenuItem("Reset to default parameters"); + defaultConfig.addActionListener(e -> controller.defaultConfiguration()); - system.add(defaultConfig); + system.add(defaultConfig); - system.addSeparator(); + system.addSeparator(); - JMenuItem systemClose = new JMenuItem("Close"); - systemClose.addActionListener(e -> System.exit(-1)); + JMenuItem systemClose = new JMenuItem("Close"); + systemClose.addActionListener(e -> System.exit(-1)); - system.add(systemClose); + system.add(systemClose); - this.add(system); - } + this.add(system); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ArchiveRegistry.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ArchiveRegistry.java index 28e72549..2ff6df70 100644 --- 
a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ArchiveRegistry.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ArchiveRegistry.java @@ -28,128 +28,145 @@ /** * ArchiveRegistry of the ConfigurationTool. *

- * Contains all input archives and represents the table model to display the - * archives in the InputPanel. + * Contains all input archives and represents the table model to display the archives in the + * InputPanel. */ @SuppressWarnings("serial") public class ArchiveRegistry - extends AbstractTableModel { - - /** - * List of input archives - */ - private final List archives; - - /** - * Name of columns - */ - private final String[] columnNames; - - /** - * (Constructor) Creates a new ArchiveRegistry. - */ - public ArchiveRegistry() { - this.columnNames = new String[]{"Input Type", "Start Position", - "Path"}; - this.archives = new ArrayList<>(); - } - - /** - * Returns the name of the specified column. - * - * @param col index of the column - * @return name of the column - */ - @Override - public String getColumnName(final int col) { - return this.columnNames[col]; - } - - /** - * Returns the number of columns. - */ - @Override - public int getColumnCount() { - return this.columnNames.length; - } - - /** - * Returns the number of rows. - */ - @Override - public int getRowCount() { - return this.archives.size(); - } - - /** - * Returns the value at the specified position. - * - * @param row index of the row - * @param col index of the column - * @return string representation of the specified field - */ - @Override - public Object getValueAt(final int row, final int col) { - - switch (col) { - case 0: - return archives.get(row).getType(); - case 1: - return archives.get(row).getStartPosition(); - case 2: - return archives.get(row).getPath(); + extends AbstractTableModel +{ + + /** + * List of input archives + */ + private final List archives; + + /** + * Name of columns + */ + private final String[] columnNames; + + /** + * (Constructor) Creates a new ArchiveRegistry. + */ + public ArchiveRegistry() + { + this.columnNames = new String[] { "Input Type", "Start Position", "Path" }; + this.archives = new ArrayList<>(); } - return "---"; - } - - /** - * Adds an archive description. - * - * @param description archive description - */ - public void addArchive(final ArchiveDescription description) { - this.archives.add(description); - } - - /** - * Removes an archive description. - * - * @param index index of the archive. - */ - public void removeArchive(final int index) { - this.archives.remove(index); - } - - /** - * Returns the archive at the specified position. - * - * @param index position - * @return ArchiveDescription - */ - public ArchiveDescription get(final int index) { - return this.archives.get(index); - } - - /** - * Deletes all contained archive descriptions. - */ - public void clear() { - this.archives.clear(); - } - - /** - * Adds the ArchiveDescriptions contained in the configuration. - * - * @param config Reference to the configuration - */ - public void applyConfiguration(final ConfigSettings config) { - - clear(); - - Iterator aIt = config.archiveIterator(); - while (aIt.hasNext()) { - addArchive(aIt.next()); + /** + * Returns the name of the specified column. + * + * @param col + * index of the column + * @return name of the column + */ + @Override + public String getColumnName(final int col) + { + return this.columnNames[col]; + } + + /** + * Returns the number of columns. + */ + @Override + public int getColumnCount() + { + return this.columnNames.length; + } + + /** + * Returns the number of rows. + */ + @Override + public int getRowCount() + { + return this.archives.size(); + } + + /** + * Returns the value at the specified position. 
+ * + * @param row + * index of the row + * @param col + * index of the column + * @return string representation of the specified field + */ + @Override + public Object getValueAt(final int row, final int col) + { + + switch (col) { + case 0: + return archives.get(row).getType(); + case 1: + return archives.get(row).getStartPosition(); + case 2: + return archives.get(row).getPath(); + } + + return "---"; + } + + /** + * Adds an archive description. + * + * @param description + * archive description + */ + public void addArchive(final ArchiveDescription description) + { + this.archives.add(description); + } + + /** + * Removes an archive description. + * + * @param index + * index of the archive. + */ + public void removeArchive(final int index) + { + this.archives.remove(index); + } + + /** + * Returns the archive at the specified position. + * + * @param index + * position + * @return ArchiveDescription + */ + public ArchiveDescription get(final int index) + { + return this.archives.get(index); + } + + /** + * Deletes all contained archive descriptions. + */ + public void clear() + { + this.archives.clear(); + } + + /** + * Adds the ArchiveDescriptions contained in the configuration. + * + * @param config + * Reference to the configuration + */ + public void applyConfiguration(final ConfigSettings config) + { + + clear(); + + Iterator aIt = config.archiveIterator(); + while (aIt.hasNext()) { + addArchive(aIt.next()); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ComponentRegistry.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ComponentRegistry.java index ad8441d6..62a460ea 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ComponentRegistry.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ComponentRegistry.java @@ -27,100 +27,112 @@ /** * ComponentsRegistry of the ConfigurationTool */ -public class ComponentRegistry { +public class ComponentRegistry +{ - /** - * Reference to the GUI - */ - private ConfigGUI gui; + /** + * Reference to the GUI + */ + private ConfigGUI gui; - /** - * Map that contains references to the important panels - */ - private final Map map; + /** + * Map that contains references to the important panels + */ + private final Map map; - /** - * (Constructor) Creates a ComponentRegistry. - */ - public ComponentRegistry() { - this.map = new HashMap<>(); - } + /** + * (Constructor) Creates a ComponentRegistry. + */ + public ComponentRegistry() + { + this.map = new HashMap<>(); + } - /** - * Registers the panel with the given key. - * - * @param key key - * @param panel panel - */ - public void register(final PanelKeys key, final AbstractPanel panel) { - this.map.put(key, panel); - } + /** + * Registers the panel with the given key. + * + * @param key + * key + * @param panel + * panel + */ + public void register(final PanelKeys key, final AbstractPanel panel) + { + this.map.put(key, panel); + } - /** - * Sets the reference of the GUI. - * - * @param gui GUI - */ - public void registerGUI(final ConfigGUI gui) { - this.gui = gui; - } + /** + * Sets the reference of the GUI. + * + * @param gui + * GUI + */ + public void registerGUI(final ConfigGUI gui) + { + this.gui = gui; + } - /** - * Adds the xml description of the panels content to the StringBuilder. 
- * Errors which occur during the xml transformation will be added to the - * ConfigVerification. - * - * @param builder Reference to a StringBuilder object - * @param errors Reference to the ConfigVerification object - */ - public void toXML(final StringBuilder builder, - final ConfigVerification errors) { + /** + * Adds the xml description of the panels content to the StringBuilder. Errors which occur + * during the xml transformation will be added to the ConfigVerification. + * + * @param builder + * Reference to a StringBuilder object + * @param errors + * Reference to the ConfigVerification object + */ + public void toXML(final StringBuilder builder, final ConfigVerification errors) + { - map.get(PanelKeys.PANEL_VALUES).toXML(builder, errors); - map.get(PanelKeys.PANEL_EXTERNALS).toXML(builder, errors); - map.get(PanelKeys.PANEL_INPUT).toXML(builder, errors); - map.get(PanelKeys.PANEL_OUTPUT).toXML(builder, errors); - map.get(PanelKeys.PANEL_SQL).toXML(builder, errors); - map.get(PanelKeys.PANEL_CACHE).toXML(builder, errors); - map.get(PanelKeys.PANEL_LOGGING).toXML(builder, errors); - map.get(PanelKeys.PANEL_DEBUG).toXML(builder, errors); - map.get(PanelKeys.PANEL_FILTER).toXML(builder, errors); - } + map.get(PanelKeys.PANEL_VALUES).toXML(builder, errors); + map.get(PanelKeys.PANEL_EXTERNALS).toXML(builder, errors); + map.get(PanelKeys.PANEL_INPUT).toXML(builder, errors); + map.get(PanelKeys.PANEL_OUTPUT).toXML(builder, errors); + map.get(PanelKeys.PANEL_SQL).toXML(builder, errors); + map.get(PanelKeys.PANEL_CACHE).toXML(builder, errors); + map.get(PanelKeys.PANEL_LOGGING).toXML(builder, errors); + map.get(PanelKeys.PANEL_DEBUG).toXML(builder, errors); + map.get(PanelKeys.PANEL_FILTER).toXML(builder, errors); + } - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. - * - * @param config Reference to the ConfigSettings object - */ - public void applyConfig(final ConfigSettings config) { + /** + * Reads the configuration parameters described in the panel from the ConfigSettings and and + * sets the contained values. + * + * @param config + * Reference to the ConfigSettings object + */ + public void applyConfig(final ConfigSettings config) + { - map.get(PanelKeys.PANEL_VALUES).applyConfig(config); - map.get(PanelKeys.PANEL_EXTERNALS).applyConfig(config); - map.get(PanelKeys.PANEL_INPUT).applyConfig(config); - map.get(PanelKeys.PANEL_OUTPUT).applyConfig(config); - map.get(PanelKeys.PANEL_SQL).applyConfig(config); - map.get(PanelKeys.PANEL_CACHE).applyConfig(config); - map.get(PanelKeys.PANEL_LOGGING).applyConfig(config); - map.get(PanelKeys.PANEL_DEBUG).applyConfig(config); - map.get(PanelKeys.PANEL_FILTER).applyConfig(config); - } + map.get(PanelKeys.PANEL_VALUES).applyConfig(config); + map.get(PanelKeys.PANEL_EXTERNALS).applyConfig(config); + map.get(PanelKeys.PANEL_INPUT).applyConfig(config); + map.get(PanelKeys.PANEL_OUTPUT).applyConfig(config); + map.get(PanelKeys.PANEL_SQL).applyConfig(config); + map.get(PanelKeys.PANEL_CACHE).applyConfig(config); + map.get(PanelKeys.PANEL_LOGGING).applyConfig(config); + map.get(PanelKeys.PANEL_DEBUG).applyConfig(config); + map.get(PanelKeys.PANEL_FILTER).applyConfig(config); + } - /** - * Returns the reference of the GUI. - * - * @return reference to the GUI - */ - public ConfigGUI getGUI() { - return this.gui; - } + /** + * Returns the reference of the GUI. 
+ * + * @return reference to the GUI + */ + public ConfigGUI getGUI() + { + return this.gui; + } - /** - * Repaints the GUI. - */ - public void repaint() { - if (gui != null) { - gui.repaint(); + /** + * Repaints the GUI. + */ + public void repaint() + { + if (gui != null) { + gui.repaint(); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigController.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigController.java index 3a5a2747..c7a29d48 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigController.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigController.java @@ -33,550 +33,601 @@ /** * Controller of the ConfigurationTool */ -public class ConfigController { - - /** - * Reference to the ArchiveRegistry - */ - private final ArchiveRegistry archives; - - /** - * Reference to the ComponentRegistry - */ - private ComponentRegistry components; - - /** - * Reference to the configuration - */ - private final ConfigSettings config; - - /** - * Configuration settings - Flag that indicates whether the 7Zip support is - * enabled or not - */ - private boolean enable7Zip; - - /** - * Configuration settings - Flag that indicates whether debug output is - * enabled - */ - private boolean enableDebugOutput; - - /** - * Configuration settings - Flag that indicates whether diff verification is - * enabled - */ - private boolean enableDiffVerification; - - /** - * Configuration settings - Flag that indicates whether encoding - * verification is enabled - */ - private boolean enableEncodingVerification; - - /** - * Configuration settings - Flag that indicates whether the database output - * mode is enabled - */ - private boolean enableSQLDatabaseOutput; - - /** - * Configuration settings - Flag that indicates whether output should - * be a datafile instead of an sql dump - */ - private boolean enableDataFileOutput; - - /** - * Configuration settings - Flag that indicates whether statistical output - * is enabled - */ - private boolean enableStatsOutput; - - /** - * Configuration settings - Flag that indicates whether statistical output - * is enabled - */ - private boolean enableZipCompression; - - /** - * Reference to the ConfigVerification - */ - private ConfigVerification errors; - - /** - * Configuration settings - Flag that indicates whether multiple output - * files are allowed - */ - private boolean multipleOutputFiles; - - /** - * Configuration settings - Output compression mode - */ - private OutputCompressionEnum outputCompression; - - /** - * Configuration settings - Output file limit - */ - private long outputFileLimit; - - /** - * Configuration settings - Surrogate Mode - */ - private SurrogateModes surrogates; - - /** - * XML Representation of the content - */ - private StringBuilder xmlConfig; - - /** - * (Constructor) Creates a new ConfigController. 
- */ - public ConfigController() { - - this.components = new ComponentRegistry(); - this.archives = new ArchiveRegistry(); - - this.config = new ConfigSettings(); - - this.enable7Zip = false; - - this.outputFileLimit = -1; - this.multipleOutputFiles = false; - this.outputCompression = OutputCompressionEnum.None; - - this.enableZipCompression = true; - this.enableDebugOutput = false; - this.enableSQLDatabaseOutput = false; - - this.surrogates = SurrogateModes.DISCARD_REVISION; - - } - - /** - * Adds an archive to the archive registry. - * - * @param archive reference to the archive - */ - public void addArchive(final ArchiveDescription archive) { - this.archives.addArchive(archive); - } - - /** - * Applies the configuration file. - *

- * The input settings will be ignored if a default configuration was used. - */ - private void applyConfig() { - this.components.applyConfig(config); - - switch (config.getConfigType()) { - case DEFAULT: - break; - case IMPORT: - this.archives.applyConfiguration(config); - } - - repaint(); - } - - /** - * Creates the xml content representation of the currently used settings. - * - * @return TRUE if the ConfigVerfication contains no items, FALSE otherwise - */ - public boolean createConfigurationXML() { - - errors = new ConfigVerification(); - xmlConfig = new StringBuilder(); - - xmlConfig.append("\r\n"); - components.toXML(xmlConfig, errors); - xmlConfig.append("\r\n"); - - if (errors.getRowCount() != 0) { - - // TODO: invoke the dialog at another place - new ConfigDialog(this).setVisible(true); - - return false; - } - - return true; - } - - /** - * Applies the default parameter to the currently loaded config - */ - public void defaultConfiguration() { - config.defaultConfiguration(); - applyConfig(); - } - - /** - * Returns the reference to the ArchiveRegistry. - * - * @return archive registry - */ - public ArchiveRegistry getArchives() { - return archives; - } - - /** - * Return the reference to the ConfigVerifactions. - * - * @return ConfigVerification - */ - public ConfigVerification getConfigErrors() { - return errors; - } - - /** - * Returns the output compression mode. - * - * @return output compression mode - */ - public OutputCompressionEnum getOutputCompression() { - return outputCompression; - } - - /** - * Returns the maximum size of an output file. - * - * @return maximum size of an output file. - */ - public long getOutputFileLimit() { - return outputFileLimit; - } - - /** - * Returns the reference to the component registry. - * - * @return component registry - */ - public ComponentRegistry getRegistry() { - return components; - } - - /** - * Returns the surrogate mode. - * - * @return surrogate mode - */ - public SurrogateModes getSurrogates() { - return surrogates; - } - - /** - * Returns whether the 7Zip support is enabled or not. - * - * @return TRUE | FALSE - */ - public boolean is7ZipEnabled() { - return enable7Zip; - } - - /** - * Returns whether the debug output is enabled. - * - * @return debug output flag - */ - public boolean isDebugOutputEnabled() { - return enableDebugOutput; - } - - /** - * Returns whether the diff verification mode is enabled. - * - * @return diff verification flag - */ - public boolean isDiffVerificationEnabled() { - return enableDiffVerification; - } - - /** - * Returns whether the database output mode is enabled. - * - * @return database output flag - */ - public boolean isEnableSQLDatabaseOutput() { - return enableSQLDatabaseOutput; - } - - /** - * Returns whether the encoding verification mode is enabled. - * - * @return encoding verification flag - */ - public boolean isEncodingVerificationEnabled() { - return enableEncodingVerification; - } - - /** - * Returns whether multiple output files should be used. - * - * @return multiple output files flag - */ - public boolean isMultipleOutputFiles() { - return multipleOutputFiles; - } - - /** - * Returns whether the statistical output mode is enabled. - * - * @return statistical output flag - */ - public boolean isStatsOutputEnabled() { - return enableStatsOutput; - } - - /** - * Returns whether the Zip-Compression is enabled or not. 
- * - * @return Zip-Compression flag - */ - public boolean isZipCompressionEnabled() { - return enableZipCompression; - } - - /** - * Loads the configuration from the specified file - * - * @param path input file - */ - public void loadConfig(final String path) { - config.loadConfig(path); - applyConfig(); - } - - /** - * Loads the configuration file. The path of the file will be chosen by - * displaying a FileChooser Dialog. - */ - public void loadConfiguration() { - - XMLFileChooser fc = new XMLFileChooser(); - if (fc.showOpenDialog(new JPanel()) == XMLFileChooser.APPROVE_OPTION) { - this.loadConfig(fc.getSelectedFile().getPath()); - } - } - - /** - * Registers the panel with the given key. - * - * @param key key - * @param panel panel - */ - public void register(final PanelKeys key, final AbstractPanel panel) { - this.components.register(key, panel); - } - - /** - * Removes the specified archive from the archive registry. - * - * @param index index of the archive - */ - public void removeArchive(final int index) { - this.archives.removeArchive(index); - } - - /** - * Repaints the GUI. - */ - public void repaint() { - this.components.repaint(); - } - - /** - * Saves the configuration file. The path of the file will be chosen by - * displaying a FileChooser Dialog. - */ - public void saveConfiguration() { - - if (this.createConfigurationXML()) { - - XMLFileChooser fc = new XMLFileChooser(); - if (fc.showSaveDialog(new JPanel()) == XMLFileChooser.APPROVE_OPTION) { - - String path = fc.getSelectedFile().getPath(); - if (path.indexOf('.') == -1) { - path += ".xml"; +public class ConfigController +{ + + /** + * Reference to the ArchiveRegistry + */ + private final ArchiveRegistry archives; + + /** + * Reference to the ComponentRegistry + */ + private ComponentRegistry components; + + /** + * Reference to the configuration + */ + private final ConfigSettings config; + + /** + * Configuration settings - Flag that indicates whether the 7Zip support is enabled or not + */ + private boolean enable7Zip; + + /** + * Configuration settings - Flag that indicates whether debug output is enabled + */ + private boolean enableDebugOutput; + + /** + * Configuration settings - Flag that indicates whether diff verification is enabled + */ + private boolean enableDiffVerification; + + /** + * Configuration settings - Flag that indicates whether encoding verification is enabled + */ + private boolean enableEncodingVerification; + + /** + * Configuration settings - Flag that indicates whether the database output mode is enabled + */ + private boolean enableSQLDatabaseOutput; + + /** + * Configuration settings - Flag that indicates whether output should be a datafile instead of + * an sql dump + */ + private boolean enableDataFileOutput; + + /** + * Configuration settings - Flag that indicates whether statistical output is enabled + */ + private boolean enableStatsOutput; + + /** + * Configuration settings - Flag that indicates whether statistical output is enabled + */ + private boolean enableZipCompression; + + /** + * Reference to the ConfigVerification + */ + private ConfigVerification errors; + + /** + * Configuration settings - Flag that indicates whether multiple output files are allowed + */ + private boolean multipleOutputFiles; + + /** + * Configuration settings - Output compression mode + */ + private OutputCompressionEnum outputCompression; + + /** + * Configuration settings - Output file limit + */ + private long outputFileLimit; + + /** + * Configuration settings - Surrogate Mode + */ + private 
SurrogateModes surrogates; + + /** + * XML Representation of the content + */ + private StringBuilder xmlConfig; + + /** + * (Constructor) Creates a new ConfigController. + */ + public ConfigController() + { + + this.components = new ComponentRegistry(); + this.archives = new ArchiveRegistry(); + + this.config = new ConfigSettings(); + + this.enable7Zip = false; + + this.outputFileLimit = -1; + this.multipleOutputFiles = false; + this.outputCompression = OutputCompressionEnum.None; + + this.enableZipCompression = true; + this.enableDebugOutput = false; + this.enableSQLDatabaseOutput = false; + + this.surrogates = SurrogateModes.DISCARD_REVISION; + + } + + /** + * Adds an archive to the archive registry. + * + * @param archive + * reference to the archive + */ + public void addArchive(final ArchiveDescription archive) + { + this.archives.addArchive(archive); + } + + /** + * Applies the configuration file. + *

+ * The input settings will be ignored if a default configuration was used. + */ + private void applyConfig() + { + this.components.applyConfig(config); + + switch (config.getConfigType()) { + case DEFAULT: + break; + case IMPORT: + this.archives.applyConfiguration(config); + } + + repaint(); + } + + /** + * Creates the xml content representation of the currently used settings. + * + * @return TRUE if the ConfigVerfication contains no items, FALSE otherwise + */ + public boolean createConfigurationXML() + { + + errors = new ConfigVerification(); + xmlConfig = new StringBuilder(); + + xmlConfig.append("\r\n"); + components.toXML(xmlConfig, errors); + xmlConfig.append("\r\n"); + + if (errors.getRowCount() != 0) { + + // TODO: invoke the dialog at another place + new ConfigDialog(this).setVisible(true); + + return false; + } + + return true; + } + + /** + * Applies the default parameter to the currently loaded config + */ + public void defaultConfiguration() + { + config.defaultConfiguration(); + applyConfig(); + } + + /** + * Returns the reference to the ArchiveRegistry. + * + * @return archive registry + */ + public ArchiveRegistry getArchives() + { + return archives; + } + + /** + * Return the reference to the ConfigVerifactions. + * + * @return ConfigVerification + */ + public ConfigVerification getConfigErrors() + { + return errors; + } + + /** + * Returns the output compression mode. + * + * @return output compression mode + */ + public OutputCompressionEnum getOutputCompression() + { + return outputCompression; + } + + /** + * Returns the maximum size of an output file. + * + * @return maximum size of an output file. + */ + public long getOutputFileLimit() + { + return outputFileLimit; + } + + /** + * Returns the reference to the component registry. + * + * @return component registry + */ + public ComponentRegistry getRegistry() + { + return components; + } + + /** + * Returns the surrogate mode. + * + * @return surrogate mode + */ + public SurrogateModes getSurrogates() + { + return surrogates; + } + + /** + * Returns whether the 7Zip support is enabled or not. + * + * @return TRUE | FALSE + */ + public boolean is7ZipEnabled() + { + return enable7Zip; + } + + /** + * Returns whether the debug output is enabled. + * + * @return debug output flag + */ + public boolean isDebugOutputEnabled() + { + return enableDebugOutput; + } + + /** + * Returns whether the diff verification mode is enabled. + * + * @return diff verification flag + */ + public boolean isDiffVerificationEnabled() + { + return enableDiffVerification; + } + + /** + * Returns whether the database output mode is enabled. + * + * @return database output flag + */ + public boolean isEnableSQLDatabaseOutput() + { + return enableSQLDatabaseOutput; + } + + /** + * Returns whether the encoding verification mode is enabled. + * + * @return encoding verification flag + */ + public boolean isEncodingVerificationEnabled() + { + return enableEncodingVerification; + } + + /** + * Returns whether multiple output files should be used. + * + * @return multiple output files flag + */ + public boolean isMultipleOutputFiles() + { + return multipleOutputFiles; + } + + /** + * Returns whether the statistical output mode is enabled. + * + * @return statistical output flag + */ + public boolean isStatsOutputEnabled() + { + return enableStatsOutput; + } + + /** + * Returns whether the Zip-Compression is enabled or not. 
+ * + * @return Zip-Compression flag + */ + public boolean isZipCompressionEnabled() + { + return enableZipCompression; + } + + /** + * Loads the configuration from the specified file + * + * @param path + * input file + */ + public void loadConfig(final String path) + { + config.loadConfig(path); + applyConfig(); + } + + /** + * Loads the configuration file. The path of the file will be chosen by displaying a FileChooser + * Dialog. + */ + public void loadConfiguration() + { + + XMLFileChooser fc = new XMLFileChooser(); + if (fc.showOpenDialog(new JPanel()) == XMLFileChooser.APPROVE_OPTION) { + this.loadConfig(fc.getSelectedFile().getPath()); } + } + + /** + * Registers the panel with the given key. + * + * @param key + * key + * @param panel + * panel + */ + public void register(final PanelKeys key, final AbstractPanel panel) + { + this.components.register(key, panel); + } + + /** + * Removes the specified archive from the archive registry. + * + * @param index + * index of the archive + */ + public void removeArchive(final int index) + { + this.archives.removeArchive(index); + } + + /** + * Repaints the GUI. + */ + public void repaint() + { + this.components.repaint(); + } + + /** + * Saves the configuration file. The path of the file will be chosen by displaying a FileChooser + * Dialog. + */ + public void saveConfiguration() + { + + if (this.createConfigurationXML()) { - if (this.saveConfiguration(path)) { - System.out.println("SAVE CONFIG SUCCESSFULL"); - } else { + XMLFileChooser fc = new XMLFileChooser(); + if (fc.showSaveDialog(new JPanel()) == XMLFileChooser.APPROVE_OPTION) { - System.out.println("SAVE CONFIG FAILED"); + String path = fc.getSelectedFile().getPath(); + if (path.indexOf('.') == -1) { + path += ".xml"; + } + + if (this.saveConfiguration(path)) { + System.out.println("SAVE CONFIG SUCCESSFULL"); + } + else { + + System.out.println("SAVE CONFIG FAILED"); + } + } + + } + } + + /** + * Save the configuration to a file. + * + * @param path + * output path + * @return TRUE if the configuration was succesfully exported FALSE otherwise + */ + public boolean saveConfiguration(final String path) + { + + if (xmlConfig != null && !errors.hasFailed()) { + + boolean success = true; + + FileWriter writer = null; + try { + writer = new FileWriter(path); + writer.write(xmlConfig.toString()); + writer.flush(); + + } + catch (IOException ioe) { + ioe.printStackTrace(); + success = false; + } + finally { + if (writer != null) { + try { + writer.close(); + } + catch (IOException ioe) { + success = false; + } + } + } + + return success; } - } - - } - } - - /** - * Save the configuration to a file. - * - * @param path output path - * @return TRUE if the configuration was succesfully exported FALSE - * otherwise - */ - public boolean saveConfiguration(final String path) { - - if (xmlConfig != null && !errors.hasFailed()) { - - boolean success = true; - - FileWriter writer = null; - try { - writer = new FileWriter(path); - writer.write(xmlConfig.toString()); - writer.flush(); - - } catch (IOException ioe) { - ioe.printStackTrace(); - success = false; - } finally { - if (writer != null) { - try { - writer.close(); - } catch (IOException ioe) { - success = false; - } + + return false; + } + + /** + * Enables or disables the 7Zip support. + *

+ * If the support is disabled the and the OutputCompression Mode was 7Zip the Mode will be + * reseted to None. + * + * @param enable7Zip + * 7Zip support flag + */ + public void setEnable7Zip(final boolean enable7Zip) + { + this.enable7Zip = enable7Zip; + if (!this.enable7Zip) { + if (outputCompression == OutputCompressionEnum.SevenZip) { + outputCompression = OutputCompressionEnum.None; + } } - } - - return success; - } - - return false; - } - - - /** - * Enables or disables the 7Zip support. - *

- * If the support is disabled the and the OutputCompression Mode was 7Zip - * the Mode will be reseted to None. - * - * @param enable7Zip 7Zip support flag - */ - public void setEnable7Zip(final boolean enable7Zip) { - this.enable7Zip = enable7Zip; - if (!this.enable7Zip) { - if (outputCompression == OutputCompressionEnum.SevenZip) { - outputCompression = OutputCompressionEnum.None; - } - } - } - - /** - * Sets the debug output mode. - * - * @param enableDebugOutput debug output flag - */ - public void setEnableDebugOutput(final boolean enableDebugOutput) { - this.enableDebugOutput = enableDebugOutput; - } - - /** - * Sets the diff verification mode. - * - * @param enableDiffVerification diff verification mode - */ - public void setEnableDiffVerification(final boolean enableDiffVerification) { - this.enableDiffVerification = enableDiffVerification; - } - - /** - * Sets the encoding verification mode. - * - * @param enableEncodingVerification diff verification mode - */ - public void setEnableEncodingVerification( - final boolean enableEncodingVerification) { - this.enableEncodingVerification = enableEncodingVerification; - } - - /** - * Sets the database output flag. - * - * @param enableSQLDatabaseOutput database output flag - */ - public void setEnableSQLDatabaseOutput(final boolean enableSQLDatabaseOutput) { - this.enableSQLDatabaseOutput = enableSQLDatabaseOutput; - } - - /** - * Sets the statistical output mode. - * - * @param enableStatsOutput statistical output flag - */ - public void setEnableStatsOutput(final boolean enableStatsOutput) { - this.enableStatsOutput = enableStatsOutput; - } - - /** - * Sets the Zip-Compression mode. - * - * @param enableZipCompression Zip-Compression flag - */ - public void setEnableZipCompression(final boolean enableZipCompression) { - this.enableZipCompression = enableZipCompression; - } - - /** - * Sets whether multiple output files should be used. - * - * @param multipleOutputFiles multiple output files flag - */ - public void setMultipleOutputFiles(final boolean multipleOutputFiles) { - this.multipleOutputFiles = multipleOutputFiles; - } - - /** - * Sets the output compression mode. - * - * @param outputCompression output compression mode - */ - public void setOutputCompression( - final OutputCompressionEnum outputCompression) { - this.outputCompression = outputCompression; - } - - /** - * Sets the maximum size of an output file. - * - * @param outputFileLimit maximum size of an output file - */ - public void setOutputFileLimit(final long outputFileLimit) { - this.outputFileLimit = outputFileLimit; - } - - /** - * Sets the reference to the component registry. - * - * @param registry component registry - */ - public void setRegistry(final ComponentRegistry registry) { - this.components = registry; - } - - /** - * Sets the surrogate mode. - * - * @param surrogates surrogate mode - */ - public void setSurrogates(final SurrogateModes surrogates) { - this.surrogates = surrogates; - } - - public boolean isEnableDataFileOutput() { - return enableDataFileOutput; - } - - public void setEnableDataFileOutput(boolean enableDataFileOutput) { - this.enableDataFileOutput = enableDataFileOutput; - } + } + + /** + * Sets the debug output mode. + * + * @param enableDebugOutput + * debug output flag + */ + public void setEnableDebugOutput(final boolean enableDebugOutput) + { + this.enableDebugOutput = enableDebugOutput; + } + + /** + * Sets the diff verification mode. 
+ * + * @param enableDiffVerification + * diff verification mode + */ + public void setEnableDiffVerification(final boolean enableDiffVerification) + { + this.enableDiffVerification = enableDiffVerification; + } + + /** + * Sets the encoding verification mode. + * + * @param enableEncodingVerification + * diff verification mode + */ + public void setEnableEncodingVerification(final boolean enableEncodingVerification) + { + this.enableEncodingVerification = enableEncodingVerification; + } + + /** + * Sets the database output flag. + * + * @param enableSQLDatabaseOutput + * database output flag + */ + public void setEnableSQLDatabaseOutput(final boolean enableSQLDatabaseOutput) + { + this.enableSQLDatabaseOutput = enableSQLDatabaseOutput; + } + + /** + * Sets the statistical output mode. + * + * @param enableStatsOutput + * statistical output flag + */ + public void setEnableStatsOutput(final boolean enableStatsOutput) + { + this.enableStatsOutput = enableStatsOutput; + } + + /** + * Sets the Zip-Compression mode. + * + * @param enableZipCompression + * Zip-Compression flag + */ + public void setEnableZipCompression(final boolean enableZipCompression) + { + this.enableZipCompression = enableZipCompression; + } + + /** + * Sets whether multiple output files should be used. + * + * @param multipleOutputFiles + * multiple output files flag + */ + public void setMultipleOutputFiles(final boolean multipleOutputFiles) + { + this.multipleOutputFiles = multipleOutputFiles; + } + + /** + * Sets the output compression mode. + * + * @param outputCompression + * output compression mode + */ + public void setOutputCompression(final OutputCompressionEnum outputCompression) + { + this.outputCompression = outputCompression; + } + + /** + * Sets the maximum size of an output file. + * + * @param outputFileLimit + * maximum size of an output file + */ + public void setOutputFileLimit(final long outputFileLimit) + { + this.outputFileLimit = outputFileLimit; + } + + /** + * Sets the reference to the component registry. + * + * @param registry + * component registry + */ + public void setRegistry(final ComponentRegistry registry) + { + this.components = registry; + } + + /** + * Sets the surrogate mode. + * + * @param surrogates + * surrogate mode + */ + public void setSurrogates(final SurrogateModes surrogates) + { + this.surrogates = surrogates; + } + + public boolean isEnableDataFileOutput() + { + return enableDataFileOutput; + } + + public void setEnableDataFileOutput(boolean enableDataFileOutput) + { + this.enableDataFileOutput = enableDataFileOutput; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigSettings.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigSettings.java index 036ae061..9e35b70c 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigSettings.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigSettings.java @@ -38,214 +38,232 @@ /** * This class contain all configuration parameters. 
*/ -public class ConfigSettings { - - /** - * Returns the type of the configuration - */ - private ConfigEnum type; - - /** - * List of input archives - */ - private List archives; - - /** - * Map that contains the configuration parameters and values - */ - private Map parameterMap; - - /** - * Creates an empty {@link ConfigSettings} object of unspecified - * type. - */ - public ConfigSettings() { - this.parameterMap = new HashMap<>(); - this.archives = new ArrayList<>(); - } - - /** - * Creates an empty {@link ConfigSettings} object of given type. - * - * @param type Configuration Type - */ - public ConfigSettings(final ConfigEnum type) { - this.type = type; - this.parameterMap = new HashMap<>(); - this.archives = new ArrayList<>(); - } - - /** - * Adds an input archive description object to the input archive list. - * - * @param archive ArchiveDescription - */ - public void add(final ArchiveDescription archive) { - this.archives.add(archive); - } - - /** - * Returns the input archive at the specified position. - * - * @param index position - * @return input archive description - */ - public ArchiveDescription getArchiveDescription(int index) { - return this.archives.get(index); - } - - /** - * Returns the list of input archives. - * - * @return list of the input archive descriptions - */ - public List getArchiveList() { - return this.archives; - } - - /** - * Returns the number of input archives. - * - * @return size of the input archive list - */ - public int archiveSize() { - return this.archives.size(); - } - - /** - * Returns an iterator over the input archive list. - * - * @return Iterator - */ - public Iterator archiveIterator() { - return this.archives.iterator(); - } - - /** - * Assigns the given value to the the given key. - * - * @param key configuration key - * @param value value - */ - public void setConfigParameter(final ConfigurationKeys key, Object value) { - // before setting parameter, check if paths have trailing File.separator - if (key == ConfigurationKeys.LOGGING_PATH_DEBUG - || key == ConfigurationKeys.LOGGING_PATH_DIFFTOOL - || key == ConfigurationKeys.PATH_OUTPUT_SQL_FILES) { - - String v = (String) value; - // if we do not have a trailing file separator and the current - // path is compatible to the system that is running the config tool, - // then add a trailing separator - if (!v.endsWith(File.separator) && v.contains(File.separator)) { - value = v + File.separator; - } +public class ConfigSettings +{ + + /** + * Returns the type of the configuration + */ + private ConfigEnum type; + + /** + * List of input archives + */ + private List archives; + + /** + * Map that contains the configuration parameters and values + */ + private Map parameterMap; + + /** + * Creates an empty {@link ConfigSettings} object of unspecified type. + */ + public ConfigSettings() + { + this.parameterMap = new HashMap<>(); + this.archives = new ArrayList<>(); } - this.parameterMap.put(key, value); - } - - /** - * Returns the value related to the configuration key or null if the key is - * not contained. - * - * @param configParameter configuration key - * @return value or null - */ - public Object getConfigParameter(final ConfigurationKeys configParameter) { - if (this.parameterMap.containsKey(configParameter)) { - return this.parameterMap.get(configParameter); + /** + * Creates an empty {@link ConfigSettings} object of given type. 
+ * + * @param type + * Configuration Type + */ + public ConfigSettings(final ConfigEnum type) + { + this.type = type; + this.parameterMap = new HashMap<>(); + this.archives = new ArrayList<>(); } - return null; - } + /** + * Adds an input archive description object to the input archive list. + * + * @param archive + * ArchiveDescription + */ + public void add(final ArchiveDescription archive) + { + this.archives.add(archive); + } - /** - * Applies the default single thread configuration of the DiffTool to this - * settings. - */ - public void defaultConfiguration() { - clear(); + /** + * Returns the input archive at the specified position. + * + * @param index + * position + * @return input archive description + */ + public ArchiveDescription getArchiveDescription(int index) + { + return this.archives.get(index); + } - setConfigParameter(ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING, 12); + /** + * Returns the list of input archives. + * + * @return list of the input archive descriptions + */ + public List getArchiveList() + { + return this.archives; + } - setConfigParameter(ConfigurationKeys.COUNTER_FULL_REVISION, 1000); + /** + * Returns the number of input archives. + * + * @return size of the input archive list + */ + public int archiveSize() + { + return this.archives.size(); + } - setConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS, 5000000L); + /** + * Returns an iterator over the input archive list. + * + * @return Iterator + */ + public Iterator archiveIterator() + { + return this.archives.iterator(); + } - setConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS, 1000000L); + /** + * Assigns the given value to the the given key. + * + * @param key + * configuration key + * @param value + * value + */ + public void setConfigParameter(final ConfigurationKeys key, Object value) + { + // before setting parameter, check if paths have trailing File.separator + if (key == ConfigurationKeys.LOGGING_PATH_DEBUG + || key == ConfigurationKeys.LOGGING_PATH_DIFFTOOL + || key == ConfigurationKeys.PATH_OUTPUT_SQL_FILES) { + + String v = (String) value; + // if we do not have a trailing file separator and the current + // path is compatible to the system that is running the config tool, + // then add a trailing separator + if (!v.endsWith(File.separator) && v.contains(File.separator)) { + value = v + File.separator; + } + } + + this.parameterMap.put(key, value); + } - setConfigParameter(ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET, 1000000L); + /** + * Returns the value related to the configuration key or null if the key is not contained. + * + * @param configParameter + * configuration key + * @return value or null + */ + public Object getConfigParameter(final ConfigurationKeys configParameter) + { + if (this.parameterMap.containsKey(configParameter)) { + return this.parameterMap.get(configParameter); + } + + return null; + } - setConfigParameter(ConfigurationKeys.MODE_SURROGATES, SurrogateModes.DISCARD_REVISION); + /** + * Applies the default single thread configuration of the DiffTool to this settings. 
+ */ + public void defaultConfiguration() + { + clear(); - setConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING, StandardCharsets.UTF_8.toString()); + setConfigParameter(ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING, 12); - setConfigParameter(ConfigurationKeys.MODE_OUTPUT, OutputType.BZIP2); + setConfigParameter(ConfigurationKeys.COUNTER_FULL_REVISION, 1000); - setConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT, false); + setConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS, 5000000L); - setConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED, true); + setConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS, 1000000L); - setConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE, 1000000000L); + setConfigParameter(ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET, 1000000L); - setConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL, "logs"); + setConfigParameter(ConfigurationKeys.MODE_SURROGATES, SurrogateModes.DISCARD_REVISION); - setConfigParameter(ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL, Level.INFO); + setConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING, StandardCharsets.UTF_8.toString()); - setConfigParameter(ConfigurationKeys.VERIFICATION_DIFF, false); + setConfigParameter(ConfigurationKeys.MODE_OUTPUT, OutputType.BZIP2); - setConfigParameter(ConfigurationKeys.VERIFICATION_ENCODING, false); + setConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT, false); - setConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT, false); + setConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED, true); - setConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT, false); + setConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE, 1000000000L); - Set defaultNamespaces = new HashSet<>(); - defaultNamespaces.add(0); - defaultNamespaces.add(1); - setConfigParameter(ConfigurationKeys.NAMESPACES_TO_KEEP, defaultNamespaces); + setConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL, "logs"); - this.type = ConfigEnum.DEFAULT; - } + setConfigParameter(ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL, Level.INFO); + setConfigParameter(ConfigurationKeys.VERIFICATION_DIFF, false); - /** - * Deletes all contained input archives and configuration parameter. - */ - public void clear() { - this.parameterMap.clear(); - this.archives.clear(); - } + setConfigParameter(ConfigurationKeys.VERIFICATION_ENCODING, false); - /** - * Returns the configuration type. - * - * @return configuration type - */ - public ConfigEnum getConfigType() { - return this.type; - } + setConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT, false); - /** - * Loads the configuration settings from a file. - * - * @param path path to the configuration file - */ - public void loadConfig(final String path) { - try { + setConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT, false); - ConfigurationReader reader = new ConfigurationReader(path); - ConfigSettings settings = reader.read(); + Set defaultNamespaces = new HashSet<>(); + defaultNamespaces.add(0); + defaultNamespaces.add(1); + setConfigParameter(ConfigurationKeys.NAMESPACES_TO_KEEP, defaultNamespaces); - clear(); + this.type = ConfigEnum.DEFAULT; + } - this.type = settings.type; - this.parameterMap = settings.parameterMap; - this.archives = settings.archives; + /** + * Deletes all contained input archives and configuration parameter. + */ + public void clear() + { + this.parameterMap.clear(); + this.archives.clear(); + } + + /** + * Returns the configuration type. 
+ * + * @return configuration type + */ + public ConfigEnum getConfigType() + { + return this.type; + } - } catch (Exception e) { - e.printStackTrace(); + /** + * Loads the configuration settings from a file. + * + * @param path + * path to the configuration file + */ + public void loadConfig(final String path) + { + try { + + ConfigurationReader reader = new ConfigurationReader(path); + ConfigSettings settings = reader.read(); + + clear(); + + this.type = settings.type; + this.parameterMap = settings.parameterMap; + this.archives = settings.archives; + + } + catch (Exception e) { + e.printStackTrace(); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigVerification.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigVerification.java index 74ef502e..b5502173 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigVerification.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/control/ConfigVerification.java @@ -26,106 +26,114 @@ import org.dkpro.jwpl.revisionmachine.difftool.config.gui.data.ConfigItemTypes; /** - * This class contains the list of error or warning messages that have been - * generated during the verification of the configuration settings. + * This class contains the list of error or warning messages that have been generated during the + * verification of the configuration settings. */ @SuppressWarnings("serial") public class ConfigVerification - extends AbstractTableModel { - - /** - * If an error message was added to the list. - */ - private boolean failed; - - /** - * List of configuration items - */ - private final List list; - - /** - * Column names of the table representation - */ - private final String[] columnNames; - - /** - * (Constructor) Creates an empty ConfigVerification object. - */ - public ConfigVerification() { - this.list = new ArrayList<>(); - this.failed = false; - - this.columnNames = new String[]{"Type", "Error", "Message"}; - } - - /** - * Adds a configuration item to the list. - * - * @param item configuration item - */ - public void add(final ConfigItem item) { - failed = failed || item.getType() == ConfigItemTypes.ERROR; - this.list.add(item); - } - - /** - * Returns the name of the column with the index col. - * - * @return column name of the specified column. - */ - @Override - public String getColumnName(final int col) { - return this.columnNames[col]; - } - - /** - * Returns the number of columns. - * - * @return number of columns - */ - @Override - public int getColumnCount() { - return 3; - } - - /** - * Returns the number of rows. - * - * @return number of rows - */ - @Override - public int getRowCount() { - return list.size(); - } - - /** - * Returns the value at the specified column of the specified row. - * - * @return value - */ - @Override - public Object getValueAt(final int row, final int column) { - - ConfigItem item = this.list.get(row); - - switch (column) { - case 0: - return item.getType(); - case 1: - return item.getKey(); - case 2: - return item.getMessage(); + extends AbstractTableModel +{ + + /** + * If an error message was added to the list. 
+ */ + private boolean failed; + + /** + * List of configuration items + */ + private final List list; + + /** + * Column names of the table representation + */ + private final String[] columnNames; + + /** + * (Constructor) Creates an empty ConfigVerification object. + */ + public ConfigVerification() + { + this.list = new ArrayList<>(); + this.failed = false; + + this.columnNames = new String[] { "Type", "Error", "Message" }; + } + + /** + * Adds a configuration item to the list. + * + * @param item + * configuration item + */ + public void add(final ConfigItem item) + { + failed = failed || item.getType() == ConfigItemTypes.ERROR; + this.list.add(item); + } + + /** + * Returns the name of the column with the index col. + * + * @return column name of the specified column. + */ + @Override + public String getColumnName(final int col) + { + return this.columnNames[col]; + } + + /** + * Returns the number of columns. + * + * @return number of columns + */ + @Override + public int getColumnCount() + { + return 3; + } + + /** + * Returns the number of rows. + * + * @return number of rows + */ + @Override + public int getRowCount() + { + return list.size(); + } + + /** + * Returns the value at the specified column of the specified row. + * + * @return value + */ + @Override + public Object getValueAt(final int row, final int column) + { + + ConfigItem item = this.list.get(row); + + switch (column) { + case 0: + return item.getType(); + case 1: + return item.getKey(); + case 2: + return item.getMessage(); + } + return null; + } + + /** + * Returns whether the configuration item list contains an error message or not. + * + * @return TRUE | FALSE + */ + public boolean hasFailed() + { + return this.failed; } - return null; - } - - /** - * Returns whether the configuration item list contains an error message or - * not. - * - * @return TRUE | FALSE - */ - public boolean hasFailed() { - return this.failed; - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigEnum.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigEnum.java index 2277ca0f..af95e07c 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigEnum.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigEnum.java @@ -20,15 +20,16 @@ /** * Contains the keys for the configuration types. */ -public enum ConfigEnum { +public enum ConfigEnum +{ - /** - * Default Configuration - */ - DEFAULT, + /** + * Default Configuration + */ + DEFAULT, - /** - * Imported Configuration - */ - IMPORT + /** + * Imported Configuration + */ + IMPORT } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigErrorKeys.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigErrorKeys.java index f171b332..4908664f 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigErrorKeys.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigErrorKeys.java @@ -20,35 +20,36 @@ /** * Contains the keys for the configuration verification error types. 
*/ -public enum ConfigErrorKeys { +public enum ConfigErrorKeys +{ - /** - * Mode was enabled, but no value was set - */ - COMMAND_NOT_SET, + /** + * Mode was enabled, but no value was set + */ + COMMAND_NOT_SET, - /** - * Configuration value out of range - */ - VALUE_OUT_OF_RANGE, + /** + * Configuration value out of range + */ + VALUE_OUT_OF_RANGE, - /** - * Path was not set - */ - PATH_NOT_SET, + /** + * Path was not set + */ + PATH_NOT_SET, - /** - * Illegal configuration value - */ - ILLEGAL_INPUT, + /** + * Illegal configuration value + */ + ILLEGAL_INPUT, - /** - * Illegal input file type - */ - ILLEGAL_INPUT_FILE, + /** + * Illegal input file type + */ + ILLEGAL_INPUT_FILE, - /** - * Required value is missing - */ - MISSING_VALUE + /** + * Required value is missing + */ + MISSING_VALUE } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItem.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItem.java index 8c096242..436ec938 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItem.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItem.java @@ -20,62 +20,69 @@ /** * This class represents configuration verfication messages. */ -public class ConfigItem { +public class ConfigItem +{ - /** - * Type of message - */ - private final ConfigItemTypes type; + /** + * Type of message + */ + private final ConfigItemTypes type; - /** - * Type of error - */ - private final ConfigErrorKeys key; + /** + * Type of error + */ + private final ConfigErrorKeys key; - /** - * Message - */ - private final String message; + /** + * Message + */ + private final String message; - /** - * (Constructor) Creates a new ConfigItem - * - * @param type Type of message - * @param key Type of error - * @param message Message - */ - public ConfigItem(final ConfigItemTypes type, final ConfigErrorKeys key, - final String message) { + /** + * (Constructor) Creates a new ConfigItem + * + * @param type + * Type of message + * @param key + * Type of error + * @param message + * Message + */ + public ConfigItem(final ConfigItemTypes type, final ConfigErrorKeys key, final String message) + { - this.type = type; - this.key = key; - this.message = message; - } + this.type = type; + this.key = key; + this.message = message; + } - /** - * Returns the type of error. - * - * @return type of error - */ - public ConfigErrorKeys getKey() { - return key; - } + /** + * Returns the type of error. + * + * @return type of error + */ + public ConfigErrorKeys getKey() + { + return key; + } - /** - * Returns the message. - * - * @return message - */ - public String getMessage() { - return message; - } + /** + * Returns the message. + * + * @return message + */ + public String getMessage() + { + return message; + } - /** - * Returns the item type. - * - * @return item type - */ - public ConfigItemTypes getType() { - return type; - } + /** + * Returns the item type. 
+ * + * @return item type + */ + public ConfigItemTypes getType() + { + return type; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItemTypes.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItemTypes.java index 7c89f04d..b8627e17 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItemTypes.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/ConfigItemTypes.java @@ -20,15 +20,16 @@ /** * Contains the keys for the different types of configuration items. */ -public enum ConfigItemTypes { +public enum ConfigItemTypes +{ - /** - * Warning message - */ - WARNING, + /** + * Warning message + */ + WARNING, - /** - * Error message - */ - ERROR + /** + * Error message + */ + ERROR } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/OutputCompressionEnum.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/OutputCompressionEnum.java index d729ae46..6c2eb152 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/OutputCompressionEnum.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/OutputCompressionEnum.java @@ -20,20 +20,21 @@ /** * Contains the keys for the different types of the DiffTool outputs. */ -public enum OutputCompressionEnum { +public enum OutputCompressionEnum +{ - /** - * Uncompressed output type - */ - None, + /** + * Uncompressed output type + */ + None, - /** - * SevenZip output type - */ - SevenZip, + /** + * SevenZip output type + */ + SevenZip, - /** - * BZip2 output type - */ - BZip2, + /** + * BZip2 output type + */ + BZip2, } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/PanelKeys.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/PanelKeys.java index 84339a96..92c1f84f 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/PanelKeys.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/data/PanelKeys.java @@ -20,51 +20,52 @@ /** * Contains the keys for the different panels of the ConfigurationTool. 
*/ -public enum PanelKeys { +public enum PanelKeys +{ - /** - * Key for the mode panel - */ - PANEL_VALUES, + /** + * Key for the mode panel + */ + PANEL_VALUES, - /** - * Key for the externals panel - */ - PANEL_EXTERNALS, + /** + * Key for the externals panel + */ + PANEL_EXTERNALS, - /** - * Key for the input panel - */ - PANEL_INPUT, + /** + * Key for the input panel + */ + PANEL_INPUT, - /** - * Key for the output panel - */ - PANEL_OUTPUT, + /** + * Key for the output panel + */ + PANEL_OUTPUT, - /** - * Key for the sql panel - */ - PANEL_SQL, + /** + * Key for the sql panel + */ + PANEL_SQL, - /** - * Key for the cache panel - */ - PANEL_CACHE, + /** + * Key for the cache panel + */ + PANEL_CACHE, - /** - * Key for the logging panel - */ - PANEL_LOGGING, + /** + * Key for the logging panel + */ + PANEL_LOGGING, - /** - * Key for the debug panel - */ - PANEL_DEBUG, + /** + * Key for the debug panel + */ + PANEL_DEBUG, - /** - * Key for the filter panel - */ - PANEL_FILTER + /** + * Key for the filter panel + */ + PANEL_FILTER } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/ConfigDialog.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/ConfigDialog.java index e0e1cc7c..34a35dae 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/ConfigDialog.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/ConfigDialog.java @@ -37,152 +37,164 @@ */ @SuppressWarnings("serial") public class ConfigDialog - extends JDialog { - - /** - * Panel of the ConfigDialog - */ - private class ConfigDialogPanel - extends AbstractPanel { - - private JTable itemTable; - private JScrollPane itemScrollPane; - - private JButton returnButton; - private JButton saveButton; + extends JDialog +{ /** - * (Constructor) Creates the ConfigDialogPanel. - * - * @param controller Reference to the controller + * Panel of the ConfigDialog */ - public ConfigDialogPanel(final ConfigController controller) { - super(controller); - createItemTable(); - createButtons(); - } + private class ConfigDialogPanel + extends AbstractPanel + { + + private JTable itemTable; + private JScrollPane itemScrollPane; + + private JButton returnButton; + private JButton saveButton; + + /** + * (Constructor) Creates the ConfigDialogPanel. + * + * @param controller + * Reference to the controller + */ + public ConfigDialogPanel(final ConfigController controller) + { + super(controller); + createItemTable(); + createButtons(); + } - /** - * Creates the buttons of the dialog panel. - */ - private void createButtons() { + /** + * Creates the buttons of the dialog panel. 
+ */ + private void createButtons() + { - returnButton = new JButton("Return"); - returnButton.setBounds(105, 195, 120, 25); - returnButton.addActionListener(e -> close()); + returnButton = new JButton("Return"); + returnButton.setBounds(105, 195, 120, 25); + returnButton.addActionListener(e -> close()); - this.add(returnButton); + this.add(returnButton); - saveButton = new JButton("Save"); - saveButton.setBounds(235, 195, 120, 25); - saveButton.addActionListener(e -> { + saveButton = new JButton("Save"); + saveButton.setBounds(235, 195, 120, 25); + saveButton.addActionListener(e -> { - XMLFileChooser fc = new XMLFileChooser(); - if (fc.showSaveDialog(new JPanel()) == XMLFileChooser.APPROVE_OPTION) { + XMLFileChooser fc = new XMLFileChooser(); + if (fc.showSaveDialog(new JPanel()) == XMLFileChooser.APPROVE_OPTION) { - String path = fc.getSelectedFile().getPath(); - if (path.indexOf('.') == -1) { - path += ".xml"; - } + String path = fc.getSelectedFile().getPath(); + if (path.indexOf('.') == -1) { + path += ".xml"; + } - if (controller.saveConfiguration(path)) { - System.out.println("SAVE CONFIG SUCCESSFULL"); - } else { - System.out.println("SAVE CONFIG FAILED"); - } + if (controller.saveConfiguration(path)) { + System.out.println("SAVE CONFIG SUCCESSFULL"); + } + else { + System.out.println("SAVE CONFIG FAILED"); + } + } + }); + + this.add(saveButton); } - }); - this.add(saveButton); - } + /** + * Creates the JTable for displaying the input archives. + */ + private void createItemTable() + { + itemTable = new JTable(controller.getConfigErrors()); + itemTable.setSelectionMode(ListSelectionModel.SINGLE_SELECTION); - /** - * Creates the JTable for displaying the input archives. - */ - private void createItemTable() { - itemTable = new JTable(controller.getConfigErrors()); - itemTable.setSelectionMode(ListSelectionModel.SINGLE_SELECTION); + itemScrollPane = new JScrollPane(itemTable); + itemScrollPane.setBounds(10, 10, 470, 180); - itemScrollPane = new JScrollPane(itemTable); - itemScrollPane.setBounds(10, 10, 470, 180); + this.add(itemScrollPane); + } - this.add(itemScrollPane); - } + /** + * empty method + */ + @Override + public void relocate() + { - /** - * empty method - */ - @Override - public void relocate() { + } - } + /** + * A call of this method should validate the positions of the panels components. + */ + @Override + public void validate() + { + + ConfigVerification verification = controller.getConfigErrors(); + if (verification != null) { + saveButton.setEnabled(!verification.hasFailed()); + } + else { + saveButton.setEnabled(false); + } + } - /** - * A call of this method should validate the positions of the panels - * components. - */ - @Override - public void validate() { - - ConfigVerification verification = controller.getConfigErrors(); - if (verification != null) { - saveButton.setEnabled(!verification.hasFailed()); - } else { - saveButton.setEnabled(false); - } + /** + * empty method + * + * @throws UnsupportedOperationException + * @deprecated + */ + @Deprecated + @Override + public void toXML(final StringBuilder builder, final ConfigVerification errors) + { + throw new UnsupportedOperationException(); + } + + /** + * empty method + * + * @throws UnsupportedOperationException + * @deprecated + */ + @Deprecated + @Override + public void applyConfig(final ConfigSettings config) + { + throw new UnsupportedOperationException(); + } } /** - * empty method + * (Constructor) Creates a new ConfigDialog. 
* - * @throws UnsupportedOperationException - * @deprecated + * @param controller + * Reference to the controller */ - @Deprecated - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) { - throw new UnsupportedOperationException(); + public ConfigDialog(final ConfigController controller) + { + super(controller.getRegistry().getGUI(), true); + + this.setTitle("Verification"); + + setSize(500, 250); + setResizable(false); + + Dimension d = Toolkit.getDefaultToolkit().getScreenSize(); + setLocation((d.width - getSize().width) / 2, (d.height - getSize().height) / 2); + + this.setContentPane(new ConfigDialogPanel(controller)); } /** - * empty method - * - * @throws UnsupportedOperationException - * @deprecated + * Closes the dialog. */ - @Deprecated - @Override - public void applyConfig(final ConfigSettings config) { - throw new UnsupportedOperationException(); + public void close() + { + this.setVisible(true); + this.dispose(); } - } - - /** - * (Constructor) Creates a new ConfigDialog. - * - * @param controller Reference to the controller - */ - public ConfigDialog(final ConfigController controller) { - super(controller.getRegistry().getGUI(), true); - - this.setTitle("Verification"); - - setSize(500, 250); - setResizable(false); - - Dimension d = Toolkit.getDefaultToolkit().getScreenSize(); - setLocation((d.width - getSize().width) / 2, - (d.height - getSize().height) / 2); - - this.setContentPane(new ConfigDialogPanel(controller)); - } - - /** - * Closes the dialog. - */ - public void close() { - this.setVisible(true); - this.dispose(); - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/InputDialog.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/InputDialog.java index 70358836..893ccc39 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/InputDialog.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/InputDialog.java @@ -40,214 +40,226 @@ */ @SuppressWarnings("serial") public class InputDialog - extends JDialog { - - /** - * Panel of the InputDialog - */ - private class InputDialogPanel - extends AbstractPanel { + extends JDialog +{ /** - * (Constructor) Creates the InputDialogPanel. - * - * @param controller Reference to the controller + * Panel of the InputDialog */ - public InputDialogPanel(final ConfigController controller) { - super(controller); - createPathSettings(); - createTypeChooser(); - createButtons(); - createStartLabel(); - } + private class InputDialogPanel + extends AbstractPanel + { + + /** + * (Constructor) Creates the InputDialogPanel. 
+ * + * @param controller + * Reference to the controller + */ + public InputDialogPanel(final ConfigController controller) + { + super(controller); + createPathSettings(); + createTypeChooser(); + createButtons(); + createStartLabel(); + } - private JLabel pathLabel; - private JTextField pathField; - private JButton searchButton; + private JLabel pathLabel; + private JTextField pathField; + private JButton searchButton; - private JLabel typeLabel; - private JComboBox typeChooser; + private JLabel typeLabel; + private JComboBox typeChooser; - private JLabel startLabel; - private JTextField startPosition; + private JLabel startLabel; + private JTextField startPosition; - private JButton addButton; - private JButton cancelButton; + private JButton addButton; + private JButton cancelButton; - /** - * Creates the path input components. - */ - private void createPathSettings() { - pathLabel = new JLabel("Please enter the path: "); - pathLabel.setBounds(10, 10, 150, 25); - this.add(pathLabel); + /** + * Creates the path input components. + */ + private void createPathSettings() + { + pathLabel = new JLabel("Please enter the path: "); + pathLabel.setBounds(10, 10, 150, 25); + this.add(pathLabel); + + pathField = new JTextField(); + pathField.setBounds(10, 40, 250, 25); + this.add(pathField); - pathField = new JTextField(); - pathField.setBounds(10, 40, 250, 25); - this.add(pathField); + searchButton = new JButton("Search"); + searchButton.setBounds(180, 10, 80, 25); - searchButton = new JButton("Search"); - searchButton.setBounds(180, 10, 80, 25); + searchButton.addActionListener(e -> { - searchButton.addActionListener(e -> { + JFileChooser fc = new JFileChooser(); + if (fc.showOpenDialog(new JPanel()) == JFileChooser.APPROVE_OPTION) { + pathField.setText(fc.getSelectedFile().getPath()); + } + }); - JFileChooser fc = new JFileChooser(); - if (fc.showOpenDialog(new JPanel()) == JFileChooser.APPROVE_OPTION) { - pathField.setText(fc.getSelectedFile().getPath()); + this.add(searchButton); } - }); - this.add(searchButton); - } + /** + * Creates the start input components. + */ + private void createStartLabel() + { - /** - * Creates the start input components. - */ - private void createStartLabel() { + startLabel = new JLabel("Ignore all bytes before:"); + startLabel.setBounds(10, 120, 130, 25); + this.add(startLabel); - startLabel = new JLabel("Ignore all bytes before:"); - startLabel.setBounds(10, 120, 130, 25); - this.add(startLabel); + startPosition = new JTextField(); + startPosition.setBounds(150, 120, 110, 25); + this.add(startPosition); + } - startPosition = new JTextField(); - startPosition.setBounds(150, 120, 110, 25); - this.add(startPosition); - } + /** + * Creates the input type chooser. + */ + private void createTypeChooser() + { - /** - * Creates the input type chooser. 
- */ - private void createTypeChooser() { + typeLabel = new JLabel("Input type: "); + typeLabel.setBounds(10, 80, 130, 25); + this.add(typeLabel); - typeLabel = new JLabel("Input type: "); - typeLabel.setBounds(10, 80, 130, 25); - this.add(typeLabel); + typeChooser = new JComboBox<>(); + typeChooser.setBounds(150, 80, 110, 25); - typeChooser = new JComboBox<>(); - typeChooser.setBounds(150, 80, 110, 25); + typeChooser.addItem(InputType.XML); - typeChooser.addItem(InputType.XML); + if (this.controller.is7ZipEnabled()) { + typeChooser.addItem(InputType.SEVENZIP); + } - if (this.controller.is7ZipEnabled()) { - typeChooser.addItem(InputType.SEVENZIP); - } + typeChooser.addItem(InputType.BZIP2); - typeChooser.addItem(InputType.BZIP2); + this.add(typeChooser); + } - this.add(typeChooser); - } + /** + * Creates the buttons of the dialog panel. + */ + private void createButtons() + { + addButton = new JButton("Add"); + addButton.setBounds(10, 170, 120, 25); + addButton.addActionListener(e -> { + String path = pathField.getText(); + if (path.length() == 0) { + return; + } + + InputType type = (InputType) typeChooser.getSelectedItem(); + + controller.addArchive(new ArchiveDescription(type, path)); + controller.repaint(); + + close(); + }); + this.add(addButton); + + cancelButton = new JButton("Cancel"); + cancelButton.setBounds(140, 170, 120, 25); + cancelButton.addActionListener(e -> close()); + + this.add(cancelButton); + } + + /** + * empty method + */ + @Override + public void validate() + { - /** - * Creates the buttons of the dialog panel. - */ - private void createButtons() { - addButton = new JButton("Add"); - addButton.setBounds(10, 170, 120, 25); - addButton.addActionListener(e -> { - String path = pathField.getText(); - if (path.length() == 0) { - return; } - InputType type = (InputType) typeChooser.getSelectedItem(); + /** + * A call of this method should validate the positions of the panels components. + */ + @Override + public void relocate() + { - controller.addArchive(new ArchiveDescription(type, path)); - controller.repaint(); + int w = 250, h = 185; + int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; - close(); - }); - this.add(addButton); + pathLabel.setLocation(x, y); + pathField.setLocation(x, y + 30); + searchButton.setLocation(x + 170, y); - cancelButton = new JButton("Cancel"); - cancelButton.setBounds(140, 170, 120, 25); - cancelButton.addActionListener(e -> close()); + typeLabel.setLocation(x, y + 70); + typeChooser.setLocation(x + 140, y + 70); - this.add(cancelButton); - } + startLabel.setLocation(x, y + 110); + startPosition.setLocation(x + 140, y + 110); - /** - * empty method - */ - @Override - public void validate() { + addButton.setLocation(x, y + 160); + cancelButton.setLocation(x + 130, y + 160); + } + + /** + * empty method + * + * @throws UnsupportedOperationException + * @deprecated + */ + @Deprecated + @Override + public void toXML(final StringBuilder builder, final ConfigVerification errors) + { + throw new UnsupportedOperationException(); + } + /** + * empty method + * + * @throws UnsupportedOperationException + * @deprecated + */ + @Deprecated + @Override + public void applyConfig(final ConfigSettings config) + { + throw new UnsupportedOperationException(); + } } /** - * A call of this method should validate the positions of the panels - * components. + * (Constructor) Creates a new InputDialog. 
+ * + * @param controller + * Reference to the controller */ - @Override - public void relocate() { - - int w = 250, h = 185; - int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; + public InputDialog(final ConfigController controller) + { + super(controller.getRegistry().getGUI(), true); - pathLabel.setLocation(x, y); - pathField.setLocation(x, y + 30); - searchButton.setLocation(x + 170, y); + this.setTitle("Add an input file"); - typeLabel.setLocation(x, y + 70); - typeChooser.setLocation(x + 140, y + 70); + setSize(300, 250); + setResizable(false); - startLabel.setLocation(x, y + 110); - startPosition.setLocation(x + 140, y + 110); - - addButton.setLocation(x, y + 160); - cancelButton.setLocation(x + 130, y + 160); - } + Dimension d = Toolkit.getDefaultToolkit().getScreenSize(); + setLocation((d.width - getSize().width) / 2, (d.height - getSize().height) / 2); - /** - * empty method - * - * @throws UnsupportedOperationException - * @deprecated - */ - @Deprecated - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) { - throw new UnsupportedOperationException(); + this.setContentPane(new InputDialogPanel(controller)); } /** - * empty method - * - * @throws UnsupportedOperationException - * @deprecated + * Closes the dialog. */ - @Deprecated - @Override - public void applyConfig(final ConfigSettings config) { - throw new UnsupportedOperationException(); + public void close() + { + this.setVisible(true); + this.dispose(); } - } - - /** - * (Constructor) Creates a new InputDialog. - * - * @param controller Reference to the controller - */ - public InputDialog(final ConfigController controller) { - super(controller.getRegistry().getGUI(), true); - - this.setTitle("Add an input file"); - - setSize(300, 250); - setResizable(false); - - Dimension d = Toolkit.getDefaultToolkit().getScreenSize(); - setLocation((d.width - getSize().width) / 2, - (d.height - getSize().height) / 2); - - this.setContentPane(new InputDialogPanel(controller)); - } - - /** - * Closes the dialog. - */ - public void close() { - this.setVisible(true); - this.dispose(); - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/XMLFileChooser.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/XMLFileChooser.java index 76ce433c..34aa9fcc 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/XMLFileChooser.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/dialogs/XMLFileChooser.java @@ -27,38 +27,43 @@ */ @SuppressWarnings("serial") public class XMLFileChooser - extends JFileChooser { + extends JFileChooser +{ - /** - * (Constructor) Creates an FileChooser with a xml file filter. - */ - public XMLFileChooser() { + /** + * (Constructor) Creates an FileChooser with a xml file filter. 
+ */ + public XMLFileChooser() + { - setFileFilter(new FileFilter() { + setFileFilter(new FileFilter() + { - @Override - public String getDescription() { - return ".xml"; - } + @Override + public String getDescription() + { + return ".xml"; + } - @Override - public boolean accept(final File f) { + @Override + public boolean accept(final File f) + { - // Always accept directories - if (f.isDirectory()) { - return true; - } + // Always accept directories + if (f.isDirectory()) { + return true; + } - int p = f.getName().indexOf("."); + int p = f.getName().indexOf("."); - // Files need a ending - if (p == -1) { - return false; - } + // Files need a ending + if (p == -1) { + return false; + } - // Verify the ending - return f.getName().substring(p).equals(".xml"); - } - }); - } + // Verify the ending + return f.getName().substring(p).equals(".xml"); + } + }); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/AbstractPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/AbstractPanel.java index 5be25e12..03fb16fe 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/AbstractPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/AbstractPanel.java @@ -28,72 +28,74 @@ /** * AbstractPanel Super panel class of the KonfigurationTool *
- * All panels (which contain configuration parameters) will inherit from this - * class. + * All panels (which contain configuration parameters) will inherit from this class. */ @SuppressWarnings("serial") public abstract class AbstractPanel - extends JPanel { + extends JPanel +{ - /** - * Reference to the controller - */ - protected final ConfigController controller; + /** + * Reference to the controller + */ + protected final ConfigController controller; - /** - * (Constructor) Creates an AbstractPanel object. - * - * @param controller Reference to the controller - */ - public AbstractPanel(final ConfigController controller) { - this.controller = controller; - this.setLayout(null); - } + /** + * (Constructor) Creates an AbstractPanel object. + * + * @param controller + * Reference to the controller + */ + public AbstractPanel(final ConfigController controller) + { + this.controller = controller; + this.setLayout(null); + } - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public abstract void validate(); + /** + * A call of this method should validate the status of the panels components. + */ + @Override + public abstract void validate(); - /** - * A call of this method should validate the positions of the panels - * components. - */ - public abstract void relocate(); + /** + * A call of this method should validate the positions of the panels components. + */ + public abstract void relocate(); - /** - * The default paint method was expanded with calls of the validate() and - * relocate() methods. - * - * @param g Graphics - */ - @Override - public void paint(final Graphics g) { + /** + * The default paint method was expanded with calls of the validate() and relocate() methods. + * + * @param g + * Graphics + */ + @Override + public void paint(final Graphics g) + { - validate(); - relocate(); + validate(); + relocate(); - super.paint(g); - } + super.paint(g); + } - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. - * - * @param builder Reference to a StringBuilder object - * @param errors Reference to the ConfigVerification object - */ - public abstract void toXML(final StringBuilder builder, - final ConfigVerification errors); + /** + * Adds the xml description of the panels content to the StringBuilder. Errors which occur + * during the xml transformation will be added to the ConfigVerification. + * + * @param builder + * Reference to a StringBuilder object + * @param errors + * Reference to the ConfigVerification object + */ + public abstract void toXML(final StringBuilder builder, final ConfigVerification errors); - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. - * - * @param config Reference to the ConfigSettings object - */ - public abstract void applyConfig(final ConfigSettings config); + /** + * Reads the configuration parameters described in the panel from the ConfigSettings and and + * sets the contained values. 
+ * + * @param config + * Reference to the ConfigSettings object + */ + public abstract void applyConfig(final ConfigSettings config); } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/CachePanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/CachePanel.java index 9079f9ab..3605ad30 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/CachePanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/CachePanel.java @@ -33,253 +33,250 @@ /** * Panel class of the ConfigurationTool *
- * This panel contains all components for setting configuration parameters - * related to the cache. + * This panel contains all components for setting configuration parameters related to the cache. */ @SuppressWarnings("serial") public class CachePanel - extends AbstractPanel { + extends AbstractPanel +{ - private JLabel taskLimitationsLabel; + private JLabel taskLimitationsLabel; - private JLabel articleTaskLabel; - private JTextField articleTaskLimitField; + private JLabel articleTaskLabel; + private JTextField articleTaskLimitField; - private JLabel diffTaskLabel; - private JTextField diffTaskLimitField; + private JLabel diffTaskLabel; + private JTextField diffTaskLimitField; - private JLabel sqlProducerLimitationsLabel; + private JLabel sqlProducerLimitationsLabel; - private JLabel maxAllowedPacketLabel; - private JTextField maxAllowedPacketField; + private JLabel maxAllowedPacketLabel; + private JTextField maxAllowedPacketField; - /** - * (Constructor) Creates a new CachePanel. - * - * @param controller Reference to the controller - */ - public CachePanel(final ConfigController controller) { - - super(controller); - controller.register(PanelKeys.PANEL_CACHE, this); - - createTaskSettings(); - createSQLProducerSettings(); - } - - // --------------------------------------------------------------------------// - // CONSTRUCTION METHODS // - // --------------------------------------------------------------------------// - - private void createTaskSettings() { - - taskLimitationsLabel = new JLabel("Task Limitations (in byte)"); - taskLimitationsLabel.setBounds(10, 10, 250, 25); - this.add(taskLimitationsLabel); - - articleTaskLabel = new JLabel("Article-Task: "); - articleTaskLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - articleTaskLabel.setBounds(10, 40, 100, 25); - this.add(articleTaskLabel); - - articleTaskLimitField = new JTextField(); - articleTaskLimitField.setBounds(120, 40, 200, 25); - this.add(articleTaskLimitField); - - diffTaskLabel = new JLabel("Diff-Task: "); - diffTaskLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - diffTaskLabel.setBounds(10, 70, 100, 25); - this.add(diffTaskLabel); - - diffTaskLimitField = new JTextField(); - diffTaskLimitField.setBounds(120, 70, 200, 25); - this.add(diffTaskLimitField); - } - - - private void createSQLProducerSettings() { - - sqlProducerLimitationsLabel = new JLabel( - "SQLProducer Limitations (in byte)"); - sqlProducerLimitationsLabel.setBounds(10, 210, 250, 25); - this.add(sqlProducerLimitationsLabel); - - maxAllowedPacketLabel = new JLabel("MAX_ALLOWED_PACKET"); - maxAllowedPacketLabel - .setBorder(BorderFactory.createRaisedBevelBorder()); - maxAllowedPacketLabel.setBounds(10, 240, 160, 25); - this.add(maxAllowedPacketLabel); - - maxAllowedPacketField = new JTextField(); - maxAllowedPacketField.setBounds(180, 240, 140, 25); - this.add(maxAllowedPacketField); - } - - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// - - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public void validate() { - - } - - /** - * A call of this method should validate the positions of the panels - * components. 
- */ - @Override - public void relocate() { - - int w = 310, h = 255; - int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; - - taskLimitationsLabel.setLocation(x, y); - articleTaskLabel.setLocation(x, y + 30); - articleTaskLimitField.setLocation(x + 110, y + 30); - diffTaskLabel.setLocation(x, y + 60); - diffTaskLimitField.setLocation(x + 110, y + 60); - - sqlProducerLimitationsLabel.setLocation(x, y + 100); - maxAllowedPacketLabel.setLocation(x, y + 130); - maxAllowedPacketField.setLocation(x + 170, y + 130); - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. - * - * @param config Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) { - - Object o = config - .getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS); - if (o != null) { - this.articleTaskLimitField.setText(Long.toString((Long) o)); - } else { - this.articleTaskLimitField.setText(""); + /** + * (Constructor) Creates a new CachePanel. + * + * @param controller + * Reference to the controller + */ + public CachePanel(final ConfigController controller) + { + + super(controller); + controller.register(PanelKeys.PANEL_CACHE, this); + + createTaskSettings(); + createSQLProducerSettings(); } - o = config.getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS); - if (o != null) { - this.diffTaskLimitField.setText(Long.toString((Long) o)); - } else { - this.diffTaskLimitField.setText(""); + // --------------------------------------------------------------------------// + // CONSTRUCTION METHODS // + // --------------------------------------------------------------------------// + + private void createTaskSettings() + { + + taskLimitationsLabel = new JLabel("Task Limitations (in byte)"); + taskLimitationsLabel.setBounds(10, 10, 250, 25); + this.add(taskLimitationsLabel); + + articleTaskLabel = new JLabel("Article-Task: "); + articleTaskLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + articleTaskLabel.setBounds(10, 40, 100, 25); + this.add(articleTaskLabel); + + articleTaskLimitField = new JTextField(); + articleTaskLimitField.setBounds(120, 40, 200, 25); + this.add(articleTaskLimitField); + + diffTaskLabel = new JLabel("Diff-Task: "); + diffTaskLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + diffTaskLabel.setBounds(10, 70, 100, 25); + this.add(diffTaskLabel); + + diffTaskLimitField = new JTextField(); + diffTaskLimitField.setBounds(120, 70, 200, 25); + this.add(diffTaskLimitField); } - o = config - .getConfigParameter(ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET); - if (o != null) { - this.maxAllowedPacketField.setText(Long.toString((Long) o)); - } else { - this.maxAllowedPacketField.setText(""); + private void createSQLProducerSettings() + { + + sqlProducerLimitationsLabel = new JLabel("SQLProducer Limitations (in byte)"); + sqlProducerLimitationsLabel.setBounds(10, 210, 250, 25); + this.add(sqlProducerLimitationsLabel); + + maxAllowedPacketLabel = new JLabel("MAX_ALLOWED_PACKET"); + maxAllowedPacketLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + maxAllowedPacketLabel.setBounds(10, 240, 160, 25); + this.add(maxAllowedPacketLabel); + + maxAllowedPacketField = new JTextField(); + maxAllowedPacketField.setBounds(180, 240, 140, 25); + 
this.add(maxAllowedPacketField); } - } - - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. - * - * @param builder Reference to a StringBuilder object - * @param errors Reference to the ConfigVerification object - */ - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) { - - long tasksizeRevisions = -1, tasksizeDiffs = -1, maxAllowedPacket = -1; - - // Check the ArticleTask size input - String text = this.articleTaskLimitField.getText(); - if (text.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The value for the size of ArticleTasks" + " is missing.")); - } else { - try { - tasksizeRevisions = Long.parseLong(text); - if (tasksizeRevisions < 1000000) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "The value for the size of an " - + "ArticleTask has to be at least " - + "1000000 Byte.")); - } - } catch (NumberFormatException nfe) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.ILLEGAL_INPUT, - "NumberFormatException for the size of" - + " ArticleTasks")); - } + + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// + + /** + * A call of this method should validate the status of the panels components. + */ + @Override + public void validate() + { + } - // Check the DiffTask size input - text = this.diffTaskLimitField.getText(); - if (text.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The value for the size of DiffTasks" + " is missing.")); - } else { - try { - tasksizeDiffs = Long.parseLong(text); - if (tasksizeDiffs < 1000000) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "The value for the size of a DiffTask " - + "has to be at least 1000000 Byte.")); - } - } catch (NumberFormatException nfe) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.ILLEGAL_INPUT, - "NumberFormatException for the size of" + " DiffTasks")); - } + /** + * A call of this method should validate the positions of the panels components. 
+ */ + @Override + public void relocate() + { + + int w = 310, h = 255; + int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; + + taskLimitationsLabel.setLocation(x, y); + articleTaskLabel.setLocation(x, y + 30); + articleTaskLimitField.setLocation(x + 110, y + 30); + diffTaskLabel.setLocation(x, y + 60); + diffTaskLimitField.setLocation(x + 110, y + 60); + + sqlProducerLimitationsLabel.setLocation(x, y + 100); + maxAllowedPacketLabel.setLocation(x, y + 130); + maxAllowedPacketField.setLocation(x + 170, y + 130); } - // Check the SQLProducer MaxAllowedPacket input - text = this.maxAllowedPacketField.getText(); - if (text.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The value for SQLProducer MaxAllowedPacket" - + " is missing.")); - } else { - try { - maxAllowedPacket = Long.parseLong(text); - if (maxAllowedPacket < 1000000) { - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "The value for SQLProducer " - + "MaxAllowedPacket should be at least" - + " 1000000 Byte.")); + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * Reads the configuration parameters described in the panel from the ConfigSettings and and + * sets the contained values. + * + * @param config + * Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) + { + + Object o = config.getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS); + if (o != null) { + this.articleTaskLimitField.setText(Long.toString((Long) o)); + } + else { + this.articleTaskLimitField.setText(""); + } + + o = config.getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS); + if (o != null) { + this.diffTaskLimitField.setText(Long.toString((Long) o)); + } + else { + this.diffTaskLimitField.setText(""); + } + + o = config.getConfigParameter(ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET); + if (o != null) { + this.maxAllowedPacketField.setText(Long.toString((Long) o)); + } + else { + this.maxAllowedPacketField.setText(""); } - } catch (NumberFormatException nfe) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.ILLEGAL_INPUT, - "NumberFormatException for the size of" - + " SQLProducer MaxAllowedPacket")); - } } - builder.append("\t\r\n"); - builder.append("\t\t" + tasksizeRevisions - + "\r\n"); - builder.append("\t\t" + tasksizeDiffs - + "\r\n"); - builder.append("\t\t" - + maxAllowedPacket - + "\r\n"); - - builder.append("\t\r\n"); - } + /** + * Adds the xml description of the panels content to the StringBuilder. Errors which occur + * during the xml transformation will be added to the ConfigVerification. 
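The toXML body that follows repeats the same parse-and-report pattern for each of its three limit fields. The sketch below is illustrative only and not part of this patch: it assumes the surrounding CachePanel class and uses only the ConfigItem, ConfigItemTypes, ConfigErrorKeys and ConfigVerification types already visible in this file; the helper name parseByteLimit is made up for the example.

    private long parseByteLimit(final String text, final String fieldName,
            final ConfigVerification errors)
    {
        // Empty input is reported as a missing value, mirroring the checks in toXML.
        if (text.length() == 0) {
            errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.MISSING_VALUE,
                    "The value for " + fieldName + " is missing."));
            return -1;
        }
        try {
            long value = Long.parseLong(text);
            // All three fields share the 1000000 byte lower bound used in toXML.
            if (value < 1000000) {
                errors.add(new ConfigItem(ConfigItemTypes.ERROR,
                        ConfigErrorKeys.VALUE_OUT_OF_RANGE,
                        "The value for " + fieldName + " has to be at least 1000000 Byte."));
            }
            return value;
        }
        catch (NumberFormatException nfe) {
            errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.ILLEGAL_INPUT,
                    "NumberFormatException for " + fieldName));
            return -1;
        }
    }

Note that the MAX_ALLOWED_PACKET range check reports ConfigItemTypes.WARNING rather than ERROR, so a shared helper of this kind would need the severity as an extra parameter.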
+ * + * @param builder + * Reference to a StringBuilder object + * @param errors + * Reference to the ConfigVerification object + */ + @Override + public void toXML(final StringBuilder builder, final ConfigVerification errors) + { + + long tasksizeRevisions = -1, tasksizeDiffs = -1, maxAllowedPacket = -1; + + // Check the ArticleTask size input + String text = this.articleTaskLimitField.getText(); + if (text.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.MISSING_VALUE, + "The value for the size of ArticleTasks" + " is missing.")); + } + else { + try { + tasksizeRevisions = Long.parseLong(text); + if (tasksizeRevisions < 1000000) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, "The value for the size of an " + + "ArticleTask has to be at least " + "1000000 Byte.")); + } + } + catch (NumberFormatException nfe) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.ILLEGAL_INPUT, + "NumberFormatException for the size of" + " ArticleTasks")); + } + } + + // Check the DiffTask size input + text = this.diffTaskLimitField.getText(); + if (text.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.MISSING_VALUE, + "The value for the size of DiffTasks" + " is missing.")); + } + else { + try { + tasksizeDiffs = Long.parseLong(text); + if (tasksizeDiffs < 1000000) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, + "The value for the size of a DiffTask " + + "has to be at least 1000000 Byte.")); + } + } + catch (NumberFormatException nfe) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.ILLEGAL_INPUT, + "NumberFormatException for the size of" + " DiffTasks")); + } + } + + // Check the SQLProducer MaxAllowedPacket input + text = this.maxAllowedPacketField.getText(); + if (text.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.MISSING_VALUE, + "The value for SQLProducer MaxAllowedPacket" + " is missing.")); + } + else { + try { + maxAllowedPacket = Long.parseLong(text); + if (maxAllowedPacket < 1000000) { + errors.add(new ConfigItem(ConfigItemTypes.WARNING, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, "The value for SQLProducer " + + "MaxAllowedPacket should be at least" + " 1000000 Byte.")); + } + } + catch (NumberFormatException nfe) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.ILLEGAL_INPUT, + "NumberFormatException for the size of" + " SQLProducer MaxAllowedPacket")); + } + } + + builder.append("\t\r\n"); + builder.append("\t\t" + tasksizeRevisions + + "\r\n"); + builder.append( + "\t\t" + tasksizeDiffs + "\r\n"); + builder.append("\t\t" + maxAllowedPacket + + "\r\n"); + + builder.append("\t\r\n"); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ConfigPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ConfigPanel.java index 5b79dbe1..5598a4ab 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ConfigPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ConfigPanel.java @@ -25,150 +25,158 @@ import org.dkpro.jwpl.revisionmachine.difftool.config.gui.control.ConfigVerification; /** - * Panel of the ConfigGUI Contains a tabbed panel with reference to all the - * other panels. 
+ * Panel of the ConfigGUI Contains a tabbed panel with reference to all the other panels. */ @SuppressWarnings("serial") public class ConfigPanel - extends AbstractPanel { + extends AbstractPanel +{ - private JTabbedPane tabs; + private JTabbedPane tabs; - private JButton importButton; - private JButton verifyButton; - private JButton exportButton; + private JButton importButton; + private JButton verifyButton; + private JButton exportButton; - /** - * (Constructor) Creates a new ConfigPanel. - * - * @param controller Reference to the controller - */ - public ConfigPanel(final ConfigController controller) { + /** + * (Constructor) Creates a new ConfigPanel. + * + * @param controller + * Reference to the controller + */ + public ConfigPanel(final ConfigController controller) + { - super(controller); + super(controller); - createTabbedPane(); + createTabbedPane(); - createImportButton(); - createVerifyButton(); - createExportButton(); - } + createImportButton(); + createVerifyButton(); + createExportButton(); + } - // --------------------------------------------------------------------------// - // CONSTRUCTION METHODS // - // --------------------------------------------------------------------------// + // --------------------------------------------------------------------------// + // CONSTRUCTION METHODS // + // --------------------------------------------------------------------------// - private void createTabbedPane() { + private void createTabbedPane() + { - tabs = new JTabbedPane(); - tabs.setBounds(5, 5, 580, 300); + tabs = new JTabbedPane(); + tabs.setBounds(5, 5, 580, 300); - tabs.add("Mode", new ModePanel(controller)); - tabs.add("Externals", new ExternalProgramsPanel(controller)); - tabs.add("Input", new InputPanel(controller)); - tabs.add("Output", new OutputPanel(controller)); - tabs.add("Database", new SQLPanel(controller)); - tabs.add("Cache", new CachePanel(controller)); - tabs.add("Logging", new LoggingPanel(controller)); - tabs.add("Debug", new DebugPanel(controller)); - tabs.add("Filter", new FilterPanel(controller)); + tabs.add("Mode", new ModePanel(controller)); + tabs.add("Externals", new ExternalProgramsPanel(controller)); + tabs.add("Input", new InputPanel(controller)); + tabs.add("Output", new OutputPanel(controller)); + tabs.add("Database", new SQLPanel(controller)); + tabs.add("Cache", new CachePanel(controller)); + tabs.add("Logging", new LoggingPanel(controller)); + tabs.add("Debug", new DebugPanel(controller)); + tabs.add("Filter", new FilterPanel(controller)); - this.add(tabs); + this.add(tabs); - } + } - private void createImportButton() { + private void createImportButton() + { - importButton = new JButton("Import"); - importButton.setBounds(5, 310, 190, 25); + importButton = new JButton("Import"); + importButton.setBounds(5, 310, 190, 25); - importButton.addActionListener(e -> { - controller.loadConfiguration(); - repaint(); - }); + importButton.addActionListener(e -> { + controller.loadConfiguration(); + repaint(); + }); - this.add(importButton); + this.add(importButton); - } + } - private void createVerifyButton() { + private void createVerifyButton() + { - verifyButton = new JButton("Verify Settings"); - verifyButton.setBounds(200, 310, 190, 25); + verifyButton = new JButton("Verify Settings"); + verifyButton.setBounds(200, 310, 190, 25); - verifyButton.addActionListener(e -> { - controller.createConfigurationXML(); - repaint(); - }); + verifyButton.addActionListener(e -> { + controller.createConfigurationXML(); + repaint(); + }); - 
this.add(verifyButton); - } + this.add(verifyButton); + } - private void createExportButton() { + private void createExportButton() + { - exportButton = new JButton("Export"); - exportButton.setBounds(395, 310, 190, 25); + exportButton = new JButton("Export"); + exportButton.setBounds(395, 310, 190, 25); - exportButton.addActionListener(e -> { - controller.saveConfiguration(); - repaint(); - }); + exportButton.addActionListener(e -> { + controller.saveConfiguration(); + repaint(); + }); - this.add(exportButton); - } + this.add(exportButton); + } - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// - /** - * empty method - */ - @Override - public void validate() { + /** + * empty method + */ + @Override + public void validate() + { - } + } + + /** + * A call of this method should validate the positions of the panels components. + */ + @Override + public void relocate() + { - /** - * A call of this method should validate the positions of the panels - * components. - */ - @Override - public void relocate() { + int w = 575, h = 330; - int w = 575, h = 330; + int x = (this.getWidth() - w) / 2; + int y = (this.getHeight() - h) / 2; - int x = (this.getWidth() - w) / 2; - int y = (this.getHeight() - h) / 2; + tabs.setLocation(x, y); - tabs.setLocation(x, y); + importButton.setLocation(x, y + 305); + verifyButton.setLocation(x + 195, y + 305); + exportButton.setLocation(x + 390, y + 305); - importButton.setLocation(x, y + 305); - verifyButton.setLocation(x + 195, y + 305); - exportButton.setLocation(x + 390, y + 305); + } - } + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// + /** + * @deprecated + */ + @Deprecated + @Override + public void applyConfig(final ConfigSettings config) + { + throw new UnsupportedOperationException(); + } - /** - * @deprecated - */ - @Deprecated - @Override - public void applyConfig(final ConfigSettings config) { - throw new UnsupportedOperationException(); - } - - /** - * @deprecated - */ - @Deprecated - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) { - throw new UnsupportedOperationException(); - } + /** + * @deprecated + */ + @Deprecated + @Override + public void toXML(final StringBuilder builder, final ConfigVerification errors) + { + throw new UnsupportedOperationException(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/DebugPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/DebugPanel.java index ea21ea18..e92114db 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/DebugPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/DebugPanel.java @@ -36,262 +36,269 @@ /** * Panel class of the ConfigurationTool *

- * This panel contains all components for setting configuration parameters - * related to the debug purposes. + * This panel contains all components for setting configuration parameters related to the debug + * purposes. */ @SuppressWarnings("serial") public class DebugPanel - extends AbstractPanel { + extends AbstractPanel +{ - private JCheckBox verifyDiffCheckBox; - private JCheckBox verifyEncodingCheckBox; + private JCheckBox verifyDiffCheckBox; + private JCheckBox verifyEncodingCheckBox; - private JCheckBox debugOuputCheckBox; - private JLabel debugOutputLabel; - private JTextField debugOutputField; + private JCheckBox debugOuputCheckBox; + private JLabel debugOutputLabel; + private JTextField debugOutputField; - private JCheckBox statsOutputCheckBox; + private JCheckBox statsOutputCheckBox; - /** - * (Constructor) Creates a new DebugPanel. - * - * @param controller Reference to the controller - */ - public DebugPanel(final ConfigController controller) { + /** + * (Constructor) Creates a new DebugPanel. + * + * @param controller + * Reference to the controller + */ + public DebugPanel(final ConfigController controller) + { - super(controller); - controller.register(PanelKeys.PANEL_DEBUG, this); + super(controller); + controller.register(PanelKeys.PANEL_DEBUG, this); - createVerificationSettings(); - createStatsOutputSettings(); - createDebugSettings(); - } + createVerificationSettings(); + createStatsOutputSettings(); + createDebugSettings(); + } - public void createVerificationSettings() { + public void createVerificationSettings() + { - verifyDiffCheckBox = new JCheckBox("Activate Diff Verification"); - verifyDiffCheckBox.setBounds(10, 10, 200, 25); + verifyDiffCheckBox = new JCheckBox("Activate Diff Verification"); + verifyDiffCheckBox.setBounds(10, 10, 200, 25); - verifyDiffCheckBox.addActionListener(e -> { + verifyDiffCheckBox.addActionListener(e -> { - boolean flag = !controller.isDiffVerificationEnabled(); - controller.setEnableDiffVerification(flag); + boolean flag = !controller.isDiffVerificationEnabled(); + controller.setEnableDiffVerification(flag); - validateDebugSettings(); - }); + validateDebugSettings(); + }); - this.add(verifyDiffCheckBox); + this.add(verifyDiffCheckBox); - verifyEncodingCheckBox = new JCheckBox("Activate Encoding Verification"); - verifyEncodingCheckBox.setBounds(10, 40, 200, 25); + verifyEncodingCheckBox = new JCheckBox("Activate Encoding Verification"); + verifyEncodingCheckBox.setBounds(10, 40, 200, 25); - verifyEncodingCheckBox.addActionListener(e -> { + verifyEncodingCheckBox.addActionListener(e -> { - boolean flag = !controller.isEncodingVerificationEnabled(); - controller.setEnableEncodingVerification(flag); + boolean flag = !controller.isEncodingVerificationEnabled(); + controller.setEnableEncodingVerification(flag); - validateDebugSettings(); - }); + validateDebugSettings(); + }); - this.add(verifyEncodingCheckBox); - } + this.add(verifyEncodingCheckBox); + } - private void createStatsOutputSettings() { - statsOutputCheckBox = new JCheckBox( - "Activate Article Information Output"); - statsOutputCheckBox.setBounds(10, 80, 250, 25); + private void createStatsOutputSettings() + { + statsOutputCheckBox = new JCheckBox("Activate Article Information Output"); + statsOutputCheckBox.setBounds(10, 80, 250, 25); - statsOutputCheckBox.addActionListener(e -> { + statsOutputCheckBox.addActionListener(e -> { - boolean flag = !controller.isStatsOutputEnabled(); - controller.setEnableStatsOutput(flag); - }); + boolean flag = 
!controller.isStatsOutputEnabled(); + controller.setEnableStatsOutput(flag); + }); - this.add(statsOutputCheckBox); - } + this.add(statsOutputCheckBox); + } - private void createDebugSettings() { + private void createDebugSettings() + { - debugOuputCheckBox = new JCheckBox("Activate Debug Output"); - debugOuputCheckBox.setBounds(10, 120, 200, 25); - this.add(debugOuputCheckBox); + debugOuputCheckBox = new JCheckBox("Activate Debug Output"); + debugOuputCheckBox.setBounds(10, 120, 200, 25); + this.add(debugOuputCheckBox); - debugOuputCheckBox.addActionListener(e -> { + debugOuputCheckBox.addActionListener(e -> { - boolean flag = !controller.isDebugOutputEnabled(); - controller.setEnableDebugOutput(flag); + boolean flag = !controller.isDebugOutputEnabled(); + controller.setEnableDebugOutput(flag); - validateDebugSettings(); - }); + validateDebugSettings(); + }); - debugOutputLabel = new JLabel("Debug Folder: "); - debugOutputLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - debugOutputLabel.setBounds(10, 150, 100, 25); - this.add(debugOutputLabel); + debugOutputLabel = new JLabel("Debug Folder: "); + debugOutputLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + debugOutputLabel.setBounds(10, 150, 100, 25); + this.add(debugOutputLabel); - debugOutputField = new JTextField(); - debugOutputField.setBounds(120, 150, 250, 25); - this.add(debugOutputField); - } + debugOutputField = new JTextField(); + debugOutputField.setBounds(120, 150, 250, 25); + this.add(debugOutputField); + } - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// + + /** + * A call of this method should validate the status of the panels components. + */ + @Override + public void validate() + { + validateDebugSettings(); + } - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public void validate() { - validateDebugSettings(); - } + /** + * Validates the debug settings. + */ + private void validateDebugSettings() + { - /** - * Validates the debug settings. 
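The checkbox listeners above toggle the controller flag by negating its current value and then re-synchronise the checkboxes in validateDebugSettings(). A purely illustrative variant, not part of this patch, reads the new state from the checkbox itself, so the controller always follows what the user actually selected:

    debugOuputCheckBox.addActionListener(e -> {
        // After the click the checkbox already reflects the user's choice,
        // so its selection state can be pushed to the controller directly.
        controller.setEnableDebugOutput(debugOuputCheckBox.isSelected());
        validateDebugSettings();
    });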
- */ - private void validateDebugSettings() { + verifyDiffCheckBox.setSelected(controller.isDiffVerificationEnabled()); + verifyEncodingCheckBox.setSelected(controller.isEncodingVerificationEnabled()); + statsOutputCheckBox.setSelected(controller.isStatsOutputEnabled()); - verifyDiffCheckBox.setSelected(controller.isDiffVerificationEnabled()); - verifyEncodingCheckBox.setSelected(controller - .isEncodingVerificationEnabled()); - statsOutputCheckBox.setSelected(controller.isStatsOutputEnabled()); + boolean flagA = controller.isDiffVerificationEnabled() + || controller.isEncodingVerificationEnabled(); - boolean flagA = controller.isDiffVerificationEnabled() - || controller.isEncodingVerificationEnabled(); + debugOuputCheckBox.setEnabled(flagA); + debugOuputCheckBox.setSelected(controller.isDebugOutputEnabled()); - debugOuputCheckBox.setEnabled(flagA); - debugOuputCheckBox.setSelected(controller.isDebugOutputEnabled()); + boolean flagB = controller.isDebugOutputEnabled(); + debugOutputLabel.setEnabled(flagA && flagB); + debugOutputField.setEnabled(flagA && flagB); - boolean flagB = controller.isDebugOutputEnabled(); - debugOutputLabel.setEnabled(flagA && flagB); - debugOutputField.setEnabled(flagA && flagB); + } - } + /** + * A call of this method should validate the positions of the panels components. + */ + @Override + public void relocate() + { - /** - * A call of this method should validate the positions of the panels - * components. - */ - @Override - public void relocate() { + int w = 360, h = 165; + int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; - int w = 360, h = 165; - int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; + verifyDiffCheckBox.setLocation(x, y); + verifyEncodingCheckBox.setLocation(x, y + 30); - verifyDiffCheckBox.setLocation(x, y); - verifyEncodingCheckBox.setLocation(x, y + 30); + statsOutputCheckBox.setLocation(x, y + 70); - statsOutputCheckBox.setLocation(x, y + 70); - - debugOuputCheckBox.setLocation(x, y + 110); - debugOutputLabel.setLocation(x, y + 140); - debugOutputField.setLocation(x + 110, y + 140); - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. - * - * @param config Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) { - Object o = config - .getConfigParameter(ConfigurationKeys.VERIFICATION_DIFF); - if (o != null) { - controller.setEnableDiffVerification((Boolean) o); - } else { - controller.setEnableDiffVerification(false); + debugOuputCheckBox.setLocation(x, y + 110); + debugOutputLabel.setLocation(x, y + 140); + debugOutputField.setLocation(x + 110, y + 140); } - o = config.getConfigParameter(ConfigurationKeys.VERIFICATION_ENCODING); - if (o != null) { - controller.setEnableEncodingVerification((Boolean) o); - } else { - controller.setEnableEncodingVerification(false); - } + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * Reads the configuration parameters described in the panel from the ConfigSettings and and + * sets the contained values. 
+ * + * @param config + * Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) + { + Object o = config.getConfigParameter(ConfigurationKeys.VERIFICATION_DIFF); + if (o != null) { + controller.setEnableDiffVerification((Boolean) o); + } + else { + controller.setEnableDiffVerification(false); + } - o = config - .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - if (o != null) { - controller.setEnableStatsOutput((Boolean) o); - } else { - controller.setEnableStatsOutput(false); - } + o = config.getConfigParameter(ConfigurationKeys.VERIFICATION_ENCODING); + if (o != null) { + controller.setEnableEncodingVerification((Boolean) o); + } + else { + controller.setEnableEncodingVerification(false); + } - o = config.getConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG); - if (o != null) { - controller.setEnableDebugOutput(true); - this.debugOutputField.setText((String) o); - } else { - controller.setEnableDebugOutput(false); - this.debugOutputField.setText(""); - } - } - - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. - * - * @param builder Reference to a StringBuilder object - * @param errors Reference to the ConfigVerification object - */ - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) { - - boolean verifyDiff = controller.isDiffVerificationEnabled(); - boolean verifyEncoding = controller.isEncodingVerificationEnabled(); - boolean statsOutput = controller.isStatsOutputEnabled(); - boolean debugOutput = controller.isDebugOutputEnabled(); - - if (verifyDiff || verifyEncoding || statsOutput || debugOutput) { - - builder.append("\t\r\n"); - - if (verifyDiff) { - builder.append("\t\t" + verifyDiff - + "\r\n"); - } - - if (verifyEncoding) { - builder.append("\t\t" + verifyEncoding - + "\r\n"); - } - - if (statsOutput) { - builder.append("\t\t" + statsOutput - + "\r\n"); - } - - builder.append("\t\t\r\n"); // \"" + path + - // "\"\r\n"); - builder.append("\t\t\t" + debugOutput + "\r\n"); - - if (debugOutput) { - - String path = debugOutputField.getText(); - if (path.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.PATH_NOT_SET, - "The folder of the debug output is not specified.")); + o = config.getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + if (o != null) { + controller.setEnableStatsOutput((Boolean) o); } - if (!path.endsWith(File.separator) - && path.contains(File.separator)) { - path += File.separator; + else { + controller.setEnableStatsOutput(false); } - builder.append("\t\t\t\"" + path + "\"\r\n"); - } + o = config.getConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG); + if (o != null) { + controller.setEnableDebugOutput(true); + this.debugOutputField.setText((String) o); + } + else { + controller.setEnableDebugOutput(false); + this.debugOutputField.setText(""); + } + } - builder.append("\t\t\r\n"); - builder.append("\t\r\n"); + /** + * Adds the xml description of the panels content to the StringBuilder. Errors which occur + * during the xml transformation will be added to the ConfigVerification. 
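applyConfig above repeats the same null-checked lookup for every configuration key. As an illustration only (the helper below is not part of this patch, and it assumes that getConfigParameter accepts a ConfigurationKeys constant, as the calls above suggest), the Boolean cases could be routed through a small default-value helper:

    private static boolean booleanParam(final ConfigSettings config, final ConfigurationKeys key)
    {
        // Missing parameters fall back to false, matching the else-branches in applyConfig.
        Object o = config.getConfigParameter(key);
        return o != null && (Boolean) o;
    }

    // Hypothetical usage for the first two blocks of applyConfig:
    // controller.setEnableDiffVerification(booleanParam(config, ConfigurationKeys.VERIFICATION_DIFF));
    // controller.setEnableEncodingVerification(booleanParam(config, ConfigurationKeys.VERIFICATION_ENCODING));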
+ * + * @param builder + * Reference to a StringBuilder object + * @param errors + * Reference to the ConfigVerification object + */ + @Override + public void toXML(final StringBuilder builder, final ConfigVerification errors) + { + + boolean verifyDiff = controller.isDiffVerificationEnabled(); + boolean verifyEncoding = controller.isEncodingVerificationEnabled(); + boolean statsOutput = controller.isStatsOutputEnabled(); + boolean debugOutput = controller.isDebugOutputEnabled(); + + if (verifyDiff || verifyEncoding || statsOutput || debugOutput) { + + builder.append("\t\r\n"); + + if (verifyDiff) { + builder.append("\t\t" + verifyDiff + "\r\n"); + } + + if (verifyEncoding) { + builder.append("\t\t" + verifyEncoding + + "\r\n"); + } + + if (statsOutput) { + builder.append( + "\t\t" + statsOutput + "\r\n"); + } + + builder.append("\t\t\r\n"); // \"" + path + + // "\"\r\n"); + builder.append("\t\t\t" + debugOutput + "\r\n"); + + if (debugOutput) { + + String path = debugOutputField.getText(); + if (path.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.WARNING, ConfigErrorKeys.PATH_NOT_SET, + "The folder of the debug output is not specified.")); + } + if (!path.endsWith(File.separator) && path.contains(File.separator)) { + path += File.separator; + } + + builder.append("\t\t\t\"" + path + "\"\r\n"); + } + + builder.append("\t\t\r\n"); + builder.append("\t\r\n"); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ExternalProgramsPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ExternalProgramsPanel.java index e34339fa..e0ca2b60 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ExternalProgramsPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ExternalProgramsPanel.java @@ -37,188 +37,194 @@ /** * Panel class of the ConfigurationTool *

- * This panel contains all components for setting configuration parameters - * related to the use of external programs. + * This panel contains all components for setting configuration parameters related to the use of + * external programs. */ @SuppressWarnings("serial") public class ExternalProgramsPanel - extends AbstractPanel { + extends AbstractPanel +{ - private JLabel executablePathLabel; + private JLabel executablePathLabel; - private JCheckBox sevenZipEnableBox; - private JLabel sevenZipLabel; - private JTextField sevenZipPathField; - private JButton sevenZipSearchButton; + private JCheckBox sevenZipEnableBox; + private JLabel sevenZipLabel; + private JTextField sevenZipPathField; + private JButton sevenZipSearchButton; - /** - * (Constructor) Creates a new ExternalProgramPanel. - * - * @param controller Reference to the controller - */ - public ExternalProgramsPanel(final ConfigController controller) { + /** + * (Constructor) Creates a new ExternalProgramPanel. + * + * @param controller + * Reference to the controller + */ + public ExternalProgramsPanel(final ConfigController controller) + { - super(controller); - controller.register(PanelKeys.PANEL_EXTERNALS, this); + super(controller); + controller.register(PanelKeys.PANEL_EXTERNALS, this); - createExecutableSettings(); - } + createExecutableSettings(); + } + + // --------------------------------------------------------------------------// + // CONSTRUCTION METHODS // + // --------------------------------------------------------------------------// + + private void createExecutableSettings() + { + + executablePathLabel = new JLabel("Path to executables: "); + executablePathLabel.setBounds(10, 10, 250, 25); + this.add(executablePathLabel); + + // ------------------------------------------------------------------// + // 7ZIP / P7ZIP SETTINGS // + // ------------------------------------------------------------------// + + sevenZipEnableBox = new JCheckBox(); + sevenZipEnableBox.setBounds(10, 45, 25, 25); + + this.add(sevenZipEnableBox); + + sevenZipLabel = new JLabel("7Zip Executable: "); + sevenZipLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + sevenZipLabel.setBounds(40, 45, 120, 25); + this.add(sevenZipLabel); + + sevenZipPathField = new JTextField(); + sevenZipPathField.setBounds(170, 45, 300, 25); + this.add(sevenZipPathField); + + sevenZipSearchButton = new JButton("Search"); + sevenZipSearchButton.setBounds(480, 45, 80, 25); + + sevenZipSearchButton.addActionListener(e -> { + + JFileChooser fc = new JFileChooser(); + if (fc.showOpenDialog(new JPanel()) == JFileChooser.APPROVE_OPTION) { + sevenZipPathField.setText(fc.getSelectedFile().getPath()); + } + }); + + this.add(sevenZipSearchButton); - // --------------------------------------------------------------------------// - // CONSTRUCTION METHODS // - // --------------------------------------------------------------------------// + sevenZipEnableBox.addActionListener(e -> { + boolean flag = !controller.is7ZipEnabled(); + controller.setEnable7Zip(flag); - private void createExecutableSettings() { + sevenZipLabel.setEnabled(flag); + sevenZipPathField.setEnabled(flag); + sevenZipSearchButton.setEnabled(flag); + }); - executablePathLabel = new JLabel("Path to executables: "); - executablePathLabel.setBounds(10, 10, 250, 25); - this.add(executablePathLabel); + } + + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// + + /** + * A 
call of this method should validate the status of the panels components. + */ + @Override + public void validate() + { + validate7ZipSettings(); + } - // ------------------------------------------------------------------// - // 7ZIP / P7ZIP SETTINGS // - // ------------------------------------------------------------------// - - sevenZipEnableBox = new JCheckBox(); - sevenZipEnableBox.setBounds(10, 45, 25, 25); - - this.add(sevenZipEnableBox); - - sevenZipLabel = new JLabel("7Zip Executable: "); - sevenZipLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - sevenZipLabel.setBounds(40, 45, 120, 25); - this.add(sevenZipLabel); - - sevenZipPathField = new JTextField(); - sevenZipPathField.setBounds(170, 45, 300, 25); - this.add(sevenZipPathField); - - sevenZipSearchButton = new JButton("Search"); - sevenZipSearchButton.setBounds(480, 45, 80, 25); - - sevenZipSearchButton.addActionListener(e -> { - - JFileChooser fc = new JFileChooser(); - if (fc.showOpenDialog(new JPanel()) == JFileChooser.APPROVE_OPTION) { - sevenZipPathField.setText(fc.getSelectedFile().getPath()); - } - }); - - this.add(sevenZipSearchButton); - - sevenZipEnableBox.addActionListener(e -> { - boolean flag = !controller.is7ZipEnabled(); - controller.setEnable7Zip(flag); - - sevenZipLabel.setEnabled(flag); - sevenZipPathField.setEnabled(flag); - sevenZipSearchButton.setEnabled(flag); - }); - - } - - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// - - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public void validate() { - validate7ZipSettings(); - } - - /** - * Validates the 7Zip settings - */ - private void validate7ZipSettings() { - boolean flag = controller.is7ZipEnabled(); - - sevenZipEnableBox.setSelected(flag); - sevenZipLabel.setEnabled(flag); - sevenZipPathField.setEnabled(flag); - sevenZipSearchButton.setEnabled(flag); - } - - - /** - * A call of this method should validate the positions of the panels - * components. - */ - @Override - public void relocate() { - - int w = 550, h = 210; - - int x = (this.getWidth() - w) / 2; - int y = (this.getHeight() - h) / 2; - - // 10, 10 <-> 580, 185 - executablePathLabel.setLocation(x, y); - - sevenZipEnableBox.setLocation(x, y + 35); - sevenZipLabel.setLocation(x + 30, y + 35); - sevenZipPathField.setLocation(x + 160, y + 35); - sevenZipSearchButton.setLocation(x + 470, y + 35); - - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. 
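In createExecutableSettings above, the file chooser dialog is opened with a freshly created JPanel as its parent component. A purely illustrative variant, not part of this patch, passes the panel itself so that the dialog is positioned relative to the configuration window:

    sevenZipSearchButton.addActionListener(e -> {
        JFileChooser fc = new JFileChooser();
        // Using the enclosing panel as parent anchors the dialog to the tool's window.
        if (fc.showOpenDialog(this) == JFileChooser.APPROVE_OPTION) {
            sevenZipPathField.setText(fc.getSelectedFile().getPath());
        }
    });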
- * - * @param config Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) { - Object o = config - .getConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP); - if (o != null) { - controller.setEnable7Zip(true); - sevenZipPathField.setText((String) o); - } else { - controller.setEnable7Zip(false); - sevenZipPathField.setText(""); + /** + * Validates the 7Zip settings + */ + private void validate7ZipSettings() + { + boolean flag = controller.is7ZipEnabled(); + + sevenZipEnableBox.setSelected(flag); + sevenZipLabel.setEnabled(flag); + sevenZipPathField.setEnabled(flag); + sevenZipSearchButton.setEnabled(flag); } - } - - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. - * - * @param builder Reference to a StringBuilder object - * @param errors Reference to the ConfigVerification object - */ - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) { - - boolean sevenzip = controller.is7ZipEnabled(); - - if (sevenzip) { - - String cmd; - builder.append("\t\r\n"); - - if (sevenzip) { - cmd = sevenZipPathField.getText(); - if (cmd.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.PATH_NOT_SET, - "The path to the 7Zip executable" + " is missing.")); + /** + * A call of this method should validate the positions of the panels components. + */ + @Override + public void relocate() + { + + int w = 550, h = 210; + + int x = (this.getWidth() - w) / 2; + int y = (this.getHeight() - h) / 2; + + // 10, 10 <-> 580, 185 + executablePathLabel.setLocation(x, y); + + sevenZipEnableBox.setLocation(x, y + 35); + sevenZipLabel.setLocation(x + 30, y + 35); + sevenZipPathField.setLocation(x + 160, y + 35); + sevenZipSearchButton.setLocation(x + 470, y + 35); + + } + + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * Reads the configuration parameters described in the panel from the ConfigSettings and and + * sets the contained values. + * + * @param config + * Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) + { + Object o = config.getConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP); + if (o != null) { + controller.setEnable7Zip(true); + sevenZipPathField.setText((String) o); + } + else { + controller.setEnable7Zip(false); + sevenZipPathField.setText(""); } - builder.append("\t\t\"" + cmd + "\"\r\n"); - } + } - builder.append("\t\r\n"); + /** + * Adds the xml description of the panels content to the StringBuilder. Errors which occur + * during the xml transformation will be added to the ConfigVerification. 
+ * + * @param builder + * Reference to a StringBuilder object + * @param errors + * Reference to the ConfigVerification object + */ + @Override + public void toXML(final StringBuilder builder, final ConfigVerification errors) + { + + boolean sevenzip = controller.is7ZipEnabled(); + + if (sevenzip) { + + String cmd; + builder.append("\t\r\n"); + + if (sevenzip) { + cmd = sevenZipPathField.getText(); + if (cmd.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.PATH_NOT_SET, + "The path to the 7Zip executable" + " is missing.")); + } + + builder.append("\t\t\"" + cmd + "\"\r\n"); + } + + builder.append("\t\r\n"); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/FilterPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/FilterPanel.java index 85aac50a..5304268a 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/FilterPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/FilterPanel.java @@ -35,205 +35,208 @@ /** * Panel class of the ConfigurationTool *

- * This panel contains all components for setting configuration parameters - * related to the filtering. + * This panel contains all components for setting configuration parameters related to the filtering. */ @SuppressWarnings("serial") public class FilterPanel - extends AbstractPanel { - // table with namespaces to filter - private JTable namespaces; - - /** - * (Constructor) Creates a new SurrogatePanel - * - * @param controller Reference to the controller - */ - public FilterPanel(ConfigController controller) { - super(controller); - - controller.register(PanelKeys.PANEL_FILTER, this); - - initTable(); - - initButtons(); + extends AbstractPanel +{ + // table with namespaces to filter + private JTable namespaces; + + /** + * (Constructor) Creates a new SurrogatePanel + * + * @param controller + * Reference to the controller + */ + public FilterPanel(ConfigController controller) + { + super(controller); + + controller.register(PanelKeys.PANEL_FILTER, this); + + initTable(); + + initButtons(); + + // init label + JLabel hint = new JLabel(); + hint.setText("If nothing is selected,
all namespaces are allowed."); + hint.setBounds(385, 70, 180, 60); + this.add(hint); + } - // init label - JLabel hint = new JLabel(); - hint.setText("If nothing is selected,
all namespaces are allowed."); - hint.setBounds(385, 70, 180, 60); - this.add(hint); - } + /** + * Initialize JTable that contains namespaces + */ + private void initTable() + { + namespaces = new JTable(new FilterTableModel()); - /** - * Initialize JTable that contains namespaces - */ - private void initTable() { - namespaces = new JTable(new FilterTableModel()); + namespaces.removeColumn(namespaces.getColumn("#")); - namespaces.removeColumn(namespaces.getColumn("#")); + namespaces.setFillsViewportHeight(true); + namespaces.setPreferredScrollableViewportSize(new Dimension(500, 70)); - namespaces.setFillsViewportHeight(true); - namespaces.setPreferredScrollableViewportSize(new Dimension(500, 70)); + // Create the scroll pane and add the table to it. + JScrollPane scrollPane = new JScrollPane(namespaces); - // Create the scroll pane and add the table to it. - JScrollPane scrollPane = new JScrollPane(namespaces); + scrollPane.setBounds(70, 10, 300, 200); + this.add(scrollPane); + } - scrollPane.setBounds(70, 10, 300, 200); - this.add(scrollPane); - } + /** + * Initialize two buttons: SelectAll and UnselectAll + */ + private void initButtons() + { + JButton selectAll = new JButton("Select all"); + selectAll.addActionListener(arg0 -> { + for (int i = 0; i < 22; i++) { + namespaces.getModel().setValueAt(true, i, 1); + } - /** - * Initialize two buttons: SelectAll and UnselectAll - */ - private void initButtons() { - JButton selectAll = new JButton("Select all"); - selectAll.addActionListener(arg0 -> { - for (int i = 0; i < 22; i++) { - namespaces.getModel().setValueAt(true, i, 1); - } + }); + selectAll.setBounds(380, 10, 120, 25); + this.add(selectAll); - }); - selectAll.setBounds(380, 10, 120, 25); - this.add(selectAll); + JButton unselectAll = new JButton("Unselect all"); - JButton unselectAll = new JButton("Unselect all"); + unselectAll.addActionListener(e -> { + for (int i = 0; i < 22; i++) { + namespaces.getModel().setValueAt(false, i, 1); + } - unselectAll.addActionListener(e -> { - for (int i = 0; i < 22; i++) { - namespaces.getModel().setValueAt(false, i, 1); - } + }); - }); + unselectAll.setBounds(380, 40, 120, 25); + this.add(unselectAll); + } - unselectAll.setBounds(380, 40, 120, 25); - this.add(unselectAll); - } + @Override + public void validate() + { - @Override - public void validate() { + } - } + @Override + public void relocate() + { - @Override - public void relocate() { + } - } + @Override + public void toXML(StringBuilder builder, ConfigVerification errors) + { + builder.append("\t\r\n"); + builder.append("\t\t\r\n"); + int rows = this.namespaces.getModel().getRowCount(); + for (int j = 0; j < rows; j++) { + + if (this.namespaces.getModel().getValueAt(j, 1).equals(true)) { + builder.append("\t\t\t"); + builder.append(this.namespaces.getModel().getValueAt(j, 2)); + builder.append("\r\n"); + } - @Override - public void toXML(StringBuilder builder, ConfigVerification errors) { - builder.append("\t\r\n"); - builder.append("\t\t\r\n"); - int rows = this.namespaces.getModel().getRowCount(); - for (int j = 0; j < rows; j++) { + } - if (this.namespaces.getModel().getValueAt(j, 1).equals(true)) { - builder.append("\t\t\t"); - builder.append(this.namespaces.getModel().getValueAt(j, 2)); - builder.append("\r\n"); - } + builder.append("\t\t\r\n"); + builder.append("\t\r\n"); } - builder.append("\t\t\r\n"); - builder.append("\t\r\n"); + @Override + public void applyConfig(ConfigSettings config) + { + @SuppressWarnings("unchecked") + Set namespaces = (Set) config + 
.getConfigParameter(ConfigurationKeys.NAMESPACES_TO_KEEP); - } + if (namespaces != null) { - @Override - public void applyConfig(ConfigSettings config) { - @SuppressWarnings("unchecked") - Set namespaces = (Set) config - .getConfigParameter(ConfigurationKeys.NAMESPACES_TO_KEEP); + int rows = this.namespaces.getModel().getRowCount(); + for (int j = 0; j < rows; j++) { + if (namespaces.contains((this.namespaces.getModel().getValueAt(j, 2)))) { + this.namespaces.getModel().setValueAt(true, j, 1); + } + else { + this.namespaces.getModel().setValueAt(false, j, 1); + } - if (namespaces != null) { + } - int rows = this.namespaces.getModel().getRowCount(); - for (int j = 0; j < rows; j++) { - if (namespaces.contains((this.namespaces.getModel().getValueAt( - j, 2)))) { - this.namespaces.getModel().setValueAt(true, j, - 1); - } else { - this.namespaces.getModel().setValueAt(false, - j, 1); } - } - } - } - - /** - * Custom model for JTable that contains a list of namespaces to filter - */ - class FilterTableModel - extends AbstractTableModel { - private final String[] columnNames = {"Namespace", "Allow", "#"}; - - private final Object[][] data = {{"main(0)", false, 0}, - {"talk(1)", false, 1}, - {"user(2)", false, 2}, - {"user talk(3)", false, 3}, - {"wikipedia(4)", false, 4}, - {"wikipedia talk(5)", false, 5}, - {"file(6)", false, 6}, - {"file talk(7)", false, 7}, - {"mediawiki(8)", false, 8}, - {"mediawiki talk(9)", false, 9}, - {"template(10)", false, 10}, - {"template talk(11)", false, 11}, - {"help(12)", false, 12}, - {"help talk(13)", false, 13}, - {"category(14)", false, 14}, - {"category talk(15)", false, 15}, - {"portal(100)", false, 100}, - {"portal talk(101)", false, 101}, - {"book(108)", false, 108}, - {"book talk(109)", false, 109}, - {"special(-1)", false, -1}, - {"media(-2)", false, -2} - - }; + /** + * Custom model for JTable that contains a list of namespaces to filter + */ + class FilterTableModel + extends AbstractTableModel + { + private final String[] columnNames = { "Namespace", "Allow", "#" }; + + private final Object[][] data = { { "main(0)", false, 0 }, { "talk(1)", false, 1 }, + { "user(2)", false, 2 }, { "user talk(3)", false, 3 }, { "wikipedia(4)", false, 4 }, + { "wikipedia talk(5)", false, 5 }, { "file(6)", false, 6 }, + { "file talk(7)", false, 7 }, { "mediawiki(8)", false, 8 }, + { "mediawiki talk(9)", false, 9 }, { "template(10)", false, 10 }, + { "template talk(11)", false, 11 }, { "help(12)", false, 12 }, + { "help talk(13)", false, 13 }, { "category(14)", false, 14 }, + { "category talk(15)", false, 15 }, { "portal(100)", false, 100 }, + { "portal talk(101)", false, 101 }, { "book(108)", false, 108 }, + { "book talk(109)", false, 109 }, { "special(-1)", false, -1 }, + { "media(-2)", false, -2 } + + }; + + @Override + public int getColumnCount() + { + return columnNames.length; + } - @Override - public int getColumnCount() { - return columnNames.length; - } + @Override + public int getRowCount() + { + return data.length; + } - @Override - public int getRowCount() { - return data.length; - } + @Override + public String getColumnName(int col) + { + return columnNames[col]; + } - @Override - public String getColumnName(int col) { - return columnNames[col]; - } + @Override + public Object getValueAt(int row, int col) + { + return data[row][col]; + } - @Override - public Object getValueAt(int row, int col) { - return data[row][col]; - } + @SuppressWarnings({ "unchecked", "rawtypes" }) + @Override + public Class getColumnClass(int c) + { + return getValueAt(0, 
c).getClass(); + } - @SuppressWarnings({"unchecked", "rawtypes"}) - @Override - public Class getColumnClass(int c) { - return getValueAt(0, c).getClass(); - } + @Override + public boolean isCellEditable(int row, int col) + { + return true; + } - @Override - public boolean isCellEditable(int row, int col) { - return true; - } + @Override + public void setValueAt(Object value, int row, int col) + { + data[row][col] = value; + fireTableCellUpdated(row, col); + } - @Override - public void setValueAt(Object value, int row, int col) { - data[row][col] = value; - fireTableCellUpdated(row, col); } - } - } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/InputPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/InputPanel.java index 17ebcdbd..1cb81245 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/InputPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/InputPanel.java @@ -43,448 +43,449 @@ /** * Panel class of the ConfigurationTool *
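FilterTableModel above returns Boolean.class from getColumnClass for the "Allow" column, which is what makes JTable render and edit those cells as checkboxes. The following standalone demo is not part of this patch; it only shows that mechanism in isolation:

    import javax.swing.JFrame;
    import javax.swing.JScrollPane;
    import javax.swing.JTable;
    import javax.swing.table.AbstractTableModel;

    public class BooleanColumnDemo
    {
        public static void main(String[] args)
        {
            JTable table = new JTable(new AbstractTableModel()
            {
                private final Object[][] data = { { "main(0)", false }, { "talk(1)", true } };

                @Override
                public int getRowCount()
                {
                    return data.length;
                }

                @Override
                public int getColumnCount()
                {
                    return 2;
                }

                @Override
                public Object getValueAt(int row, int col)
                {
                    return data[row][col];
                }

                @Override
                public Class<?> getColumnClass(int col)
                {
                    // Boolean.class for column 1 switches JTable to a checkbox renderer/editor.
                    return getValueAt(0, col).getClass();
                }

                @Override
                public boolean isCellEditable(int row, int col)
                {
                    return col == 1;
                }

                @Override
                public void setValueAt(Object value, int row, int col)
                {
                    data[row][col] = value;
                    fireTableCellUpdated(row, col);
                }
            });

            JFrame frame = new JFrame("Boolean column demo");
            frame.add(new JScrollPane(table));
            frame.pack();
            frame.setVisible(true);
        }
    }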

- * This panel contains all components for setting configuration parameters - * related to the input data. + * This panel contains all components for setting configuration parameters related to the input + * data. */ @SuppressWarnings("serial") public class InputPanel - extends AbstractPanel { - - /** - * Subpanel of the InputPanel - *

- * Contains the settings related to the surrogate mode - */ - private class SurrogatePanel - extends AbstractPanel { - - private JLabel surrogateLabel; - private JRadioButton replaceSurrogatesRadioButton; - private JRadioButton faultySurrogatesRadioButton; - private JRadioButton discardSurrogatesRevisionRadioButton; - private JRadioButton discardSurrogatesArticleRadioButton; + extends AbstractPanel +{ /** - * (Constructor) Creates a new SurrogatePanel - * - * @param controller Reference to the controller + * Subpanel of the InputPanel + *

+ * Contains the settings related to the surrogate mode */ - public SurrogatePanel(final ConfigController controller) { - super(controller); - createButtons(); - } + private class SurrogatePanel + extends AbstractPanel + { + + private JLabel surrogateLabel; + private JRadioButton replaceSurrogatesRadioButton; + private JRadioButton faultySurrogatesRadioButton; + private JRadioButton discardSurrogatesRevisionRadioButton; + private JRadioButton discardSurrogatesArticleRadioButton; + + /** + * (Constructor) Creates a new SurrogatePanel + * + * @param controller + * Reference to the controller + */ + public SurrogatePanel(final ConfigController controller) + { + super(controller); + createButtons(); + } - private void createButtons() { - surrogateLabel = new JLabel("Surrogate Characters"); - surrogateLabel.setBounds(10, 10, 130, 25); - this.add(surrogateLabel); + private void createButtons() + { + surrogateLabel = new JLabel("Surrogate Characters"); + surrogateLabel.setBounds(10, 10, 130, 25); + this.add(surrogateLabel); - /* - * DEFAULT MODE - */ - discardSurrogatesRevisionRadioButton = new JRadioButton( - "Discard revision"); - discardSurrogatesRevisionRadioButton.setBounds(10, 90, 120, 25); + /* + * DEFAULT MODE + */ + discardSurrogatesRevisionRadioButton = new JRadioButton("Discard revision"); + discardSurrogatesRevisionRadioButton.setBounds(10, 90, 120, 25); - discardSurrogatesRevisionRadioButton - .addActionListener(e -> { + discardSurrogatesRevisionRadioButton.addActionListener(e -> { if (controller.getSurrogates() != SurrogateModes.DISCARD_REVISION) { - controller - .setSurrogates(SurrogateModes.DISCARD_REVISION); + controller.setSurrogates(SurrogateModes.DISCARD_REVISION); } validateSurrogateSettings(); - }); + }); - // pre-activate default mode - discardSurrogatesRevisionRadioButton.setSelected(true); + // pre-activate default mode + discardSurrogatesRevisionRadioButton.setSelected(true); - this.add(discardSurrogatesRevisionRadioButton); + this.add(discardSurrogatesRevisionRadioButton); - /* - * REPLACE-Mode - */ + /* + * REPLACE-Mode + */ - replaceSurrogatesRadioButton = new JRadioButton("Replace them"); - replaceSurrogatesRadioButton.setBounds(10, 40, 120, 25); + replaceSurrogatesRadioButton = new JRadioButton("Replace them"); + replaceSurrogatesRadioButton.setBounds(10, 40, 120, 25); - replaceSurrogatesRadioButton.addActionListener(e -> { + replaceSurrogatesRadioButton.addActionListener(e -> { - if (controller.getSurrogates() != SurrogateModes.REPLACE) { - controller.setSurrogates(SurrogateModes.REPLACE); - } + if (controller.getSurrogates() != SurrogateModes.REPLACE) { + controller.setSurrogates(SurrogateModes.REPLACE); + } - validateSurrogateSettings(); - }); - this.add(replaceSurrogatesRadioButton); + validateSurrogateSettings(); + }); + this.add(replaceSurrogatesRadioButton); - /* - * THROW_ERROR-Mode - */ + /* + * THROW_ERROR-Mode + */ - faultySurrogatesRadioButton = new JRadioButton("Throw an error"); - faultySurrogatesRadioButton.setBounds(10, 65, 120, 25); + faultySurrogatesRadioButton = new JRadioButton("Throw an error"); + faultySurrogatesRadioButton.setBounds(10, 65, 120, 25); - faultySurrogatesRadioButton.addActionListener(e -> { + faultySurrogatesRadioButton.addActionListener(e -> { - if (controller.getSurrogates() != SurrogateModes.THROW_ERROR) { - controller.setSurrogates(SurrogateModes.THROW_ERROR); - } + if (controller.getSurrogates() != SurrogateModes.THROW_ERROR) { + controller.setSurrogates(SurrogateModes.THROW_ERROR); + } - validateSurrogateSettings(); - }); - 
this.add(faultySurrogatesRadioButton); + validateSurrogateSettings(); + }); + this.add(faultySurrogatesRadioButton); - /* - * DISCARD_REST-Mode - */ + /* + * DISCARD_REST-Mode + */ - discardSurrogatesArticleRadioButton = new JRadioButton( - "Discard rest"); - discardSurrogatesArticleRadioButton.setBounds(10, 115, 120, 25); + discardSurrogatesArticleRadioButton = new JRadioButton("Discard rest"); + discardSurrogatesArticleRadioButton.setBounds(10, 115, 120, 25); - discardSurrogatesArticleRadioButton - .addActionListener(e -> { + discardSurrogatesArticleRadioButton.addActionListener(e -> { if (controller.getSurrogates() != SurrogateModes.DISCARD_REST) { - controller - .setSurrogates(SurrogateModes.DISCARD_REST); + controller.setSurrogates(SurrogateModes.DISCARD_REST); } validateSurrogateSettings(); - }); - this.add(discardSurrogatesArticleRadioButton); + }); + this.add(discardSurrogatesArticleRadioButton); - } + } - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public void validate() { - validateSurrogateSettings(); - } + /** + * A call of this method should validate the status of the panels components. + */ + @Override + public void validate() + { + validateSurrogateSettings(); + } - /** - * Validates the surrogate settings. - */ - private void validateSurrogateSettings() { - - - /* - * TODO Uncomment this code as soon as the surrogate modes are reactivated - */ - -// SurrogateModes sur = controller.getSurrogates(); -// -// replaceSurrogatesRadioButton -// .setSelected(sur == SurrogateModes.REPLACE); -// faultySurrogatesRadioButton -// .setSelected(sur == SurrogateModes.THROW_ERROR); -// discardSurrogatesRevisionRadioButton -// .setSelected(sur == SurrogateModes.DISCARD_REVISION); -// discardSurrogatesArticleRadioButton -// .setSelected(sur == SurrogateModes.DISCARD_REST); - - /* - * DEACTIVATE UNSUPPORTED MODES - * TODO: remove config options for - * unsupported surrogates mode. Can be activated again as soon as - * the implementation of these modes have been checked. - * Then also uncomment the original code above. - */ - //BEGIN WORK AROUND FOR DEACTIVATED SURROGATE MODES - faultySurrogatesRadioButton.setEnabled(false); - discardSurrogatesArticleRadioButton.setEnabled(false); - replaceSurrogatesRadioButton.setEnabled(false); - discardSurrogatesRevisionRadioButton.setSelected(true); - //END WORK AROUND FOR DEACTIVATED SURROGATE MODES + /** + * Validates the surrogate settings. + */ + private void validateSurrogateSettings() + { + + /* + * TODO Uncomment this code as soon as the surrogate modes are reactivated + */ + + // SurrogateModes sur = controller.getSurrogates(); + // + // replaceSurrogatesRadioButton + // .setSelected(sur == SurrogateModes.REPLACE); + // faultySurrogatesRadioButton + // .setSelected(sur == SurrogateModes.THROW_ERROR); + // discardSurrogatesRevisionRadioButton + // .setSelected(sur == SurrogateModes.DISCARD_REVISION); + // discardSurrogatesArticleRadioButton + // .setSelected(sur == SurrogateModes.DISCARD_REST); + + /* + * DEACTIVATE UNSUPPORTED MODES TODO: remove config options for unsupported surrogates + * mode. Can be activated again as soon as the implementation of these modes have been + * checked. Then also uncomment the original code above. 
+ */ + // BEGIN WORK AROUND FOR DEACTIVATED SURROGATE MODES + faultySurrogatesRadioButton.setEnabled(false); + discardSurrogatesArticleRadioButton.setEnabled(false); + replaceSurrogatesRadioButton.setEnabled(false); + discardSurrogatesRevisionRadioButton.setSelected(true); + // END WORK AROUND FOR DEACTIVATED SURROGATE MODES + } + + /** + * A call of this method should validate the positions of the panels components. + */ + @Override + public void relocate() + { + + int w = 120, h = 130; + int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; + + surrogateLabel.setLocation(10, 10); + faultySurrogatesRadioButton.setLocation(x, y + 55); + replaceSurrogatesRadioButton.setLocation(x, y + 30); + discardSurrogatesRevisionRadioButton.setLocation(x, y + 80); + discardSurrogatesArticleRadioButton.setLocation(x, y + 105); + } + + /** + * empty method + * + * @throws UnsupportedOperationException + * @deprecated + */ + @Deprecated + @Override + public void applyConfig(final ConfigSettings config) + { + throw new UnsupportedOperationException(); + } + /** + * empty method + * + * @throws UnsupportedOperationException + * @deprecated + */ + @Deprecated + @Override + public void toXML(final StringBuilder builder, final ConfigVerification errors) + { + throw new UnsupportedOperationException(); + } } - /** - * A call of this method should validate the positions of the panels - * components. - */ - @Override - public void relocate() { + // --------------------------------------------------------------------------// + // FIELDS & CONSTRUCTORS // + // --------------------------------------------------------------------------// - int w = 120, h = 130; - int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; + private JTable archiveTable; + private JScrollPane archiveScrollPane; - surrogateLabel.setLocation(10, 10); - faultySurrogatesRadioButton.setLocation(x, y + 55); - replaceSurrogatesRadioButton.setLocation(x, y + 30); - discardSurrogatesRevisionRadioButton.setLocation(x, y + 80); - discardSurrogatesArticleRadioButton.setLocation(x, y + 105); - } + private JButton addArchiveButton; + private JButton removeArchiveButton; - /** - * empty method - * - * @throws UnsupportedOperationException - * @deprecated - */ - @Deprecated - @Override - public void applyConfig(final ConfigSettings config) { - throw new UnsupportedOperationException(); - } + private JLabel encodingLabel; + private JTextField encodingField; + + private SurrogatePanel surrogatePanel; /** - * empty method + * (Constructor) Creates a new InputPanel. 
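SurrogatePanel above keeps its four radio buttons consistent by re-selecting them in validateSurrogateSettings(). As a purely illustrative alternative, not part of this patch, the standard Swing ButtonGroup enforces single selection by itself (it needs an additional javax.swing.ButtonGroup import; the button fields are the ones declared above):

    ButtonGroup surrogateGroup = new ButtonGroup();
    surrogateGroup.add(replaceSurrogatesRadioButton);
    surrogateGroup.add(faultySurrogatesRadioButton);
    surrogateGroup.add(discardSurrogatesRevisionRadioButton);
    surrogateGroup.add(discardSurrogatesArticleRadioButton);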
* - * @throws UnsupportedOperationException - * @deprecated + * @param controller + * Reference to the controller */ - @Deprecated - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) { - throw new UnsupportedOperationException(); + public InputPanel(final ConfigController controller) + { + + super(controller); + controller.register(PanelKeys.PANEL_INPUT, this); + + createArchiveTable(); + createControllButtons(); + createEncodingSettings(); + createSurrogateSettings(); } - } - // --------------------------------------------------------------------------// - // FIELDS & CONSTRUCTORS // - // --------------------------------------------------------------------------// + // --------------------------------------------------------------------------// + // CONSTRUCTION METHODS // + // --------------------------------------------------------------------------// - private JTable archiveTable; - private JScrollPane archiveScrollPane; + private void createArchiveTable() + { + archiveTable = new JTable(controller.getArchives()); + archiveTable.setSelectionMode(ListSelectionModel.SINGLE_SELECTION); - private JButton addArchiveButton; - private JButton removeArchiveButton; + archiveScrollPane = new JScrollPane(archiveTable); + archiveScrollPane.setBounds(10, 10, 410, 210); - private JLabel encodingLabel; - private JTextField encodingField; + this.add(archiveScrollPane); + } - private SurrogatePanel surrogatePanel; + private void createControllButtons() + { + addArchiveButton = new JButton("Add"); + addArchiveButton.setBounds(445, 20, 100, 25); - /** - * (Constructor) Creates a new InputPanel. - * - * @param controller Reference to the controller - */ - public InputPanel(final ConfigController controller) { + addArchiveButton.addActionListener(e -> { - super(controller); - controller.register(PanelKeys.PANEL_INPUT, this); + new InputDialog(controller).setVisible(true); + repaint(); + }); - createArchiveTable(); - createControllButtons(); - createEncodingSettings(); - createSurrogateSettings(); - } + this.add(addArchiveButton); - // --------------------------------------------------------------------------// - // CONSTRUCTION METHODS // - // --------------------------------------------------------------------------// + removeArchiveButton = new JButton("Remove"); + removeArchiveButton.setBounds(445, 50, 100, 25); - private void createArchiveTable() { - archiveTable = new JTable(controller.getArchives()); - archiveTable.setSelectionMode(ListSelectionModel.SINGLE_SELECTION); + removeArchiveButton.addActionListener(e -> { - archiveScrollPane = new JScrollPane(archiveTable); - archiveScrollPane.setBounds(10, 10, 410, 210); + controller.removeArchive(archiveTable.getSelectedRow()); + archiveTable.revalidate(); + repaint(); + }); - this.add(archiveScrollPane); - } + this.add(removeArchiveButton); + } - private void createControllButtons() { - addArchiveButton = new JButton("Add"); - addArchiveButton.setBounds(445, 20, 100, 25); + private void createEncodingSettings() + { + encodingLabel = new JLabel("Wikipedia Character Encoding: "); + encodingLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + encodingLabel.setBounds(10, 230, 200, 25); + this.add(encodingLabel); - addArchiveButton.addActionListener(e -> { + encodingField = new JTextField(); + encodingField.setBounds(220, 230, 200, 25); + this.add(encodingField); + } - new InputDialog(controller).setVisible(true); - repaint(); - }); + private void createSurrogateSettings() + { + surrogatePanel = new 
SurrogatePanel(controller); + surrogatePanel.setBorder(BorderFactory.createLoweredBevelBorder()); + surrogatePanel.setBounds(425, 95, 140, 160); + this.add(surrogatePanel); + } - this.add(addArchiveButton); + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// - removeArchiveButton = new JButton("Remove"); - removeArchiveButton.setBounds(445, 50, 100, 25); + /** + * A call of this method should validate the status of the panels components. + */ + @Override + public void validate() + { - removeArchiveButton.addActionListener(e -> { + this.archiveTable.revalidate(); + this.surrogatePanel.validate(); + } - controller.removeArchive(archiveTable.getSelectedRow()); - archiveTable.revalidate(); - repaint(); - }); + /** + * A call of this method should validate the positions of the panels components. + */ + @Override + public void relocate() + { - this.add(removeArchiveButton); - } + int w = 555, h = 235; + int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; - private void createEncodingSettings() { - encodingLabel = new JLabel("Wikipedia Character Encoding: "); - encodingLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - encodingLabel.setBounds(10, 230, 200, 25); - this.add(encodingLabel); + archiveScrollPane.setLocation(x, y); - encodingField = new JTextField(); - encodingField.setBounds(220, 230, 200, 25); - this.add(encodingField); - } + addArchiveButton.setLocation(x + 435, y + 10); + removeArchiveButton.setLocation(x + 435, y + 40); - private void createSurrogateSettings() { - surrogatePanel = new SurrogatePanel(controller); - surrogatePanel.setBorder(BorderFactory.createLoweredBevelBorder()); - surrogatePanel.setBounds(425, 95, 140, 160); - this.add(surrogatePanel); - } + encodingLabel.setLocation(x, y + 220); + encodingField.setLocation(x + 210, y + 220); - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// + surrogatePanel.setLocation(x + 415, y + 85); - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public void validate() { + } - this.archiveTable.revalidate(); - this.surrogatePanel.validate(); - } + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// - /** - * A call of this method should validate the positions of the panels - * components. - */ - @Override - public void relocate() { + /** + * Reads the configuration parameters described in the panel from the ConfigSettings and and + * sets the contained values. 
+ * + * @param config + * Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) + { - int w = 555, h = 235; - int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; - - archiveScrollPane.setLocation(x, y); - - addArchiveButton.setLocation(x + 435, y + 10); - removeArchiveButton.setLocation(x + 435, y + 40); - - encodingLabel.setLocation(x, y + 220); - encodingField.setLocation(x + 210, y + 220); - - surrogatePanel.setLocation(x + 415, y + 85); - - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. - * - * @param config Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) { + Object o = config.getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + if (o != null) { + encodingField.setText((String) o); + } + else { + encodingField.setText(""); + } - Object o = config - .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - if (o != null) { - encodingField.setText((String) o); - } else { - encodingField.setText(""); + o = config.getConfigParameter(ConfigurationKeys.MODE_SURROGATES); + if (o != null) { + controller.setSurrogates((SurrogateModes) o); + } + else { + controller.setSurrogates(SurrogateModes.DISCARD_REVISION); + } } - o = config.getConfigParameter(ConfigurationKeys.MODE_SURROGATES); - if (o != null) { - controller.setSurrogates((SurrogateModes) o); - } else { - controller.setSurrogates(SurrogateModes.DISCARD_REVISION); - } - } - - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. - * - * @param builder Reference to a StringBuilder object - * @param errors Reference to the ConfigVerification object - */ - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) { - - SurrogateModes surMode = controller.getSurrogates(); - - String wikiEncoding = encodingField.getText(); - if (wikiEncoding.length() == 0) { - - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.MISSING_VALUE, - "The CharacterEncoding was not set.")); - } + /** + * Adds the xml description of the panels content to the StringBuilder. Errors which occur + * during the xml transformation will be added to the ConfigVerification. 
+ * + * @param builder + * Reference to a StringBuilder object + * @param errors + * Reference to the ConfigVerification object + */ + @Override + public void toXML(final StringBuilder builder, final ConfigVerification errors) + { - builder.append("\t\r\n"); - builder.append("\t\t" + surMode - + "\r\n"); - builder.append("\t\t" + wikiEncoding - + "\r\n"); + SurrogateModes surMode = controller.getSurrogates(); - ArchiveRegistry reg = controller.getArchives(); + String wikiEncoding = encodingField.getText(); + if (wikiEncoding.length() == 0) { - int size = reg.getRowCount(); + errors.add(new ConfigItem(ConfigItemTypes.WARNING, ConfigErrorKeys.MISSING_VALUE, + "The CharacterEncoding was not set.")); + } - ArchiveDescription archive; - InputType type; - String archivePath; - long start; + builder.append("\t\r\n"); + builder.append("\t\t" + surMode + "\r\n"); + builder.append("\t\t" + wikiEncoding + "\r\n"); - if (size == 0) { - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.MISSING_VALUE, - "No source file has been set.")); - } + ArchiveRegistry reg = controller.getArchives(); + + int size = reg.getRowCount(); + + ArchiveDescription archive; + InputType type; + String archivePath; + long start; - for (int i = 0; i < size; i++) { - - archive = reg.get(i); - - type = archive.getType(); - switch (type) { - case XML: - break; - case BZIP2: - //bzip is always enabled - nothing to check here - break; - case SEVENZIP: - if (!controller.is7ZipEnabled()) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.ILLEGAL_INPUT_FILE, - "The SevenUip mode is not " + "activated")); - } - break; - } - - archivePath = archive.getPath(); - if (archivePath.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.PATH_NOT_SET, - "The archive path is missing")); - } - - start = archive.getStartPosition(); - if (start < 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "The archive start value should be at least 0")); - } - - builder.append("\t\t\r\n"); - builder.append("\t\t\t" + type + "\r\n"); - builder.append("\t\t\t\"" + archivePath + "\"\r\n"); - builder.append("\t\t\t" + start + "\r\n"); - builder.append("\t\t\r\n"); + if (size == 0) { + errors.add(new ConfigItem(ConfigItemTypes.WARNING, ConfigErrorKeys.MISSING_VALUE, + "No source file has been set.")); + } + + for (int i = 0; i < size; i++) { + + archive = reg.get(i); + + type = archive.getType(); + switch (type) { + case XML: + break; + case BZIP2: + // bzip is always enabled - nothing to check here + break; + case SEVENZIP: + if (!controller.is7ZipEnabled()) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.ILLEGAL_INPUT_FILE, + "The SevenUip mode is not " + "activated")); + } + break; + } + + archivePath = archive.getPath(); + if (archivePath.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.PATH_NOT_SET, + "The archive path is missing")); + } + + start = archive.getStartPosition(); + if (start < 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.VALUE_OUT_OF_RANGE, + "The archive start value should be at least 0")); + } + + builder.append("\t\t\r\n"); + builder.append("\t\t\t" + type + "\r\n"); + builder.append("\t\t\t\"" + archivePath + "\"\r\n"); + builder.append("\t\t\t" + start + "\r\n"); + builder.append("\t\t\r\n"); + } + builder.append("\t\r\n"); } - builder.append("\t\r\n"); - } } diff --git 
a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/LoggingPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/LoggingPanel.java index b9b1c007..3d1f3f07 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/LoggingPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/LoggingPanel.java @@ -37,143 +37,144 @@ /** * Panel class of the ConfigurationTool *
- * This panel contains all components for setting configuration parameters - * related to the logging. + * This panel contains all components for setting configuration parameters related to the logging. */ @SuppressWarnings("serial") public class LoggingPanel - extends AbstractPanel { - - private JLabel diffToolLabel; - private JTextField diffToolField; - private JComboBox diffToolLogLevelComboBox; - - /** - * (Constructor) Creates a new LoggingPanel. - * - * @param controller Reference to the controller - */ - public LoggingPanel(final ConfigController controller) { - - super(controller); - controller.register(PanelKeys.PANEL_LOGGING, this); - - createDiffToolLoggingSettings(); - } - - private void createDiffToolLoggingSettings() { - diffToolLabel = new JLabel("Logging Root Folder: "); - diffToolLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - diffToolLabel.setBounds(10, 10, 150, 25); - this.add(diffToolLabel); - - diffToolField = new JTextField(); - diffToolField.setBounds(170, 10, 200, 25); - this.add(diffToolField); - - diffToolLogLevelComboBox = new JComboBox<>(); - diffToolLogLevelComboBox.setBounds(390, 10, 100, 25); - - diffToolLogLevelComboBox.addItem(Level.ERROR); - diffToolLogLevelComboBox.addItem(Level.WARN); - diffToolLogLevelComboBox.addItem(Level.INFO); - diffToolLogLevelComboBox.addItem(Level.DEBUG); - diffToolLogLevelComboBox.addItem(Level.TRACE); - - this.add(diffToolLogLevelComboBox); - } - - - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// - - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public void validate() { - - } - - - /** - * A call of this method should validate the positions of the panels - * components. - */ - @Override - public void relocate() { - - int w = 480, h = 245; - int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; - - diffToolLabel.setLocation(x, y); - diffToolField.setLocation(x + 160, y); - diffToolLogLevelComboBox.setLocation(x + 380, y); - - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. - * - * @param config Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) { - - Object o = config - .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); - if (o != null) { - this.diffToolField.setText((String) o); - } else { - this.diffToolField.setText(""); + extends AbstractPanel +{ + + private JLabel diffToolLabel; + private JTextField diffToolField; + private JComboBox diffToolLogLevelComboBox; + + /** + * (Constructor) Creates a new LoggingPanel. 
+ * + * @param controller + * Reference to the controller + */ + public LoggingPanel(final ConfigController controller) + { + + super(controller); + controller.register(PanelKeys.PANEL_LOGGING, this); + + createDiffToolLoggingSettings(); } - o = config - .getConfigParameter(ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL); - if (o != null) { - this.diffToolLogLevelComboBox.setSelectedItem(o); + private void createDiffToolLoggingSettings() + { + diffToolLabel = new JLabel("Logging Root Folder: "); + diffToolLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + diffToolLabel.setBounds(10, 10, 150, 25); + this.add(diffToolLabel); + + diffToolField = new JTextField(); + diffToolField.setBounds(170, 10, 200, 25); + this.add(diffToolField); + + diffToolLogLevelComboBox = new JComboBox<>(); + diffToolLogLevelComboBox.setBounds(390, 10, 100, 25); + + diffToolLogLevelComboBox.addItem(Level.ERROR); + diffToolLogLevelComboBox.addItem(Level.WARN); + diffToolLogLevelComboBox.addItem(Level.INFO); + diffToolLogLevelComboBox.addItem(Level.DEBUG); + diffToolLogLevelComboBox.addItem(Level.TRACE); + + this.add(diffToolLogLevelComboBox); + } + + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// + + /** + * A call of this method should validate the status of the panels components. + */ + @Override + public void validate() + { + } - } - - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. - * - * @param builder Reference to a StringBuilder object - * @param errors Reference to the ConfigVerification object - */ - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) { - - builder.append("\t\r\n"); - - // DIFFTOOL - String pathDiffTool = diffToolField.getText(); - if (pathDiffTool.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.PATH_NOT_SET, - "The root folder for all logs and debug" - + " information has not been set.")); + /** + * A call of this method should validate the positions of the panels components. + */ + @Override + public void relocate() + { + + int w = 480, h = 245; + int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; + + diffToolLabel.setLocation(x, y); + diffToolField.setLocation(x + 160, y); + diffToolLogLevelComboBox.setLocation(x + 380, y); + } - if (!pathDiffTool.endsWith(File.separator) - && pathDiffTool.contains(File.separator)) { - pathDiffTool += File.separator; + + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * Reads the configuration parameters described in the panel from the ConfigSettings and and + * sets the contained values. 
+ * + * @param config + * Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) + { + + Object o = config.getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); + if (o != null) { + this.diffToolField.setText((String) o); + } + else { + this.diffToolField.setText(""); + } + + o = config.getConfigParameter(ConfigurationKeys.LOGGING_LOGLEVEL_DIFFTOOL); + if (o != null) { + this.diffToolLogLevelComboBox.setSelectedItem(o); + } + } - builder.append("\t\t\"").append(pathDiffTool).append("\"\r\n"); - builder.append("\t\t\r\n"); - builder.append("\t\t\t").append(diffToolLogLevelComboBox.getSelectedItem()).append("\r\n"); - builder.append("\t\t\r\n"); - builder.append("\t\r\n"); - } + /** + * Adds the xml description of the panels content to the StringBuilder. Errors which occur + * during the xml transformation will be added to the ConfigVerification. + * + * @param builder + * Reference to a StringBuilder object + * @param errors + * Reference to the ConfigVerification object + */ + @Override + public void toXML(final StringBuilder builder, final ConfigVerification errors) + { + + builder.append("\t\r\n"); + + // DIFFTOOL + String pathDiffTool = diffToolField.getText(); + if (pathDiffTool.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.WARNING, ConfigErrorKeys.PATH_NOT_SET, + "The root folder for all logs and debug" + " information has not been set.")); + } + if (!pathDiffTool.endsWith(File.separator) && pathDiffTool.contains(File.separator)) { + pathDiffTool += File.separator; + } + + builder.append("\t\t\"").append(pathDiffTool).append("\"\r\n"); + builder.append("\t\t\r\n"); + builder.append("\t\t\t").append(diffToolLogLevelComboBox.getSelectedItem()) + .append("\r\n"); + builder.append("\t\t\r\n"); + builder.append("\t\r\n"); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ModePanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ModePanel.java index 279125d3..9c7a00a6 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ModePanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/ModePanel.java @@ -33,200 +33,199 @@ /** * Panel class of the ConfigurationTool *
- * This panel contains all components for setting configuration parameters - * related to the diff calculation. + * This panel contains all components for setting configuration parameters related to the diff + * calculation. */ @SuppressWarnings("serial") public class ModePanel - extends AbstractPanel { - - private JLabel fullRevisionLabel; - private JTextField fullRevisionField; - - private JLabel minimumCommonSequenceLabel; - private JTextField minimumCommonSequenceField; - - /** - * (Constructor) Creates a new ModePanel. - * - * @param controller Reference to the controller - */ - public ModePanel(final ConfigController controller) { - - super(controller); - controller.register(PanelKeys.PANEL_VALUES, this); - - createFullRevisionSettings(); - createMinimumCommonSequenceSettings(); - } - - // --------------------------------------------------------------------------// - // CONSTRUCTION METHODS // - // --------------------------------------------------------------------------// - - private void createFullRevisionSettings() { - - fullRevisionLabel = new JLabel( - "Every n-th revision will be a full revision:"); - fullRevisionLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - fullRevisionLabel.setBounds(10, 10, 270, 25); - this.add(fullRevisionLabel); - - fullRevisionField = new JTextField(); - fullRevisionField.setBounds(290, 10, 100, 25); - this.add(fullRevisionField); - } - - private void createMinimumCommonSequenceSettings() { - - minimumCommonSequenceLabel = new JLabel( - "Min lenght of a common subsequence:"); - minimumCommonSequenceLabel.setBorder(BorderFactory - .createRaisedBevelBorder()); - minimumCommonSequenceLabel.setBounds(10, 50, 270, 25); - this.add(minimumCommonSequenceLabel); - - minimumCommonSequenceField = new JTextField(); - minimumCommonSequenceField.setBounds(290, 50, 100, 25); - this.add(minimumCommonSequenceField); - } - - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// - - /** - * empty method - */ - @Override - public void validate() { - - } - - /** - * A call of this method should validate the positions of the panels - * components. - */ - @Override - public void relocate() { - - int w = 380, h = 65; - int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; - - fullRevisionLabel.setLocation(x, y); - fullRevisionField.setLocation(x + 280, y); - - minimumCommonSequenceLabel.setLocation(x, y + 40); - minimumCommonSequenceField.setLocation(x + 280, y + 40); - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. - * - * @param config Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) { - Object o = config - .getConfigParameter(ConfigurationKeys.COUNTER_FULL_REVISION); - if (o != null) { - this.fullRevisionField.setText(Integer.toString((Integer) o)); - } else { - this.fullRevisionField.setText(""); + extends AbstractPanel +{ + + private JLabel fullRevisionLabel; + private JTextField fullRevisionField; + + private JLabel minimumCommonSequenceLabel; + private JTextField minimumCommonSequenceField; + + /** + * (Constructor) Creates a new ModePanel. 
+ * + * @param controller + * Reference to the controller + */ + public ModePanel(final ConfigController controller) + { + + super(controller); + controller.register(PanelKeys.PANEL_VALUES, this); + + createFullRevisionSettings(); + createMinimumCommonSequenceSettings(); + } + + // --------------------------------------------------------------------------// + // CONSTRUCTION METHODS // + // --------------------------------------------------------------------------// + + private void createFullRevisionSettings() + { + + fullRevisionLabel = new JLabel("Every n-th revision will be a full revision:"); + fullRevisionLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + fullRevisionLabel.setBounds(10, 10, 270, 25); + this.add(fullRevisionLabel); + + fullRevisionField = new JTextField(); + fullRevisionField.setBounds(290, 10, 100, 25); + this.add(fullRevisionField); } - o = config - .getConfigParameter(ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING); - if (o != null) { - this.minimumCommonSequenceField.setText(Integer - .toString((Integer) o)); - } else { - this.minimumCommonSequenceField.setText(""); + private void createMinimumCommonSequenceSettings() + { + + minimumCommonSequenceLabel = new JLabel("Min lenght of a common subsequence:"); + minimumCommonSequenceLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + minimumCommonSequenceLabel.setBounds(10, 50, 270, 25); + this.add(minimumCommonSequenceLabel); + + minimumCommonSequenceField = new JTextField(); + minimumCommonSequenceField.setBounds(290, 50, 100, 25); + this.add(minimumCommonSequenceField); } - } - - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. 
- * - * @param builder Reference to a StringBuilder object - * @param errors Reference to the ConfigVerification object - */ - @Override - public void toXML(StringBuilder builder, final ConfigVerification errors) { - - int minLCS = -1, fullRevCounter = -1; - - // Check the FullRevisionCounter input - String text = this.minimumCommonSequenceField.getText(); - if (text.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, "The value for minimum " - + "LongestCommonSubsequence is missing.")); - } else { - try { - minLCS = Integer.parseInt(text); - if (minLCS < 7) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "The value of the minimum " - + " LongestCommonSubsequence has to be" - + " at least 7.")); - } else if (minLCS < 12) { - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "A value smaller than 12 for the " - + "minimum LongestCommonSubsequence" - + " is not recommended.")); - } - } catch (NumberFormatException nfe) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.ILLEGAL_INPUT, - "NumberFormatException for " - + "ArticleProducer TaskLimit")); - } + + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// + + /** + * empty method + */ + @Override + public void validate() + { + } - // Check the FullRevisionCounter input - text = this.fullRevisionField.getText(); - if (text.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The value for FullRevision Counter" + " is missing.")); - } else { - try { - fullRevCounter = Integer.parseInt(text); - if (fullRevCounter < 1) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "The FullRevision Counter has to " - + "be at least 1.")); - } else if (fullRevCounter < 100) { - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "A FullRevision Counter with a" - + " value smaller than 100 is not" - + " recommended.")); + /** + * A call of this method should validate the positions of the panels components. + */ + @Override + public void relocate() + { + + int w = 380, h = 65; + int x = (this.getWidth() - w) / 2, y = (this.getHeight() - h) / 2; + + fullRevisionLabel.setLocation(x, y); + fullRevisionField.setLocation(x + 280, y); + + minimumCommonSequenceLabel.setLocation(x, y + 40); + minimumCommonSequenceField.setLocation(x + 280, y + 40); + } + + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * Reads the configuration parameters described in the panel from the ConfigSettings and and + * sets the contained values. 
+ * + * @param config + * Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) + { + Object o = config.getConfigParameter(ConfigurationKeys.COUNTER_FULL_REVISION); + if (o != null) { + this.fullRevisionField.setText(Integer.toString((Integer) o)); + } + else { + this.fullRevisionField.setText(""); + } + + o = config.getConfigParameter(ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING); + if (o != null) { + this.minimumCommonSequenceField.setText(Integer.toString((Integer) o)); + } + else { + this.minimumCommonSequenceField.setText(""); } - } catch (NumberFormatException nfe) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.ILLEGAL_INPUT, - "NumberFormatException for " - + "ArticleProducer TaskLimit")); - } } - builder.append("\t\r\n"); - builder.append("\t\t" + minLCS - + "\r\n"); - builder.append("\t\t" + fullRevCounter - + "\r\n"); - builder.append("\t\r\n"); - } + /** + * Adds the xml description of the panels content to the StringBuilder. Errors which occur + * during the xml transformation will be added to the ConfigVerification. + * + * @param builder + * Reference to a StringBuilder object + * @param errors + * Reference to the ConfigVerification object + */ + @Override + public void toXML(StringBuilder builder, final ConfigVerification errors) + { + + int minLCS = -1, fullRevCounter = -1; + + // Check the FullRevisionCounter input + String text = this.minimumCommonSequenceField.getText(); + if (text.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.MISSING_VALUE, + "The value for minimum " + "LongestCommonSubsequence is missing.")); + } + else { + try { + minLCS = Integer.parseInt(text); + if (minLCS < 7) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, "The value of the minimum " + + " LongestCommonSubsequence has to be" + " at least 7.")); + } + else if (minLCS < 12) { + errors.add(new ConfigItem(ConfigItemTypes.WARNING, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, "A value smaller than 12 for the " + + "minimum LongestCommonSubsequence" + " is not recommended.")); + } + } + catch (NumberFormatException nfe) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.ILLEGAL_INPUT, + "NumberFormatException for " + "ArticleProducer TaskLimit")); + } + } + + // Check the FullRevisionCounter input + text = this.fullRevisionField.getText(); + if (text.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.MISSING_VALUE, + "The value for FullRevision Counter" + " is missing.")); + } + else { + try { + fullRevCounter = Integer.parseInt(text); + if (fullRevCounter < 1) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, + "The FullRevision Counter has to " + "be at least 1.")); + } + else if (fullRevCounter < 100) { + errors.add(new ConfigItem(ConfigItemTypes.WARNING, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, "A FullRevision Counter with a" + + " value smaller than 100 is not" + " recommended.")); + } + } + catch (NumberFormatException nfe) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.ILLEGAL_INPUT, + "NumberFormatException for " + "ArticleProducer TaskLimit")); + } + } + + builder.append("\t\r\n"); + builder.append("\t\t" + minLCS + + "\r\n"); + builder.append( + "\t\t" + fullRevCounter + "\r\n"); + builder.append("\t\r\n"); + } } diff --git 
a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/OutputPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/OutputPanel.java index 84598750..6fbc634d 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/OutputPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/OutputPanel.java @@ -39,420 +39,417 @@ /** * Panel class of the ConfigurationTool *
- * This panel contains all components for setting configuration parameters - * related to the file output. + * This panel contains all components for setting configuration parameters related to the file + * output. */ @SuppressWarnings("serial") public class OutputPanel - extends AbstractPanel { - - private JLabel outputLabel; - private JTextField outputPathField; - - private JCheckBox enableZipEncodingCompression; - private JCheckBox activateDataFileOutput; - private JLabel outputCompression; - private JRadioButton disableOutputCompression; - private JRadioButton enable7ZipOutputCompression; - private JRadioButton enableBZip2OutputCompression; + extends AbstractPanel +{ + + private JLabel outputLabel; + private JTextField outputPathField; + + private JCheckBox enableZipEncodingCompression; + private JCheckBox activateDataFileOutput; + private JLabel outputCompression; + private JRadioButton disableOutputCompression; + private JRadioButton enable7ZipOutputCompression; + private JRadioButton enableBZip2OutputCompression; + + private JCheckBox enableMultipleOutputFiles; + private JLabel outputSizeLimitLabel; + private JTextField outputSizeLimitField; + + /** + * (Constructor) Create the OutputPanel object. + * + * @param controller + * Reference to the controller + */ + public OutputPanel(final ConfigController controller) + { + + super(controller); + controller.register(PanelKeys.PANEL_OUTPUT, this); + + createOutputPathSettings(); + createOutputSizeSettings(); + createOutputSettings(); + } - private JCheckBox enableMultipleOutputFiles; - private JLabel outputSizeLimitLabel; - private JTextField outputSizeLimitField; + // --------------------------------------------------------------------------// + // CONSTRUCTION METHODS // + // --------------------------------------------------------------------------// - /** - * (Constructor) Create the OutputPanel object. 
- * - * @param controller Reference to the controller - */ - public OutputPanel(final ConfigController controller) { + private void createOutputPathSettings() + { - super(controller); - controller.register(PanelKeys.PANEL_OUTPUT, this); + outputLabel = new JLabel("Output Folder: "); + outputLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + outputLabel.setBounds(10, 10, 150, 25); + this.add(outputLabel); - createOutputPathSettings(); - createOutputSizeSettings(); - createOutputSettings(); - } + outputPathField = new JTextField(); + outputPathField.setBounds(170, 10, 200, 25); + this.add(outputPathField); + } - // --------------------------------------------------------------------------// - // CONSTRUCTION METHODS // - // --------------------------------------------------------------------------// + private void createOutputSettings() + { - private void createOutputPathSettings() { + enableZipEncodingCompression = new JCheckBox("Activate Zip Encoding"); + enableZipEncodingCompression.setBounds(120, 50, 150, 25); - outputLabel = new JLabel("Output Folder: "); - outputLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - outputLabel.setBounds(10, 10, 150, 25); - this.add(outputLabel); + enableZipEncodingCompression.addActionListener(e -> { - outputPathField = new JTextField(); - outputPathField.setBounds(170, 10, 200, 25); - this.add(outputPathField); - } + boolean flag = !controller.isZipCompressionEnabled(); + controller.setEnableZipCompression(flag); - private void createOutputSettings() { + validate(); + }); - enableZipEncodingCompression = new JCheckBox("Activate Zip Encoding"); - enableZipEncodingCompression.setBounds(120, 50, 150, 25); + this.add(enableZipEncodingCompression); - enableZipEncodingCompression.addActionListener(e -> { + outputCompression = new JLabel("Output Compression:"); + outputCompression.setBounds(120, 85, 250, 25); + this.add(outputCompression); - boolean flag = !controller.isZipCompressionEnabled(); - controller.setEnableZipCompression(flag); + disableOutputCompression = new JRadioButton("None"); + disableOutputCompression.setBounds(120, 110, 250, 20); + this.add(disableOutputCompression); - validate(); - }); + disableOutputCompression.addActionListener(e -> { - this.add(enableZipEncodingCompression); + OutputCompressionEnum oce = controller.getOutputCompression(); + if (oce != OutputCompressionEnum.None) { + controller.setOutputCompression(OutputCompressionEnum.None); + } - outputCompression = new JLabel("Output Compression:"); - outputCompression.setBounds(120, 85, 250, 25); - this.add(outputCompression); + validate(); + }); - disableOutputCompression = new JRadioButton("None"); - disableOutputCompression.setBounds(120, 110, 250, 20); - this.add(disableOutputCompression); + enable7ZipOutputCompression = new JRadioButton("7Zip Compression"); + enable7ZipOutputCompression.setBounds(120, 130, 250, 20); + this.add(enable7ZipOutputCompression); - disableOutputCompression.addActionListener(e -> { + enable7ZipOutputCompression.addActionListener(e -> { - OutputCompressionEnum oce = controller.getOutputCompression(); - if (oce != OutputCompressionEnum.None) { - controller.setOutputCompression(OutputCompressionEnum.None); - } + OutputCompressionEnum oce = controller.getOutputCompression(); + if (oce != OutputCompressionEnum.SevenZip) { + controller.setOutputCompression(OutputCompressionEnum.SevenZip); + } - validate(); - }); + validate(); + }); - enable7ZipOutputCompression = new JRadioButton("7Zip Compression"); - 
enable7ZipOutputCompression.setBounds(120, 130, 250, 20); - this.add(enable7ZipOutputCompression); + enableBZip2OutputCompression = new JRadioButton("BZip2 Compression"); + enableBZip2OutputCompression.setBounds(120, 150, 250, 20); + this.add(enableBZip2OutputCompression); - enable7ZipOutputCompression.addActionListener(e -> { + enableBZip2OutputCompression.addActionListener(e -> { - OutputCompressionEnum oce = controller.getOutputCompression(); - if (oce != OutputCompressionEnum.SevenZip) { - controller - .setOutputCompression(OutputCompressionEnum.SevenZip); - } + OutputCompressionEnum oce = controller.getOutputCompression(); + if (oce != OutputCompressionEnum.BZip2) { + controller.setOutputCompression(OutputCompressionEnum.BZip2); + } - validate(); - }); + validate(); + }); - enableBZip2OutputCompression = new JRadioButton("BZip2 Compression"); - enableBZip2OutputCompression.setBounds(120, 150, 250, 20); - this.add(enableBZip2OutputCompression); + activateDataFileOutput = new JCheckBox("DataFile Output"); + activateDataFileOutput.setBounds(120, 50, 170, 25); + activateDataFileOutput.setVisible(true); + activateDataFileOutput.addActionListener(e -> { - enableBZip2OutputCompression.addActionListener(e -> { + boolean flag = !controller.isEnableDataFileOutput(); + controller.setEnableDataFileOutput(flag); - OutputCompressionEnum oce = controller.getOutputCompression(); - if (oce != OutputCompressionEnum.BZip2) { - controller - .setOutputCompression(OutputCompressionEnum.BZip2); - } + validate(); + }); + this.add(activateDataFileOutput); - validate(); - }); + } + private void createOutputSizeSettings() + { - activateDataFileOutput = new JCheckBox("DataFile Output"); - activateDataFileOutput.setBounds(120, 50, 170, 25); - activateDataFileOutput.setVisible(true); - activateDataFileOutput.addActionListener(e -> { + enableMultipleOutputFiles = new JCheckBox("Allow multiple output files per consumer"); + enableMultipleOutputFiles.setBounds(10, 200, 250, 25); + this.add(enableMultipleOutputFiles); - boolean flag = !controller.isEnableDataFileOutput(); - controller.setEnableDataFileOutput(flag); + outputSizeLimitLabel = new JLabel("File Size Limit (in byte): "); + outputSizeLimitLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + outputSizeLimitLabel.setBounds(10, 230, 150, 25); + this.add(outputSizeLimitLabel); - validate(); - }); - this.add(activateDataFileOutput); + outputSizeLimitField = new JTextField(); + outputSizeLimitField.setBounds(170, 230, 200, 25); + this.add(outputSizeLimitField); - } + enableMultipleOutputFiles.addActionListener(e -> { - private void createOutputSizeSettings() { + boolean flag = !controller.isMultipleOutputFiles(); + controller.setMultipleOutputFiles(flag); - enableMultipleOutputFiles = new JCheckBox( - "Allow multiple output files per consumer"); - enableMultipleOutputFiles.setBounds(10, 200, 250, 25); - this.add(enableMultipleOutputFiles); + outputSizeLimitLabel.setEnabled(flag); + outputSizeLimitField.setEnabled(flag); + }); + } - outputSizeLimitLabel = new JLabel("File Size Limit (in byte): "); - outputSizeLimitLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - outputSizeLimitLabel.setBounds(10, 230, 150, 25); - this.add(outputSizeLimitLabel); + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// - outputSizeLimitField = new JTextField(); - outputSizeLimitField.setBounds(170, 230, 200, 25); - 
this.add(outputSizeLimitField); + /** + * A call of this method should validate the status of the panels components. + */ + @Override + public void validate() + { - enableMultipleOutputFiles.addActionListener(e -> { + boolean flagA = !controller.isEnableSQLDatabaseOutput(); + boolean flagB = controller.isMultipleOutputFiles(); - boolean flag = !controller.isMultipleOutputFiles(); - controller.setMultipleOutputFiles(flag); + OutputCompressionEnum oce = controller.getOutputCompression(); - outputSizeLimitLabel.setEnabled(flag); - outputSizeLimitField.setEnabled(flag); - }); - } + enableZipEncodingCompression.setSelected(controller.isZipCompressionEnabled()); - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// + disableOutputCompression.setSelected(oce == OutputCompressionEnum.None); - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public void validate() { + enableBZip2OutputCompression.setSelected(oce == OutputCompressionEnum.BZip2); - boolean flagA = !controller.isEnableSQLDatabaseOutput(); - boolean flagB = controller.isMultipleOutputFiles(); + activateDataFileOutput.setSelected(controller.isEnableDataFileOutput()); - OutputCompressionEnum oce = controller.getOutputCompression(); + outputLabel.setEnabled(flagA); + outputPathField.setEnabled(flagA); - enableZipEncodingCompression.setSelected(controller - .isZipCompressionEnabled()); + enableZipEncodingCompression.setEnabled(flagA); - disableOutputCompression.setSelected(oce == OutputCompressionEnum.None); + outputCompression.setEnabled(flagA); + disableOutputCompression.setEnabled(flagA); - enableBZip2OutputCompression - .setSelected(oce == OutputCompressionEnum.BZip2); + enable7ZipOutputCompression.setEnabled(flagA && controller.is7ZipEnabled()); - activateDataFileOutput.setSelected(controller.isEnableDataFileOutput()); + enable7ZipOutputCompression.setSelected(oce == OutputCompressionEnum.SevenZip); - outputLabel.setEnabled(flagA); - outputPathField.setEnabled(flagA); + enableBZip2OutputCompression.setEnabled(flagA); - enableZipEncodingCompression.setEnabled(flagA); + // Enable multiple output files only for uncompressed output + enableMultipleOutputFiles.setEnabled(flagA && (oce == OutputCompressionEnum.None)); + enableMultipleOutputFiles.setSelected(flagB); - outputCompression.setEnabled(flagA); - disableOutputCompression.setEnabled(flagA); + outputSizeLimitLabel.setEnabled(flagA && flagB && (oce == OutputCompressionEnum.None)); + outputSizeLimitField.setEnabled(flagA && flagB && (oce == OutputCompressionEnum.None)); - enable7ZipOutputCompression.setEnabled(flagA - && controller.is7ZipEnabled()); + } - enable7ZipOutputCompression - .setSelected(oce == OutputCompressionEnum.SevenZip); + /** + * A call of this method should validate the positions of the panels components. 
+ */ + @Override + public void relocate() + { - enableBZip2OutputCompression.setEnabled(flagA); + int w = 360, h = 245; - //Enable multiple output files only for uncompressed output - enableMultipleOutputFiles.setEnabled(flagA && (oce == OutputCompressionEnum.None)); - enableMultipleOutputFiles.setSelected(flagB); + int x = (this.getWidth() - w) / 2; + int y = (this.getHeight() - h) / 2; - outputSizeLimitLabel.setEnabled(flagA && flagB && (oce == OutputCompressionEnum.None)); - outputSizeLimitField.setEnabled(flagA && flagB && (oce == OutputCompressionEnum.None)); + outputLabel.setLocation(x, y); + outputPathField.setLocation(x + 160, y); + enableZipEncodingCompression.setLocation(x + 110, y + 40); + outputCompression.setLocation(x + 110, y + 75); + disableOutputCompression.setLocation(x + 110, y + 100); + enableBZip2OutputCompression.setLocation(x + 110, y + 120); + enable7ZipOutputCompression.setLocation(x + 110, y + 140); + activateDataFileOutput.setLocation(x + 110, y + 160); - } + enableMultipleOutputFiles.setLocation(x, y + 190); + outputSizeLimitLabel.setLocation(x, y + 220); + outputSizeLimitField.setLocation(x + 160, y + 220); - /** - * A call of this method should validate the positions of the panels - * components. - */ - @Override - public void relocate() { + } - int w = 360, h = 245; + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * Reads the configuration parameters described in the panel from the ConfigSettings and and + * sets the contained values. + * + * @param config + * Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) + { + + Object o = config.getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); + if (o != null) { + this.outputPathField.setText((String) o); + } + else { + this.outputPathField.setText(""); + } - int x = (this.getWidth() - w) / 2; - int y = (this.getHeight() - h) / 2; + o = config.getConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED); + if (o != null) { + controller.setEnableZipCompression((Boolean) o); + } + else { + controller.setEnableZipCompression(false); + } - outputLabel.setLocation(x, y); - outputPathField.setLocation(x + 160, y); + o = config.getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT); + if (o != null) { + controller.setEnableDataFileOutput((Boolean) o); + } + else { + controller.setEnableDataFileOutput(false); + } - enableZipEncodingCompression.setLocation(x + 110, y + 40); - outputCompression.setLocation(x + 110, y + 75); - disableOutputCompression.setLocation(x + 110, y + 100); - enableBZip2OutputCompression.setLocation(x + 110, y + 120); - enable7ZipOutputCompression.setLocation(x + 110, y + 140); - activateDataFileOutput.setLocation(x + 110, y + 160); + o = config.getConfigParameter(ConfigurationKeys.MODE_OUTPUT); + if (o != null) { + switch ((OutputType) o) { + case UNCOMPRESSED: + controller.setEnableSQLDatabaseOutput(false); + controller.setOutputCompression(OutputCompressionEnum.None); + + o = config.getConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE); + break; + case SEVENZIP: + controller.setEnableSQLDatabaseOutput(false); + controller.setOutputCompression(OutputCompressionEnum.SevenZip); + + o = config.getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); + break; + case BZIP2: + controller.setEnableSQLDatabaseOutput(false); + 
controller.setOutputCompression(OutputCompressionEnum.BZip2); + + o = config.getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); + break; + case DATABASE: + controller.setEnableSQLDatabaseOutput(true); + controller.setOutputCompression(OutputCompressionEnum.None); + + o = null; + break; + } + } - enableMultipleOutputFiles.setLocation(x, y + 190); - outputSizeLimitLabel.setLocation(x, y + 220); - outputSizeLimitField.setLocation(x + 160, y + 220); + if (o != null) { + controller.setMultipleOutputFiles(true); + this.outputSizeLimitField.setText(Long.toString((Long) o)); + } + else { + controller.setMultipleOutputFiles(false); + this.outputSizeLimitField.setText(""); + } + } - } + /** + * Adds the xml description of the panels content to the StringBuilder. Errors which occur + * during the xml transformation will be added to the ConfigVerification. + * + * @param builder + * Reference to a StringBuilder object + * @param errors + * Reference to the ConfigVerification object + */ + @Override + public void toXML(final StringBuilder builder, final ConfigVerification errors) + { + + if (!controller.isEnableSQLDatabaseOutput()) { + + boolean zipComp = controller.isZipCompressionEnabled(); + boolean multiFile = controller.isMultipleOutputFiles(); + + builder.append("\t\r\n"); + builder.append("\t\t"); + + OutputCompressionEnum comp = controller.getOutputCompression(); + switch (comp) { + case None: + builder.append(OutputType.UNCOMPRESSED); + break; + case BZip2: + builder.append(OutputType.BZIP2); + break; + case SevenZip: + builder.append(OutputType.SEVENZIP); + break; + default: + throw new RuntimeException("Illegal Output Compression Mode"); + } - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// + builder.append("\r\n"); - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. 
- * - * @param config Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) { + String path = this.outputPathField.getText(); - Object o = config - .getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); - if (o != null) { - this.outputPathField.setText((String) o); - } else { - this.outputPathField.setText(""); - } + if (path == null || path.equals("")) { + errors.add(new ConfigItem(ConfigItemTypes.WARNING, ConfigErrorKeys.MISSING_VALUE, + "No output path has been set.")); + } - o = config - .getConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED); - if (o != null) { - controller.setEnableZipCompression((Boolean) o); - } else { - controller.setEnableZipCompression(false); - } + if (!path.endsWith(File.separator) && path.contains(File.separator)) { + path += File.separator; + } - o = config - .getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT); - if (o != null) { - controller.setEnableDataFileOutput((Boolean) o); - } else { - controller.setEnableDataFileOutput(false); - } + builder.append("\t\t\"" + path + "\"\r\n"); + + if (multiFile) { + + long sizeLimit = -1; + + String text = outputSizeLimitField.getText(); + if (text.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.MISSING_VALUE, + "The output limit is missing.")); + } + else { + try { + sizeLimit = Long.parseLong(text); + if (sizeLimit < 100 * 1024 * 1024) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, + ConfigErrorKeys.VALUE_OUT_OF_RANGE, + "The output limit has to be at" + " least 100MB")); + } + } + catch (NumberFormatException nfe) { + errors.add( + new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.ILLEGAL_INPUT, + "NumberFormatException for the" + " output limit")); + } + } + + switch (comp) { + case None: + builder.append( + "\t\t" + sizeLimit + "\r\n"); + break; + default: + builder.append("\t\t" + sizeLimit + + "\r\n"); + break; + } + } - o = config.getConfigParameter(ConfigurationKeys.MODE_OUTPUT); - if (o != null) { - switch ((OutputType) o) { - case UNCOMPRESSED: - controller.setEnableSQLDatabaseOutput(false); - controller.setOutputCompression(OutputCompressionEnum.None); - - o = config - .getConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE); - break; - case SEVENZIP: - controller.setEnableSQLDatabaseOutput(false); - controller.setOutputCompression(OutputCompressionEnum.SevenZip); - - o = config - .getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); - break; - case BZIP2: - controller.setEnableSQLDatabaseOutput(false); - controller.setOutputCompression(OutputCompressionEnum.BZip2); - - o = config - .getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); - break; - case DATABASE: - controller.setEnableSQLDatabaseOutput(true); - controller.setOutputCompression(OutputCompressionEnum.None); - - o = null; - break; - } - } + builder.append("\t\t" + zipComp + + "\r\n"); - if (o != null) { - controller.setMultipleOutputFiles(true); - this.outputSizeLimitField.setText(Long.toString((Long) o)); - } else { - controller.setMultipleOutputFiles(false); - this.outputSizeLimitField.setText(""); - } - } - - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. 
- * - * @param builder Reference to a StringBuilder object - * @param errors Reference to the ConfigVerification object - */ - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) { - - if (!controller.isEnableSQLDatabaseOutput()) { - - boolean zipComp = controller.isZipCompressionEnabled(); - boolean multiFile = controller.isMultipleOutputFiles(); - - builder.append("\t\r\n"); - builder.append("\t\t"); - - OutputCompressionEnum comp = controller.getOutputCompression(); - switch (comp) { - case None: - builder.append(OutputType.UNCOMPRESSED); - break; - case BZip2: - builder.append(OutputType.BZIP2); - break; - case SevenZip: - builder.append(OutputType.SEVENZIP); - break; - default: - throw new RuntimeException("Illegal Output Compression Mode"); - } - - builder.append("\r\n"); - - String path = this.outputPathField.getText(); - - if (path == null || path.equals("")) { - errors.add(new ConfigItem(ConfigItemTypes.WARNING, - ConfigErrorKeys.MISSING_VALUE, - "No output path has been set.")); - } - - if (!path.endsWith(File.separator) && path.contains(File.separator)) { - path += File.separator; - } - - builder.append("\t\t\"" + path + "\"\r\n"); - - if (multiFile) { - - long sizeLimit = -1; - - String text = outputSizeLimitField.getText(); - if (text.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The output limit is missing.")); - } else { - try { - sizeLimit = Long.parseLong(text); - if (sizeLimit < 100 * 1024 * 1024) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.VALUE_OUT_OF_RANGE, - "The output limit has to be at" - + " least 100MB")); + if (controller.isEnableDataFileOutput()) { + builder.append("\t\ttrue\r\n"); + } + else { + builder.append("\t\tfalse\r\n"); } - } catch (NumberFormatException nfe) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.ILLEGAL_INPUT, - "NumberFormatException for the" - + " output limit")); - } - } - switch (comp) { - case None: - builder.append("\t\t" + sizeLimit - + "\r\n"); - break; - default: - builder.append("\t\t" + sizeLimit - + "\r\n"); - break; + builder.append("\t\r\n"); } - } - - builder.append("\t\t" + zipComp - + "\r\n"); - - if (controller.isEnableDataFileOutput()) { - builder.append("\t\ttrue\r\n"); - } else { - builder.append("\t\tfalse\r\n"); - } - - builder.append("\t\r\n"); } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/SQLPanel.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/SQLPanel.java index 33a35861..9aa07cc9 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/SQLPanel.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/gui/panels/SQLPanel.java @@ -35,291 +35,298 @@ /** * Panel class of the ConfigurationTool *
- * This panel contains all components for setting configuration parameters - * related to the database output. + * This panel contains all components for setting configuration parameters related to the database + * output. */ @SuppressWarnings("serial") public class SQLPanel - extends AbstractPanel { + extends AbstractPanel +{ + + /** + * (Constructor) Create the SQLPanel object. + * + * @param controller + * Reference to the controller + */ + public SQLPanel(final ConfigController controller) + { + + super(controller); + controller.register(PanelKeys.PANEL_SQL, this); + + createSQLFields(); + createOutputSettings(); + } + + // --------------------------------------------------------------------------// + // CONSTRUCTION METHODS // + // --------------------------------------------------------------------------// + + private JCheckBox enableSQLDatabaseConnection; + private JLabel sqlHostLabel; + private JTextField sqlHostField; + private JLabel sqlDatabaseLabel; + private JTextField sqlDatabaseField; + private JLabel sqlUserLabel; + private JTextField sqlUserField; + private JLabel sqlPasswordLabel; + private JTextField sqlPasswordField; + + private JCheckBox enableZipEncodingCheckBox; + + private void createSQLFields() + { + + enableSQLDatabaseConnection = new JCheckBox("Activate Database Output"); + enableSQLDatabaseConnection.setBounds(10, 10, 200, 25); + + enableSQLDatabaseConnection.addActionListener(e -> { + boolean flag = !controller.isEnableSQLDatabaseOutput(); + controller.setEnableSQLDatabaseOutput(flag); + + validateSQLFields(); + }); + + this.add(enableSQLDatabaseConnection); + + sqlHostLabel = new JLabel("Host"); + sqlHostLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + sqlHostLabel.setBounds(10, 50, 100, 25); + this.add(sqlHostLabel); + + sqlHostField = new JTextField(); + sqlHostField.setBounds(120, 50, 100, 25); + this.add(sqlHostField); + + sqlDatabaseLabel = new JLabel("Database"); + sqlDatabaseLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + sqlDatabaseLabel.setBounds(10, 50, 100, 25); + this.add(sqlDatabaseLabel); + + sqlDatabaseField = new JTextField(); + sqlDatabaseField.setBounds(120, 50, 100, 25); + this.add(sqlDatabaseField); + + sqlUserLabel = new JLabel("User"); + sqlUserLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + sqlUserLabel.setBounds(10, 80, 100, 25); + this.add(sqlUserLabel); - /** - * (Constructor) Create the SQLPanel object. 
- * - * @param controller Reference to the controller - */ - public SQLPanel(final ConfigController controller) { + sqlUserField = new JTextField(); + sqlUserField.setBounds(120, 80, 100, 25); + this.add(sqlUserField); - super(controller); - controller.register(PanelKeys.PANEL_SQL, this); - - createSQLFields(); - createOutputSettings(); - } - - // --------------------------------------------------------------------------// - // CONSTRUCTION METHODS // - // --------------------------------------------------------------------------// - - private JCheckBox enableSQLDatabaseConnection; - private JLabel sqlHostLabel; - private JTextField sqlHostField; - private JLabel sqlDatabaseLabel; - private JTextField sqlDatabaseField; - private JLabel sqlUserLabel; - private JTextField sqlUserField; - private JLabel sqlPasswordLabel; - private JTextField sqlPasswordField; - - private JCheckBox enableZipEncodingCheckBox; - - private void createSQLFields() { - - enableSQLDatabaseConnection = new JCheckBox( - "Activate Database Output"); - enableSQLDatabaseConnection.setBounds(10, 10, 200, 25); - - enableSQLDatabaseConnection.addActionListener(e -> { - boolean flag = !controller.isEnableSQLDatabaseOutput(); - controller.setEnableSQLDatabaseOutput(flag); - - validateSQLFields(); - }); - - this.add(enableSQLDatabaseConnection); - - sqlHostLabel = new JLabel("Host"); - sqlHostLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - sqlHostLabel.setBounds(10, 50, 100, 25); - this.add(sqlHostLabel); - - sqlHostField = new JTextField(); - sqlHostField.setBounds(120, 50, 100, 25); - this.add(sqlHostField); - - sqlDatabaseLabel = new JLabel("Database"); - sqlDatabaseLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - sqlDatabaseLabel.setBounds(10, 50, 100, 25); - this.add(sqlDatabaseLabel); - - sqlDatabaseField = new JTextField(); - sqlDatabaseField.setBounds(120, 50, 100, 25); - this.add(sqlDatabaseField); - - sqlUserLabel = new JLabel("User"); - sqlUserLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - sqlUserLabel.setBounds(10, 80, 100, 25); - this.add(sqlUserLabel); - - sqlUserField = new JTextField(); - sqlUserField.setBounds(120, 80, 100, 25); - this.add(sqlUserField); - - sqlPasswordLabel = new JLabel("Password"); - sqlPasswordLabel.setBorder(BorderFactory.createRaisedBevelBorder()); - sqlPasswordLabel.setBounds(10, 110, 100, 25); - this.add(sqlPasswordLabel); - - sqlPasswordField = new JTextField(); - sqlPasswordField.setBounds(120, 110, 100, 25); - this.add(sqlPasswordField); - } - - private void createOutputSettings() { - - enableZipEncodingCheckBox = new JCheckBox("Activate Zip Encoding"); - enableZipEncodingCheckBox.setBounds(10, 160, 200, 25); - - enableZipEncodingCheckBox.addActionListener(e -> { - - boolean flag = !controller.isZipCompressionEnabled(); - controller.setEnableZipCompression(flag); - - validateSettings(); - }); - - this.add(enableZipEncodingCheckBox); - } - - // --------------------------------------------------------------------------// - // VALIDATION METHODS // - // --------------------------------------------------------------------------// - - /** - * A call of this method should validate the status of the panels - * components. - */ - @Override - public void validate() { - validateSQLFields(); - validateSettings(); - } - - /** - * Validates the Settings. - */ - private void validateSettings() { - enableZipEncodingCheckBox.setSelected(controller - .isZipCompressionEnabled()); - } - - /** - * Validates the UNCOMPRESSED Settings. 
- */ - private void validateSQLFields() { - - boolean flag = controller.isEnableSQLDatabaseOutput(); - - enableSQLDatabaseConnection.setSelected(flag); - - sqlHostLabel.setEnabled(flag); - sqlHostField.setEnabled(flag); - sqlDatabaseLabel.setEnabled(flag); - sqlDatabaseField.setEnabled(flag); - sqlUserLabel.setEnabled(flag); - sqlUserField.setEnabled(flag); - sqlPasswordLabel.setEnabled(flag); - sqlPasswordField.setEnabled(flag); - - enableZipEncodingCheckBox.setEnabled(flag); - } - - /** - * A call of this method should validate the positions of the panels - * components. - */ - @Override - public void relocate() { - - int w = 200, h = 235; - - int x = (this.getWidth() - w) / 2; - int y = (this.getHeight() - h) / 2; - - enableSQLDatabaseConnection.setLocation(x, y); - sqlHostLabel.setLocation(x, y + 40); - sqlHostField.setLocation(x + 110, y + 40); - sqlDatabaseLabel.setLocation(x, y + 70); - sqlDatabaseField.setLocation(x + 110, y + 70); - sqlUserLabel.setLocation(x, y + 100); - sqlUserField.setLocation(x + 110, y + 100); - sqlPasswordLabel.setLocation(x, y + 130); - sqlPasswordField.setLocation(x + 110, y + 130); - enableZipEncodingCheckBox.setLocation(x, y + 180); - } - - // --------------------------------------------------------------------------// - // INPUT/OUTPUT METHODS // - // --------------------------------------------------------------------------// - - /** - * Reads the configuration parameters described in the panel from the - * ConfigSettings and and sets the contained values. - * - * @param config Reference to the ConfigSettings object - */ - @Override - public void applyConfig(final ConfigSettings config) { - - Object o = config.getConfigParameter(ConfigurationKeys.MODE_OUTPUT); - if ((OutputType) o == OutputType.DATABASE) { - controller.setEnableSQLDatabaseOutput(true); + sqlPasswordLabel = new JLabel("Password"); + sqlPasswordLabel.setBorder(BorderFactory.createRaisedBevelBorder()); + sqlPasswordLabel.setBounds(10, 110, 100, 25); + this.add(sqlPasswordLabel); + + sqlPasswordField = new JTextField(); + sqlPasswordField.setBounds(120, 110, 100, 25); + this.add(sqlPasswordField); } - o = config.getConfigParameter(ConfigurationKeys.SQL_HOST); - if (o != null) { - this.sqlHostField.setText((String) o); - } else { - this.sqlHostField.setText(""); + private void createOutputSettings() + { + + enableZipEncodingCheckBox = new JCheckBox("Activate Zip Encoding"); + enableZipEncodingCheckBox.setBounds(10, 160, 200, 25); + + enableZipEncodingCheckBox.addActionListener(e -> { + + boolean flag = !controller.isZipCompressionEnabled(); + controller.setEnableZipCompression(flag); + + validateSettings(); + }); + + this.add(enableZipEncodingCheckBox); } - o = config.getConfigParameter(ConfigurationKeys.SQL_DATABASE); - if (o != null) { - this.sqlDatabaseField.setText((String) o); - } else { - this.sqlDatabaseField.setText(""); + // --------------------------------------------------------------------------// + // VALIDATION METHODS // + // --------------------------------------------------------------------------// + + /** + * A call of this method should validate the status of the panels components. + */ + @Override + public void validate() + { + validateSQLFields(); + validateSettings(); } - o = config.getConfigParameter(ConfigurationKeys.SQL_USERNAME); - if (o != null) { - this.sqlUserField.setText((String) o); - } else { - this.sqlUserField.setText(""); + /** + * Validates the Settings. 
+ */ + private void validateSettings() + { + enableZipEncodingCheckBox.setSelected(controller.isZipCompressionEnabled()); } - o = config.getConfigParameter(ConfigurationKeys.SQL_PASSWORD); - if (o != null) { - this.sqlPasswordField.setText((String) o); - } else { - this.sqlPasswordField.setText(""); + /** + * Validates the UNCOMPRESSED Settings. + */ + private void validateSQLFields() + { + + boolean flag = controller.isEnableSQLDatabaseOutput(); + + enableSQLDatabaseConnection.setSelected(flag); + + sqlHostLabel.setEnabled(flag); + sqlHostField.setEnabled(flag); + sqlDatabaseLabel.setEnabled(flag); + sqlDatabaseField.setEnabled(flag); + sqlUserLabel.setEnabled(flag); + sqlUserField.setEnabled(flag); + sqlPasswordLabel.setEnabled(flag); + sqlPasswordField.setEnabled(flag); + + enableZipEncodingCheckBox.setEnabled(flag); } - o = config - .getConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED); - if (o != null) { - controller.setEnableZipCompression((Boolean) o); - } else { - controller.setEnableZipCompression(false); + /** + * A call of this method should validate the positions of the panels components. + */ + @Override + public void relocate() + { + + int w = 200, h = 235; + + int x = (this.getWidth() - w) / 2; + int y = (this.getHeight() - h) / 2; + + enableSQLDatabaseConnection.setLocation(x, y); + sqlHostLabel.setLocation(x, y + 40); + sqlHostField.setLocation(x + 110, y + 40); + sqlDatabaseLabel.setLocation(x, y + 70); + sqlDatabaseField.setLocation(x + 110, y + 70); + sqlUserLabel.setLocation(x, y + 100); + sqlUserField.setLocation(x + 110, y + 100); + sqlPasswordLabel.setLocation(x, y + 130); + sqlPasswordField.setLocation(x + 110, y + 130); + enableZipEncodingCheckBox.setLocation(x, y + 180); } - } - - /** - * Adds the xml description of the panels content to the StringBuilder. - * Errors which occur during the xml transformation will be added to the - * ConfigVerification. 
- * - * @param builder Reference to a StringBuilder object - * @param errors Reference to the ConfigVerification object - */ - @Override - public void toXML(final StringBuilder builder, - final ConfigVerification errors) { - - if (controller.isEnableSQLDatabaseOutput()) { - - String database, user, password, host; - - host = sqlHostField.getText(); - if (host.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The name of the sqlproducer-host is missing.")); - } - - database = sqlDatabaseField.getText(); - if (database.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The name of the sqlproducer-database is missing.")); - } - - user = sqlUserField.getText(); - if (database.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The name of the sqlproducer-user is missing.")); - } - - password = sqlPasswordField.getText(); - if (password.length() == 0) { - errors.add(new ConfigItem(ConfigItemTypes.ERROR, - ConfigErrorKeys.MISSING_VALUE, - "The password of the sqlproducer-user is missing.")); - } - - boolean zipComp = controller.isZipCompressionEnabled(); - - builder.append("\t\r\n"); - builder.append("\t\t" + OutputType.DATABASE - + "\r\n"); - builder.append("\t\t\t\r\n"); - builder.append("\t\t\t\t" + host + "\r\n"); - builder.append("\t\t\t\t" + database + "\r\n"); - builder.append("\t\t\t\t" + user + "\r\n"); - builder.append("\t\t\t\t" + password + "\r\n"); - builder.append("\t\t\t\r\n"); - builder.append("\t\t" + zipComp - + "\r\n"); - builder.append("\t\r\n"); + + // --------------------------------------------------------------------------// + // INPUT/OUTPUT METHODS // + // --------------------------------------------------------------------------// + + /** + * Reads the configuration parameters described in the panel from the ConfigSettings and and + * sets the contained values. + * + * @param config + * Reference to the ConfigSettings object + */ + @Override + public void applyConfig(final ConfigSettings config) + { + + Object o = config.getConfigParameter(ConfigurationKeys.MODE_OUTPUT); + if ((OutputType) o == OutputType.DATABASE) { + controller.setEnableSQLDatabaseOutput(true); + } + + o = config.getConfigParameter(ConfigurationKeys.SQL_HOST); + if (o != null) { + this.sqlHostField.setText((String) o); + } + else { + this.sqlHostField.setText(""); + } + + o = config.getConfigParameter(ConfigurationKeys.SQL_DATABASE); + if (o != null) { + this.sqlDatabaseField.setText((String) o); + } + else { + this.sqlDatabaseField.setText(""); + } + + o = config.getConfigParameter(ConfigurationKeys.SQL_USERNAME); + if (o != null) { + this.sqlUserField.setText((String) o); + } + else { + this.sqlUserField.setText(""); + } + + o = config.getConfigParameter(ConfigurationKeys.SQL_PASSWORD); + if (o != null) { + this.sqlPasswordField.setText((String) o); + } + else { + this.sqlPasswordField.setText(""); + } + + o = config.getConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED); + if (o != null) { + controller.setEnableZipCompression((Boolean) o); + } + else { + controller.setEnableZipCompression(false); + } + } + + /** + * Adds the xml description of the panels content to the StringBuilder. Errors which occur + * during the xml transformation will be added to the ConfigVerification. 
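The applyConfig() implementation above repeats one pattern for every key: read the parameter and fall back to a default when it is absent. A hedged illustration only; the helper below is hypothetical and assumes ConfigurationKeys can be used as a parameter type, which the patch does not show:

    // Hypothetical helper: the read-with-fallback pattern that applyConfig() repeats
    // for each ConfigurationKeys entry; a missing parameter simply clears the text field.
    private void applyTextParameter(final ConfigSettings config, final ConfigurationKeys key,
            final JTextField field)
    {
        Object o = config.getConfigParameter(key);
        field.setText(o != null ? (String) o : "");
    }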
+ * + * @param builder + * Reference to a StringBuilder object + * @param errors + * Reference to the ConfigVerification object + */ + @Override + public void toXML(final StringBuilder builder, final ConfigVerification errors) + { + + if (controller.isEnableSQLDatabaseOutput()) { + + String database, user, password, host; + + host = sqlHostField.getText(); + if (host.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.MISSING_VALUE, + "The name of the sqlproducer-host is missing.")); + } + + database = sqlDatabaseField.getText(); + if (database.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.MISSING_VALUE, + "The name of the sqlproducer-database is missing.")); + } + + user = sqlUserField.getText(); + if (database.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.MISSING_VALUE, + "The name of the sqlproducer-user is missing.")); + } + + password = sqlPasswordField.getText(); + if (password.length() == 0) { + errors.add(new ConfigItem(ConfigItemTypes.ERROR, ConfigErrorKeys.MISSING_VALUE, + "The password of the sqlproducer-user is missing.")); + } + + boolean zipComp = controller.isZipCompressionEnabled(); + + builder.append("\t\r\n"); + builder.append("\t\t" + OutputType.DATABASE + "\r\n"); + builder.append("\t\t\t\r\n"); + builder.append("\t\t\t\t" + host + "\r\n"); + builder.append("\t\t\t\t" + database + "\r\n"); + builder.append("\t\t\t\t" + user + "\r\n"); + builder.append("\t\t\t\t" + password + "\r\n"); + builder.append("\t\t\t\r\n"); + builder.append("\t\t" + zipComp + + "\r\n"); + builder.append("\t\r\n"); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/simpleconfig/SimpleConfig.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/simpleconfig/SimpleConfig.java index 94dff615..2ea681e8 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/simpleconfig/SimpleConfig.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/config/simpleconfig/SimpleConfig.java @@ -20,21 +20,23 @@ import org.dkpro.jwpl.revisionmachine.difftool.config.gui.control.ConfigController; /** - * This class is an alternative to the ConfigGUI and can be used to produce - * configuration files for the DiffTool. + * This class is an alternative to the ConfigGUI and can be used to produce configuration files for + * the DiffTool. */ -public class SimpleConfig { - /** - * Reference to the ConfigController - */ - private final ConfigController controller; +public class SimpleConfig +{ + /** + * Reference to the ConfigController + */ + private final ConfigController controller; - /** - * (Constructor) Creates a new ConfigGUI object. - */ - public SimpleConfig() { - this.controller = new ConfigController(); - controller.defaultConfiguration(); - //TODO nothing here yet... - } + /** + * (Constructor) Creates a new ConfigGUI object. + */ + public SimpleConfig() + { + this.controller = new ConfigController(); + controller.defaultConfiguration(); + // TODO nothing here yet... 
+ } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/ArticleReaderInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/ArticleReaderInterface.java index cc390e12..691076ba 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/ArticleReaderInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/ArticleReaderInterface.java @@ -24,38 +24,40 @@ /** * This interface represents the link to the input. */ -public interface ArticleReaderInterface { +public interface ArticleReaderInterface +{ - /** - * Determines whether another task is available or not. - *
- * This method has to be called before calling the next() method. - * - * @return TRUE | FALSE - * @throws ArticleReaderException if the parsing of the input fails - */ - boolean hasNext() throws ArticleReaderException; + /** + * Determines whether another task is available or not. + *
+ * This method has to be called before calling the next() method. + * + * @return TRUE | FALSE + * @throws ArticleReaderException + * if the parsing of the input fails + */ + boolean hasNext() throws ArticleReaderException; - /** - * Returns the next RevisionTask. - * - * @return RevisionTask. - * @throws ArticleReaderException if the parsing of the input fails - */ - Task next() throws ArticleReaderException; + /** + * Returns the next RevisionTask. + * + * @return RevisionTask. + * @throws ArticleReaderException + * if the parsing of the input fails + */ + Task next() throws ArticleReaderException; - /** - * Resets the task processing status of the ArticleReader. - *
- * This method has to be called if the hasNext() or next() methods throw an - * exception. - */ - void resetTaskCompleted(); + /** + * Resets the task processing status of the ArticleReader. + *
+ * This method has to be called if the hasNext() or next() methods throw an exception. + */ + void resetTaskCompleted(); - /** - * Returns the number of bytes that the ArticleReader has processed. - * - * @return number of bytes (current position in the file / archive) - */ - long getBytePosition(); + /** + * Returns the number of bytes that the ArticleReader has processed. + * + * @return number of bytes (current position in the file / archive) + */ + long getBytePosition(); } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/ArticleFilter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/ArticleFilter.java index 8ea292f2..0ec6133b 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/ArticleFilter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/ArticleFilter.java @@ -29,135 +29,146 @@ /** * Filter articles from unwanted namespaces.
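The ArticleReaderInterface contract just shown (hasNext() before next(), resetTaskCompleted() after a failure) is easiest to see in a consumer loop. A minimal usage sketch; the surrounding method, its name and its error handling are assumptions, not taken from the patch:

    void consume(ArticleReaderInterface reader) throws ArticleReaderException
    {
        while (reader.hasNext()) {               // must be called before next()
            try {
                Task task = reader.next();       // one revision task per call
                // ... hand the task to a consumer ...
            }
            catch (ArticleReaderException e) {
                reader.resetTaskCompleted();     // required after hasNext()/next() failures
                throw e;
            }
        }
    }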
- * The namespaces are read in from the {@code <siteinfo>} of the Wikipedia dump. The
- * corresponding prefixes of the language version are then used by the filter to
- * determine whether an article is part of an unwanted namespace or not.
+ * The namespaces are read in from the {@code <siteinfo>} of the Wikipedia dump. The corresponding
+ * prefixes of the language version are then used by the filter to determine whether an article is
+ * part of an unwanted namespace or not.
*
- * If the ArticleFilter is not initialized or given an empty list of namespaces, - * nothing is filtered at all. + * If the ArticleFilter is not initialized or given an empty list of namespaces, nothing is filtered + * at all. */ -public class ArticleFilter { - private Map namespaceMap; +public class ArticleFilter +{ + private Map namespaceMap; - private Set prefixesToAllow; + private Set prefixesToAllow; - private Set prefixesToReject; + private Set prefixesToReject; - private final Collection allowedNamespaces; + private final Collection allowedNamespaces; - private boolean excludeMainNamespace; + private boolean excludeMainNamespace; - private final int MAIN_NAMESPACE = 0; + private final int MAIN_NAMESPACE = 0; - private static ConfigurationManager config; + private static ConfigurationManager config; - static { - try { - config = ConfigurationManager.getInstance(); - } catch (ConfigurationException e) { - // TODO logger - System.err.print(e); + static { + try { + config = ConfigurationManager.getInstance(); + } + catch (ConfigurationException e) { + // TODO logger + System.err.print(e); + } } - } - - /** - * Creates an ArticleFilter that uses configuration file to filter prefixes - * - * @throws ConfigurationException - */ - @SuppressWarnings("unchecked") - public ArticleFilter() throws ConfigurationException { - this((Set) config.getConfigParameter(ConfigurationKeys.NAMESPACES_TO_KEEP)); - } - - /** - * Creates a new filter that filters all pages except the namespaces - * provided in the namespaceWhitelist - * - * @param namespaceWhitelist list of namespaces that should NOT be filtered - */ - public ArticleFilter(Collection namespaceWhitelist) { - this.allowedNamespaces = namespaceWhitelist; - - if (!this.allowedNamespaces.contains(MAIN_NAMESPACE)) { - this.excludeMainNamespace = true; + + /** + * Creates an ArticleFilter that uses configuration file to filter prefixes + * + * @throws ConfigurationException + */ + @SuppressWarnings("unchecked") + public ArticleFilter() throws ConfigurationException + { + this((Set) config.getConfigParameter(ConfigurationKeys.NAMESPACES_TO_KEEP)); } - } - - /** - * Initialized the Namespace-Prefix mapping for the current language version - * of Wikipedia. - * - * @param namespaceMap mapping of namespace ids to the corresponding article title - * prefixes - */ - public void initializeNamespaces(Map namespaceMap) { - this.namespaceMap = namespaceMap; - initializePrefixes(); - } - - /** - * Initialize allowed and restricted prefixes - */ - private void initializePrefixes() { - if (namespaceMap == null) { - // TODO use logger - System.err.println("Cannot use whitespace filter without initializing the namespace-prefix map for the " + - "current Wikipedia language version. 
DISABLING FILTER."); - } else { - prefixesToAllow = new HashSet<>(); - prefixesToReject = new HashSet<>(); - - for (Entry namespace : namespaceMap.entrySet()) { - if (allowedNamespaces.contains(namespace.getKey())) { - prefixesToAllow.add(namespace.getValue() + ":"); - } else { - prefixesToReject.add(namespace.getValue() + ":"); + /** + * Creates a new filter that filters all pages except the namespaces provided in the + * namespaceWhitelist + * + * @param namespaceWhitelist + * list of namespaces that should NOT be filtered + */ + public ArticleFilter(Collection namespaceWhitelist) + { + this.allowedNamespaces = namespaceWhitelist; + + if (!this.allowedNamespaces.contains(MAIN_NAMESPACE)) { + this.excludeMainNamespace = true; } - } - } - } - - /** - * Filter any pages by title prefixes - * - * @param title the page title - * @return true, if the page should be used. false, else - */ - public boolean checkArticle(String title) { - // if filter isn't initialized, do not filter at all - if (namespaceMap == null || namespaceMap.size() == 0 - || allowedNamespaces == null || allowedNamespaces.size() == 0) { - return true; + } - // else, do filter - else { - // perform filtering + /** + * Initialized the Namespace-Prefix mapping for the current language version of Wikipedia. + * + * @param namespaceMap + * mapping of namespace ids to the corresponding article title prefixes + */ + public void initializeNamespaces(Map namespaceMap) + { + this.namespaceMap = namespaceMap; + initializePrefixes(); + } - // reject restricted titles - for (String str : prefixesToReject) { - if (title.startsWith(str)) { - return false; + /** + * Initialize allowed and restricted prefixes + */ + private void initializePrefixes() + { + if (namespaceMap == null) { + // TODO use logger + System.err.println( + "Cannot use whitespace filter without initializing the namespace-prefix map for the " + + "current Wikipedia language version. DISABLING FILTER."); + } + else { + prefixesToAllow = new HashSet<>(); + prefixesToReject = new HashSet<>(); + + for (Entry namespace : namespaceMap.entrySet()) { + if (allowedNamespaces.contains(namespace.getKey())) { + prefixesToAllow.add(namespace.getValue() + ":"); + } + else { + prefixesToReject.add(namespace.getValue() + ":"); + } + } } - } + } - for (String str : prefixesToAllow) { - // allows allowed prefixes - if (title.startsWith(str)) { - return true; + /** + * Filter any pages by title prefixes + * + * @param title + * the page title + * @return true, if the page should be used. 
false, else + */ + public boolean checkArticle(String title) + { + // if filter isn't initialized, do not filter at all + if (namespaceMap == null || namespaceMap.size() == 0 || allowedNamespaces == null + || allowedNamespaces.size() == 0) { + return true; } - // special case for Main Namespace(Main Namespace has not any - // prefixes) - if (excludeMainNamespace) { - return false; + // else, do filter + else { + + // perform filtering + + // reject restricted titles + for (String str : prefixesToReject) { + if (title.startsWith(str)) { + return false; + } + } + + for (String str : prefixesToAllow) { + // allows allowed prefixes + if (title.startsWith(str)) { + return true; + } + // special case for Main Namespace(Main Namespace has not any + // prefixes) + if (excludeMainNamespace) { + return false; + } + + } + + return true; } - - } - - return true; } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/InputFactory.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/InputFactory.java index 797fa128..49d23e9c 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/InputFactory.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/InputFactory.java @@ -38,171 +38,199 @@ *
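How the ArticleFilter above decides can be seen from a small example. Illustrative only; the whitelist, namespace ids and prefixes are invented for the sketch and are not part of the patch:

    // With a whitelist of {0} (main namespace only) and two mapped namespaces,
    // checkArticle() keeps plain titles and drops titles with a rejected prefix.
    static void demoFilter()
    {
        ArticleFilter filter = new ArticleFilter(java.util.Set.of(0));
        filter.initializeNamespaces(java.util.Map.of(1, "Talk", 2, "User"));
        boolean keep = filter.checkArticle("Berlin");       // true: no prefix, main namespace kept
        boolean drop = filter.checkArticle("Talk:Berlin");  // false: "Talk:" is a rejected prefix
    }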
* TODO: Add support for alternative commandlines */ -public class InputFactory { - - /** - * Configuration parameter - Path to the 7Zip executable - */ - private static String PATH_PROGRAM_7ZIP = null; - - /** - * Configuration parameter - Charset name of the input data - */ - private static String WIKIPEDIA_ENCODING = null; - - private static ConfigurationManager config = null; - - /** - * Configuration parameter - Flag, that indicates whether the statistical - * output is enabled or not - */ - private static boolean MODE_STATISTICAL_OUTPUT = false; - - static { - try { - config = ConfigurationManager.getInstance(); - - WIKIPEDIA_ENCODING = (String) config.getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - MODE_STATISTICAL_OUTPUT = (Boolean) config.getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - - } catch (ConfigurationException e) { - e.printStackTrace(); - System.exit(-1); - } - } - - /** - * No object - Utility class - */ - private InputFactory() { - } - - /** - * Starts a decompression process using the 7Zip program. - * - * @param archivePath path to the archive - * @return InputStreamReader - * @throws ConfigurationException if an error occurred while accessing the configuration - */ - private static InputStreamReader decompressWith7Zip(final String archivePath) throws ConfigurationException { - PATH_PROGRAM_7ZIP = (String) config.getConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP); - - if (PATH_PROGRAM_7ZIP == null) { - throw ErrorFactory.createConfigurationException(ErrorKeys.CONFIGURATION_PARAMETER_UNDEFINED); +public class InputFactory +{ + + /** + * Configuration parameter - Path to the 7Zip executable + */ + private static String PATH_PROGRAM_7ZIP = null; + + /** + * Configuration parameter - Charset name of the input data + */ + private static String WIKIPEDIA_ENCODING = null; + + private static ConfigurationManager config = null; + + /** + * Configuration parameter - Flag, that indicates whether the statistical output is enabled or + * not + */ + private static boolean MODE_STATISTICAL_OUTPUT = false; + + static { + try { + config = ConfigurationManager.getInstance(); + + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + MODE_STATISTICAL_OUTPUT = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + + } + catch (ConfigurationException e) { + e.printStackTrace(); + System.exit(-1); + } } - try { - Runtime runtime = Runtime.getRuntime(); - Process p = runtime.exec(PATH_PROGRAM_7ZIP + " e " + archivePath + " -so"); - - return new InputStreamReader(p.getInputStream(), WIKIPEDIA_ENCODING); - - } catch (Exception e) { - throw new RuntimeException(e); + /** + * No object - Utility class + */ + private InputFactory() + { } - } - - /** - * Starts a decompression process using the BZip2 program. - * - * @param archivePath path to the archive - * @return InputStreamReader - */ - private static InputStreamReader decompressWithBZip2(final String archivePath) { - - Bzip2Archiver archiver = new Bzip2Archiver(); - InputStreamReader reader = null; - try { - reader = archiver.getDecompressionStream(archivePath, WIKIPEDIA_ENCODING); - } catch (IOException e) { - - e.printStackTrace(); - } - - return reader; - } - /** - * Creates a reader for the xml file. 
- * - * @param archivePath path to the xml file - * @return InputStreamReader - */ - private static InputStreamReader readXMLFile(final String archivePath) { - - try { - return new InputStreamReader(new BufferedInputStream(new FileInputStream(archivePath)), WIKIPEDIA_ENCODING); - - } catch (Exception e) { - throw new RuntimeException(e); + /** + * Starts a decompression process using the 7Zip program. + * + * @param archivePath + * path to the archive + * @return InputStreamReader + * @throws ConfigurationException + * if an error occurred while accessing the configuration + */ + private static InputStreamReader decompressWith7Zip(final String archivePath) + throws ConfigurationException + { + PATH_PROGRAM_7ZIP = (String) config.getConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP); + + if (PATH_PROGRAM_7ZIP == null) { + throw ErrorFactory + .createConfigurationException(ErrorKeys.CONFIGURATION_PARAMETER_UNDEFINED); + } + + try { + Runtime runtime = Runtime.getRuntime(); + Process p = runtime.exec(PATH_PROGRAM_7ZIP + " e " + archivePath + " -so"); + + return new InputStreamReader(p.getInputStream(), WIKIPEDIA_ENCODING); + + } + catch (Exception e) { + throw new RuntimeException(e); + } } - } - - /** - * Returns an ArticleReader which reads the specified input file. - * - * @param archive input file - * @return ArticleReaderInterface - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws ArticleReaderException if an error occurred while parsing the file - */ - public static ArticleReaderInterface getTaskReader(final ArchiveDescription archive) - throws ConfigurationException, ArticleReaderException { - Reader reader; - - switch (archive.getType()) { - case XML: - reader = readXMLFile(archive.getPath()); - break; - case SEVENZIP: - reader = decompressWith7Zip(archive.getPath()); - break; - case BZIP2: - reader = decompressWithBZip2(archive.getPath()); - break; - default: - throw ErrorFactory.createArticleReaderException( - ErrorKeys.DELTA_CONSUMERS_TASK_READER_INPUTFACTORY_ILLEGAL_INPUTMODE_VALUE); + + /** + * Starts a decompression process using the BZip2 program. + * + * @param archivePath + * path to the archive + * @return InputStreamReader + */ + private static InputStreamReader decompressWithBZip2(final String archivePath) + { + + Bzip2Archiver archiver = new Bzip2Archiver(); + InputStreamReader reader = null; + try { + reader = archiver.getDecompressionStream(archivePath, WIKIPEDIA_ENCODING); + } + catch (IOException e) { + + e.printStackTrace(); + } + + return reader; } - if (MODE_STATISTICAL_OUTPUT) { - return new TimedWikipediaXMLReader(reader); + /** + * Creates a reader for the xml file. + * + * @param archivePath + * path to the xml file + * @return InputStreamReader + */ + private static InputStreamReader readXMLFile(final String archivePath) + { + + try { + return new InputStreamReader(new BufferedInputStream(new FileInputStream(archivePath)), + WIKIPEDIA_ENCODING); + + } + catch (Exception e) { + throw new RuntimeException(e); + } } - return new WikipediaXMLReader(reader); - } - - /** - * Returns an ArticleReader which reads the specified input file. 
- * - * @param archive input file - * @param checker the article filter - * @return ArticleReaderInterface - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws ArticleReaderException if an error occurred while parsing the file - */ - public static ArticleReaderInterface getTaskReader(final ArchiveDescription archive, final ArticleFilter checker) - throws ConfigurationException, ArticleReaderException { - Reader reader; - - //TODO add support for (compressed) XMLdumps that are stored in multiple archives - switch (archive.getType()) { - case XML: - reader = readXMLFile(archive.getPath()); - break; - case SEVENZIP: - reader = decompressWith7Zip(archive.getPath()); - break; - case BZIP2: - reader = decompressWithBZip2(archive.getPath()); - break; - default: - throw ErrorFactory - .createArticleReaderException(ErrorKeys.DELTA_CONSUMERS_TASK_READER_INPUTFACTORY_ILLEGAL_INPUTMODE_VALUE); + + /** + * Returns an ArticleReader which reads the specified input file. + * + * @param archive + * input file + * @return ArticleReaderInterface + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws ArticleReaderException + * if an error occurred while parsing the file + */ + public static ArticleReaderInterface getTaskReader(final ArchiveDescription archive) + throws ConfigurationException, ArticleReaderException + { + Reader reader; + + switch (archive.getType()) { + case XML: + reader = readXMLFile(archive.getPath()); + break; + case SEVENZIP: + reader = decompressWith7Zip(archive.getPath()); + break; + case BZIP2: + reader = decompressWithBZip2(archive.getPath()); + break; + default: + throw ErrorFactory.createArticleReaderException( + ErrorKeys.DELTA_CONSUMERS_TASK_READER_INPUTFACTORY_ILLEGAL_INPUTMODE_VALUE); + } + + if (MODE_STATISTICAL_OUTPUT) { + return new TimedWikipediaXMLReader(reader); + } + return new WikipediaXMLReader(reader); } - if (MODE_STATISTICAL_OUTPUT) { - return new TimedWikipediaXMLReader(reader, checker); + /** + * Returns an ArticleReader which reads the specified input file. 
+ * + * @param archive + * input file + * @param checker + * the article filter + * @return ArticleReaderInterface + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws ArticleReaderException + * if an error occurred while parsing the file + */ + public static ArticleReaderInterface getTaskReader(final ArchiveDescription archive, + final ArticleFilter checker) + throws ConfigurationException, ArticleReaderException + { + Reader reader; + + // TODO add support for (compressed) XMLdumps that are stored in multiple archives + switch (archive.getType()) { + case XML: + reader = readXMLFile(archive.getPath()); + break; + case SEVENZIP: + reader = decompressWith7Zip(archive.getPath()); + break; + case BZIP2: + reader = decompressWithBZip2(archive.getPath()); + break; + default: + throw ErrorFactory.createArticleReaderException( + ErrorKeys.DELTA_CONSUMERS_TASK_READER_INPUTFACTORY_ILLEGAL_INPUTMODE_VALUE); + } + + if (MODE_STATISTICAL_OUTPUT) { + return new TimedWikipediaXMLReader(reader, checker); + } + return new WikipediaXMLReader(reader, checker); } - return new WikipediaXMLReader(reader, checker); - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/TimedWikipediaXMLReader.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/TimedWikipediaXMLReader.java index e69c3e18..644f1963 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/TimedWikipediaXMLReader.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/TimedWikipediaXMLReader.java @@ -28,167 +28,186 @@ import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.info.ArticleInformation; /** - * This version of the WikipediaXMLReader collects statistical information - * when it is running. Besides that, it does the same as WikipediaXMLReader. + * This version of the WikipediaXMLReader collects statistical information when it is running. + * Besides that, it does the same as WikipediaXMLReader. */ -public class TimedWikipediaXMLReader extends WikipediaXMLReader { - - /** - * Temporary variable - start position of the article - */ - private long taskStartPosition; - - /** - * Temporary variable - time the parsing of the article started - */ - private long startTime; - - /** - * Temporary variable - time needed to parse the article - */ - private long processingTimeRead; - - /** - * Temporary variable - number of parsed revisions - */ - private int readRevisionCounter; - - /** - * Temporary variable - The time the task entered the system - */ - private long enteringTime; - - /** - * Temporary variable - Flag which indicates that the last task was - * completed - */ - private boolean lastTaskCompleted; - - /** - * (Constructor) Creates a new WikipediaXMLReader. - * - * @param input Reference to the reader - * @throws ConfigurationException if an error occurred while accessing the configuration - */ - public TimedWikipediaXMLReader(final Reader input) throws ConfigurationException { - - super(input); - this.lastTaskCompleted = true; - } - - /** - * (Constructor) Creates a new TimedWikipediaXMLReader. 
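InputFactory.getTaskReader() above selects the decompression strategy from the archive type and wraps the resulting stream in a WikipediaXMLReader, or in the timed variant when statistical output is enabled. A usage sketch under the assumption that the caller already has an ArchiveDescription; the wrapping method and its name are invented:

    static ArticleReaderInterface openDump(final ArchiveDescription archive)
        throws ConfigurationException, ArticleReaderException
    {
        ArticleFilter filter = new ArticleFilter();  // namespace whitelist from the configuration
        return InputFactory.getTaskReader(archive, filter);
    }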
- * - * @param input Reference to the reader - * @param articleNameChecker Reference to a name checker - * @throws ConfigurationException if an error occurred while accessing the configuration - */ - public TimedWikipediaXMLReader(final Reader input, final ArticleFilter articleNameChecker) - throws ConfigurationException { - - super(input, articleNameChecker); - } - - /** - * Reads the header of an article. - * - * @return FALSE if the article was not accepted by the articleNameChecker - * TRUE if no name checker was used, or if the articleNameChecker - * accepted the ArticleName - * @throws IOException if an error occurs while reading from the input - * @throws ArticleReaderException if an error occurs while parsing the input - */ - @Override - protected boolean readHeader() throws IOException, ArticleReaderException { - this.enteringTime = startTime; - return super.readHeader(); - } - - /** - * Reads a single revision from an article. - * - * @return Revision - * @throws IOException if an error occurs while reading from the input - * @throws ArticleReaderException if an error occurs while parsing the input - */ - @Override - protected Revision readRevision() throws IOException, ArticleReaderException { - - Revision rev = super.readRevision(); - this.readRevisionCounter++; - return rev; - } - - /** - * Determines whether another task is available or not. - *
- * This method has to be called before calling the next() method. - * - * @return TRUE | FALSE - * @throws ArticleReaderException if the parsing of the input fails - */ - @Override - public boolean hasNext() throws ArticleReaderException { - - if (super.hasNext()) { - - if (lastTaskCompleted) { - this.taskStartPosition = this.getBytePosition(); - this.processingTimeRead = 0; - this.readRevisionCounter = 0; - this.lastTaskCompleted = false; - } - - return true; +public class TimedWikipediaXMLReader + extends WikipediaXMLReader +{ + + /** + * Temporary variable - start position of the article + */ + private long taskStartPosition; + + /** + * Temporary variable - time the parsing of the article started + */ + private long startTime; + + /** + * Temporary variable - time needed to parse the article + */ + private long processingTimeRead; + + /** + * Temporary variable - number of parsed revisions + */ + private int readRevisionCounter; + + /** + * Temporary variable - The time the task entered the system + */ + private long enteringTime; + + /** + * Temporary variable - Flag which indicates that the last task was completed + */ + private boolean lastTaskCompleted; + + /** + * (Constructor) Creates a new WikipediaXMLReader. + * + * @param input + * Reference to the reader + * @throws ConfigurationException + * if an error occurred while accessing the configuration + */ + public TimedWikipediaXMLReader(final Reader input) throws ConfigurationException + { + + super(input); + this.lastTaskCompleted = true; } - return false; - } - /** - * Returns the next RevisionTask. - * - * @return RevisionTask. - * @throws ArticleReaderException if the parsing of the input fails - */ - @Override - public Task next() throws ArticleReaderException { - this.startTime = System.currentTimeMillis(); - - Task task = super.next(); + /** + * (Constructor) Creates a new TimedWikipediaXMLReader. + * + * @param input + * Reference to the reader + * @param articleNameChecker + * Reference to a name checker + * @throws ConfigurationException + * if an error occurred while accessing the configuration + */ + public TimedWikipediaXMLReader(final Reader input, final ArticleFilter articleNameChecker) + throws ConfigurationException + { + + super(input, articleNameChecker); + } - processingTimeRead += System.currentTimeMillis() - startTime; + /** + * Reads the header of an article. + * + * @return FALSE if the article was not accepted by the articleNameChecker TRUE if no name + * checker was used, or if the articleNameChecker accepted the ArticleName + * @throws IOException + * if an error occurs while reading from the input + * @throws ArticleReaderException + * if an error occurs while parsing the input + */ + @Override + protected boolean readHeader() throws IOException, ArticleReaderException + { + this.enteringTime = startTime; + return super.readHeader(); + } - if (task != null) { - if (task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST - || task.getTaskType() == TaskTypes.TASK_FULL) { + /** + * Reads a single revision from an article. + * + * @return Revision + * @throws IOException + * if an error occurs while reading from the input + * @throws ArticleReaderException + * if an error occurs while parsing the input + */ + @Override + protected Revision readRevision() throws IOException, ArticleReaderException + { + + Revision rev = super.readRevision(); + this.readRevisionCounter++; + return rev; + } - lastTaskCompleted = true; + /** + * Determines whether another task is available or not. + *
+ * This method has to be called before calling the next() method. + * + * @return TRUE | FALSE + * @throws ArticleReaderException + * if the parsing of the input fails + */ + @Override + public boolean hasNext() throws ArticleReaderException + { + + if (super.hasNext()) { + + if (lastTaskCompleted) { + this.taskStartPosition = this.getBytePosition(); + this.processingTimeRead = 0; + this.readRevisionCounter = 0; + this.lastTaskCompleted = false; + } + + return true; + } + return false; + } - ArticleInformation info = task.getHeader(); - info.setEnteringTime(enteringTime); - info.setOriginalSize(this.getBytePosition() - taskStartPosition); - info.setProcessingTimeRead(processingTimeRead); - info.setReadRevisionCounter(readRevisionCounter); - - } else { - lastTaskCompleted = false; - } - } else { - lastTaskCompleted = true; + /** + * Returns the next RevisionTask. + * + * @return RevisionTask. + * @throws ArticleReaderException + * if the parsing of the input fails + */ + @Override + public Task next() throws ArticleReaderException + { + this.startTime = System.currentTimeMillis(); + + Task task = super.next(); + + processingTimeRead += System.currentTimeMillis() - startTime; + + if (task != null) { + if (task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST + || task.getTaskType() == TaskTypes.TASK_FULL) { + + lastTaskCompleted = true; + + ArticleInformation info = task.getHeader(); + info.setEnteringTime(enteringTime); + info.setOriginalSize(this.getBytePosition() - taskStartPosition); + info.setProcessingTimeRead(processingTimeRead); + info.setReadRevisionCounter(readRevisionCounter); + + } + else { + lastTaskCompleted = false; + } + } + else { + lastTaskCompleted = true; + } + + return task; } - return task; - } - - /** - * Resets the task processing status of the ArticleReader. - *
- * This method has to be called if the hasNext() or next() methods throw an - * exception. - */ - @Override - public void resetTaskCompleted() { - lastTaskCompleted = true; - super.resetTaskCompleted(); - } + /** + * Resets the task processing status of the ArticleReader. + *
+ * This method has to be called if the hasNext() or next() methods throw an exception. + */ + @Override + public void resetTaskCompleted() + { + lastTaskCompleted = true; + super.resetTaskCompleted(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/WikipediaXMLReader.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/WikipediaXMLReader.java index 6d39083a..7399c7f5 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/WikipediaXMLReader.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/article/reader/WikipediaXMLReader.java @@ -50,639 +50,668 @@ /** * This class parses the wikipedia xml format. */ -public class WikipediaXMLReader implements ArticleReaderInterface { - - /** - * Reference to the reader - */ - private Reader input; - - /** - * Current position in the xml content - */ - private long bytePosition; - - /** - * Reference to the xml keyword tree - */ - private SingleKeywordTree keywords; - - /** - * Configuration parameter - Maximum size of a revision task - */ - private final long LIMIT_TASK_SIZE_REVISIONS; - - /** - * Reference to the article filter - */ - private ArticleFilter articleFilter; - - /** - * Creates a new WikipediaXMLReader. - * - * @throws ConfigurationException if an error occurred while accessing the configuration - */ - private WikipediaXMLReader() - throws ConfigurationException { - - this.bytePosition = 0; - - this.taskHeader = null; - this.lastTaskCompleted = true; - - ConfigurationManager config = ConfigurationManager.getInstance(); - - LIMIT_TASK_SIZE_REVISIONS = (Long) config.getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS); - - initXMLKeys(); - - } - - /** - * Creates a new WikipediaXMLReader. - * - * @param input Reference to the reader - * @throws ConfigurationException if an error occurred while accessing the configuration - */ - public WikipediaXMLReader(final Reader input) - throws ConfigurationException { - - this(); - this.articleFilter = null; - this.input = input; - initNamespaces(); - } - - /** - * Creates a new WikipediaXMLReader. - * - * @param input Reference to the reader - * @param articleNameChecker Reference to a name checker - * @throws ConfigurationException if an error occurred while accessing the configuration - */ - public WikipediaXMLReader(final Reader input, - final ArticleFilter articleNameChecker) - throws ConfigurationException { - - this(); - this.articleFilter = articleNameChecker; - this.input = input; - initNamespaces(); - - } - - /** - * Creates and initializes the xml keyword tree. 
- */ - private void initXMLKeys() { - this.keywords = new SingleKeywordTree<>(); - - keywords.addKeyword(WikipediaXMLKeys.KEY_START_PAGE.getKeyword(), - WikipediaXMLKeys.KEY_START_PAGE); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_PAGE.getKeyword(), - WikipediaXMLKeys.KEY_END_PAGE); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_TITLE.getKeyword(), - WikipediaXMLKeys.KEY_START_TITLE); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_TITLE.getKeyword(), - WikipediaXMLKeys.KEY_END_TITLE); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_ID.getKeyword(), - WikipediaXMLKeys.KEY_START_ID); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_ID.getKeyword(), - WikipediaXMLKeys.KEY_END_ID); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_REVISION.getKeyword(), - WikipediaXMLKeys.KEY_START_REVISION); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_REVISION.getKeyword(), - WikipediaXMLKeys.KEY_END_REVISION); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword(), - WikipediaXMLKeys.KEY_START_TIMESTAMP); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword(), - WikipediaXMLKeys.KEY_END_TIMESTAMP); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_TEXT.getKeyword(), - WikipediaXMLKeys.KEY_START_TEXT); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_TEXT.getKeyword(), - WikipediaXMLKeys.KEY_END_TEXT); - keywords.addKeyword(WikipediaXMLKeys.KEY_MINOR_FLAG.getKeyword(), - WikipediaXMLKeys.KEY_MINOR_FLAG); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_COMMENT.getKeyword(), - WikipediaXMLKeys.KEY_START_COMMENT); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_COMMENT.getKeyword(), - WikipediaXMLKeys.KEY_END_COMMENT); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_IP.getKeyword(), - WikipediaXMLKeys.KEY_START_IP); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_IP.getKeyword(), - WikipediaXMLKeys.KEY_END_IP); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_USERNAME.getKeyword(), - WikipediaXMLKeys.KEY_START_USERNAME); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_USERNAME.getKeyword(), - WikipediaXMLKeys.KEY_END_USERNAME); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_CONTRIBUTOR.getKeyword(), - WikipediaXMLKeys.KEY_START_CONTRIBUTOR); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_CONTRIBUTOR.getKeyword(), - WikipediaXMLKeys.KEY_END_CONTRIBUTOR); - keywords.addKeyword(WikipediaXMLKeys.KEY_START_NAMESPACES.getKeyword(), - WikipediaXMLKeys.KEY_START_NAMESPACES); - keywords.addKeyword(WikipediaXMLKeys.KEY_END_NAMESPACES.getKeyword(), - WikipediaXMLKeys.KEY_END_NAMESPACES); - } - - /** - * Reads the namespaces from the siteinfo section and processes them - * in order to initialize the ArticleFilter - */ - private void initNamespaces() { - Map namespaceMap = new HashMap<>(); - try { - int b = read(); - - this.keywords.reset(); - StringBuilder buffer = null; - - while (b != -1) { -// System.out.print((char)b); - - if (buffer != null) { - buffer.append((char) b); - } +public class WikipediaXMLReader + implements ArticleReaderInterface +{ - if (this.keywords.check((char) b)) { - switch (this.keywords.getValue()) { + /** + * Reference to the reader + */ + private Reader input; - case KEY_START_NAMESPACES: - buffer = new StringBuilder(WikipediaXMLKeys.KEY_START_NAMESPACES.getKeyword()); - break; + /** + * Current position in the xml content + */ + private long bytePosition; - case KEY_END_NAMESPACES: - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - factory.setIgnoringElementContentWhitespace(true); - Document namespaces = 
factory.newDocumentBuilder().parse(new InputSource(new StringReader(buffer.toString()))); + /** + * Reference to the xml keyword tree + */ + private SingleKeywordTree keywords; + /** + * Configuration parameter - Maximum size of a revision task + */ + private final long LIMIT_TASK_SIZE_REVISIONS; - NodeList nsList = namespaces.getChildNodes().item(0).getChildNodes(); + /** + * Reference to the article filter + */ + private ArticleFilter articleFilter; - for (int i = 0; i < nsList.getLength(); i++) { - Node curNamespace = nsList.item(i); + /** + * Creates a new WikipediaXMLReader. + * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + */ + private WikipediaXMLReader() throws ConfigurationException + { - //get the prefix for the current namespace - String prefix = curNamespace.getTextContent().trim(); - if (!prefix.isEmpty()) { - NamedNodeMap nsAttributes = curNamespace.getAttributes(); - String namespace = nsAttributes.getNamedItem("key").getTextContent(); - namespaceMap.put(Integer.parseInt(namespace), prefix); - } - } + this.bytePosition = 0; - articleFilter.initializeNamespaces(namespaceMap); - return; //init done + this.taskHeader = null; + this.lastTaskCompleted = true; - } + ConfigurationManager config = ConfigurationManager.getInstance(); - this.keywords.reset(); - } + LIMIT_TASK_SIZE_REVISIONS = (Long) config + .getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_REVISIONS); + + initXMLKeys(); - b = read(); - } - } catch (IOException e) { - System.err.println("Error reading namespaces from xml dump."); - } catch (ParserConfigurationException | SAXException e) { - System.err.println("Error parsing namespace data."); } - } - - /** - * Reads a single byte - * - * @return integer value of the byte or -1 if the end of the stream was - * reached - * @throws IOException if an error occurs while reading the input - */ - private int read() throws IOException { - this.bytePosition++; - return input.read(); - } - - /** - * Temporary variable - reference to the article information - */ - private ArticleInformation taskHeader; - - /** - * Temporary variable - Flag which indicates that the last task was - * completed - */ - private boolean lastTaskCompleted; - - /** - * Temporary variable - Task part counter - */ - private int taskPartCounter; - - /** - * Temporary variable - Task revision counter - */ - private int taskRevisionCounter; - - /** - * Determines whether another task is available or not. - *
- * This method has to be called before calling the next() method. - * - * @return TRUE | FALSE - * @throws ArticleReaderException if the parsing of the input fails - */ - public boolean hasNext() throws ArticleReaderException { - - try { - if (!this.lastTaskCompleted) { - return true; - } - - this.keywords.reset(); - - int b = read(); - while (b != -1) { - - if (keywords.check((char) b)) { - switch (keywords.getValue()) { - case KEY_START_PAGE: - // taskStartPosition = bytePosition; - return true; - } - keywords.reset(); - } - b = read(); - } + /** + * Creates a new WikipediaXMLReader. + * + * @param input + * Reference to the reader + * @throws ConfigurationException + * if an error occurred while accessing the configuration + */ + public WikipediaXMLReader(final Reader input) throws ConfigurationException + { + + this(); + this.articleFilter = null; + this.input = input; + initNamespaces(); + } - return false; + /** + * Creates a new WikipediaXMLReader. + * + * @param input + * Reference to the reader + * @param articleNameChecker + * Reference to a name checker + * @throws ConfigurationException + * if an error occurred while accessing the configuration + */ + public WikipediaXMLReader(final Reader input, final ArticleFilter articleNameChecker) + throws ConfigurationException + { + + this(); + this.articleFilter = articleNameChecker; + this.input = input; + initNamespaces(); - } catch (Exception e) { - throw new ArticleReaderException(e); } - } - - /** - * Reads the header of an article. - * - * @return FALSE if the article was not accepted by the articleFilter - * TRUE if no name checker was used, or if the articleFilter - * accepted the ArticleName - * @throws IOException if an error occurs while reading from the input - * @throws ArticleReaderException if an error occurs while parsing the input - */ - protected boolean readHeader() throws IOException, ArticleReaderException { - - this.taskHeader = new ArticleInformation(); - - int size, r = read(); - StringBuilder buffer = null; - - while (r != -1) { - - if (buffer != null) { - buffer.append((char) r); - } - - if (this.keywords.check((char) r)) { - switch (this.keywords.getValue()) { - - case KEY_START_TITLE: - case KEY_START_ID: - buffer = new StringBuilder(); - break; - - case KEY_END_TITLE: - size = buffer.length(); - buffer.delete(size - - WikipediaXMLKeys.KEY_END_TITLE.getKeyword() - .length(), size); - - this.taskHeader.setArticleName(buffer.toString()); - if (this.articleFilter != null) { - if (!this.articleFilter - .checkArticle(this.taskHeader.getArticleName())) { - return false; - } - } - - buffer = null; - break; - case KEY_END_ID: - size = buffer.length(); - buffer.delete( - size - - WikipediaXMLKeys.KEY_END_ID.getKeyword() - .length(), size); + /** + * Creates and initializes the xml keyword tree. 
+ */ + private void initXMLKeys() + { + this.keywords = new SingleKeywordTree<>(); + + keywords.addKeyword(WikipediaXMLKeys.KEY_START_PAGE.getKeyword(), + WikipediaXMLKeys.KEY_START_PAGE); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_PAGE.getKeyword(), + WikipediaXMLKeys.KEY_END_PAGE); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_TITLE.getKeyword(), + WikipediaXMLKeys.KEY_START_TITLE); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_TITLE.getKeyword(), + WikipediaXMLKeys.KEY_END_TITLE); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_ID.getKeyword(), + WikipediaXMLKeys.KEY_START_ID); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_ID.getKeyword(), WikipediaXMLKeys.KEY_END_ID); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_REVISION.getKeyword(), + WikipediaXMLKeys.KEY_START_REVISION); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_REVISION.getKeyword(), + WikipediaXMLKeys.KEY_END_REVISION); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_TIMESTAMP.getKeyword(), + WikipediaXMLKeys.KEY_START_TIMESTAMP); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword(), + WikipediaXMLKeys.KEY_END_TIMESTAMP); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_TEXT.getKeyword(), + WikipediaXMLKeys.KEY_START_TEXT); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_TEXT.getKeyword(), + WikipediaXMLKeys.KEY_END_TEXT); + keywords.addKeyword(WikipediaXMLKeys.KEY_MINOR_FLAG.getKeyword(), + WikipediaXMLKeys.KEY_MINOR_FLAG); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_COMMENT.getKeyword(), + WikipediaXMLKeys.KEY_START_COMMENT); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_COMMENT.getKeyword(), + WikipediaXMLKeys.KEY_END_COMMENT); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_IP.getKeyword(), + WikipediaXMLKeys.KEY_START_IP); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_IP.getKeyword(), WikipediaXMLKeys.KEY_END_IP); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_USERNAME.getKeyword(), + WikipediaXMLKeys.KEY_START_USERNAME); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_USERNAME.getKeyword(), + WikipediaXMLKeys.KEY_END_USERNAME); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_CONTRIBUTOR.getKeyword(), + WikipediaXMLKeys.KEY_START_CONTRIBUTOR); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_CONTRIBUTOR.getKeyword(), + WikipediaXMLKeys.KEY_END_CONTRIBUTOR); + keywords.addKeyword(WikipediaXMLKeys.KEY_START_NAMESPACES.getKeyword(), + WikipediaXMLKeys.KEY_START_NAMESPACES); + keywords.addKeyword(WikipediaXMLKeys.KEY_END_NAMESPACES.getKeyword(), + WikipediaXMLKeys.KEY_END_NAMESPACES); + } - this.taskHeader.setArticleId(Integer.parseInt(buffer - .toString())); - buffer = null; - break; + /** + * Reads the namespaces from the siteinfo section and processes them in order to initialize the + * ArticleFilter + */ + private void initNamespaces() + { + Map namespaceMap = new HashMap<>(); + try { + int b = read(); - case KEY_START_REVISION: this.keywords.reset(); - return true; + StringBuilder buffer = null; - default: - throw ErrorFactory.createArticleReaderException( - ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_KEYWORD); - } + while (b != -1) { + // System.out.print((char)b); - this.keywords.reset(); - } + if (buffer != null) { + buffer.append((char) b); + } + + if (this.keywords.check((char) b)) { + switch (this.keywords.getValue()) { + + case KEY_START_NAMESPACES: + buffer = new StringBuilder( + WikipediaXMLKeys.KEY_START_NAMESPACES.getKeyword()); + break; + + case KEY_END_NAMESPACES: + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + 
factory.setIgnoringElementContentWhitespace(true); + Document namespaces = factory.newDocumentBuilder() + .parse(new InputSource(new StringReader(buffer.toString()))); + + NodeList nsList = namespaces.getChildNodes().item(0).getChildNodes(); + + for (int i = 0; i < nsList.getLength(); i++) { + Node curNamespace = nsList.item(i); + + // get the prefix for the current namespace + String prefix = curNamespace.getTextContent().trim(); + if (!prefix.isEmpty()) { + NamedNodeMap nsAttributes = curNamespace.getAttributes(); + String namespace = nsAttributes.getNamedItem("key") + .getTextContent(); + namespaceMap.put(Integer.parseInt(namespace), prefix); + } + } + + articleFilter.initializeNamespaces(namespaceMap); + return; // init done + + } - r = read(); + this.keywords.reset(); + } + + b = read(); + } + } + catch (IOException e) { + System.err.println("Error reading namespaces from xml dump."); + } + catch (ParserConfigurationException | SAXException e) { + System.err.println("Error parsing namespace data."); + } } - throw ErrorFactory.createArticleReaderException( - ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE); - } + /** + * Reads a single byte + * + * @return integer value of the byte or -1 if the end of the stream was reached + * @throws IOException + * if an error occurs while reading the input + */ + private int read() throws IOException + { + this.bytePosition++; + return input.read(); + } - /** - * Reads a single revision from an article. - * - * @return Revision - * @throws IOException if an error occurs while reading from the input - * @throws ArticleReaderException if an error occurs while parsing the input - */ - protected Revision readRevision() throws IOException, ArticleReaderException { + /** + * Temporary variable - reference to the article information + */ + private ArticleInformation taskHeader; + + /** + * Temporary variable - Flag which indicates that the last task was completed + */ + private boolean lastTaskCompleted; + + /** + * Temporary variable - Task part counter + */ + private int taskPartCounter; + + /** + * Temporary variable - Task revision counter + */ + private int taskRevisionCounter; + + /** + * Determines whether another task is available or not. + *

+ * This method has to be called before calling the next() method. + * + * @return TRUE | FALSE + * @throws ArticleReaderException + * if the parsing of the input fails + */ + public boolean hasNext() throws ArticleReaderException + { + + try { + if (!this.lastTaskCompleted) { + return true; + } - this.taskRevisionCounter++; - Revision revision = new Revision(this.taskRevisionCounter); + this.keywords.reset(); - int size, r = read(); - boolean hasId = false; + int b = read(); + while (b != -1) { - StringBuilder buffer = null; - this.keywords.reset(); + if (keywords.check((char) b)) { + switch (keywords.getValue()) { + case KEY_START_PAGE: + // taskStartPosition = bytePosition; + return true; + } + keywords.reset(); + } - while (r != -1) { + b = read(); + } - if (buffer != null) { - buffer.append((char) r); - } + return false; - if (this.keywords.check((char) r)) { - switch (this.keywords.getValue()) { + } + catch (Exception e) { + throw new ArticleReaderException(e); + } + } - case KEY_START_TEXT: + /** + * Reads the header of an article. + * + * @return FALSE if the article was not accepted by the articleFilter TRUE if no name checker + * was used, or if the articleFilter accepted the ArticleName + * @throws IOException + * if an error occurs while reading from the input + * @throws ArticleReaderException + * if an error occurs while parsing the input + */ + protected boolean readHeader() throws IOException, ArticleReaderException + { - case KEY_START_TIMESTAMP: + this.taskHeader = new ArticleInformation(); - case KEY_START_COMMENT: + int size, r = read(); + StringBuilder buffer = null; - case KEY_START_CONTRIBUTOR: - buffer = new StringBuilder(); - break; + while (r != -1) { - case KEY_START_ID: - if (!hasId) { - buffer = new StringBuilder(); + if (buffer != null) { + buffer.append((char) r); } - break; - case KEY_END_ID: - if (!hasId) { - size = buffer.length(); - buffer.delete(size - WikipediaXMLKeys.KEY_END_ID.getKeyword().length(), size); + if (this.keywords.check((char) r)) { + switch (this.keywords.getValue()) { + + case KEY_START_TITLE: + case KEY_START_ID: + buffer = new StringBuilder(); + break; + + case KEY_END_TITLE: + size = buffer.length(); + buffer.delete(size - WikipediaXMLKeys.KEY_END_TITLE.getKeyword().length(), + size); + + this.taskHeader.setArticleName(buffer.toString()); + if (this.articleFilter != null) { + if (!this.articleFilter.checkArticle(this.taskHeader.getArticleName())) { + return false; + } + } + + buffer = null; + break; + + case KEY_END_ID: + size = buffer.length(); + buffer.delete(size - WikipediaXMLKeys.KEY_END_ID.getKeyword().length(), size); + + this.taskHeader.setArticleId(Integer.parseInt(buffer.toString())); + buffer = null; + break; - revision.setRevisionID(Integer.parseInt(buffer.toString())); - buffer = null; + case KEY_START_REVISION: + this.keywords.reset(); + return true; - hasId = true; + default: + throw ErrorFactory.createArticleReaderException( + ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_KEYWORD); + } + + this.keywords.reset(); } - break; - - case KEY_END_TIMESTAMP: - size = buffer.length(); - buffer.delete(size - WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword().length(), size); - - revision.setTimeStamp(buffer.toString()); - buffer = null; - break; - - case KEY_END_TEXT: - size = buffer.length(); - buffer.delete(size - WikipediaXMLKeys.KEY_END_TEXT.getKeyword().length(), size); - - revision.setRevisionText(buffer.toString()); - buffer = null; - break; - - case KEY_END_COMMENT: - size = buffer.length(); - 
buffer.delete(size - WikipediaXMLKeys.KEY_END_COMMENT.getKeyword().length(), size); - //escape comment string - revision.setComment(SQLEscape.escape(buffer.toString())); - buffer = null; - break; - - case KEY_END_CONTRIBUTOR: - size = buffer.length(); - buffer.delete(size - WikipediaXMLKeys.KEY_END_CONTRIBUTOR.getKeyword().length(), size); - //escape id string - readContributor(revision, buffer.toString()); - buffer = null; - break; - - case KEY_MINOR_FLAG: - revision.setMinor(true); - buffer = null; - break; - - case KEY_END_REVISION: - this.keywords.reset(); - return revision; - - //the following cases are handled in readContributor() - //they can be skipped here - case KEY_START_IP: - case KEY_END_IP: - case KEY_START_USERNAME: - case KEY_END_USERNAME: - break; - - default: - System.out.println(keywords.getValue()); - throw ErrorFactory.createArticleReaderException( - ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_KEYWORD); + + r = read(); } + throw ErrorFactory.createArticleReaderException( + ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE); + } + + /** + * Reads a single revision from an article. + * + * @return Revision + * @throws IOException + * if an error occurs while reading from the input + * @throws ArticleReaderException + * if an error occurs while parsing the input + */ + protected Revision readRevision() throws IOException, ArticleReaderException + { + + this.taskRevisionCounter++; + Revision revision = new Revision(this.taskRevisionCounter); + + int size, r = read(); + boolean hasId = false; + + StringBuilder buffer = null; this.keywords.reset(); - } - r = read(); + while (r != -1) { + + if (buffer != null) { + buffer.append((char) r); + } + + if (this.keywords.check((char) r)) { + switch (this.keywords.getValue()) { + + case KEY_START_TEXT: + + case KEY_START_TIMESTAMP: + + case KEY_START_COMMENT: + + case KEY_START_CONTRIBUTOR: + buffer = new StringBuilder(); + break; + + case KEY_START_ID: + if (!hasId) { + buffer = new StringBuilder(); + } + break; + + case KEY_END_ID: + if (!hasId) { + size = buffer.length(); + buffer.delete(size - WikipediaXMLKeys.KEY_END_ID.getKeyword().length(), + size); + + revision.setRevisionID(Integer.parseInt(buffer.toString())); + buffer = null; + + hasId = true; + } + break; + + case KEY_END_TIMESTAMP: + size = buffer.length(); + buffer.delete(size - WikipediaXMLKeys.KEY_END_TIMESTAMP.getKeyword().length(), + size); + + revision.setTimeStamp(buffer.toString()); + buffer = null; + break; + + case KEY_END_TEXT: + size = buffer.length(); + buffer.delete(size - WikipediaXMLKeys.KEY_END_TEXT.getKeyword().length(), size); + + revision.setRevisionText(buffer.toString()); + buffer = null; + break; + + case KEY_END_COMMENT: + size = buffer.length(); + buffer.delete(size - WikipediaXMLKeys.KEY_END_COMMENT.getKeyword().length(), + size); + // escape comment string + revision.setComment(SQLEscape.escape(buffer.toString())); + buffer = null; + break; + + case KEY_END_CONTRIBUTOR: + size = buffer.length(); + buffer.delete(size - WikipediaXMLKeys.KEY_END_CONTRIBUTOR.getKeyword().length(), + size); + // escape id string + readContributor(revision, buffer.toString()); + buffer = null; + break; + + case KEY_MINOR_FLAG: + revision.setMinor(true); + buffer = null; + break; + + case KEY_END_REVISION: + this.keywords.reset(); + return revision; + + // the following cases are handled in readContributor() + // they can be skipped here + case KEY_START_IP: + case KEY_END_IP: + case KEY_START_USERNAME: + case 
KEY_END_USERNAME: + break; + + default: + System.out.println(keywords.getValue()); + throw ErrorFactory.createArticleReaderException( + ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_KEYWORD); + } + + this.keywords.reset(); + } + + r = read(); + } + + throw ErrorFactory.createArticleReaderException( + ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE); } - throw ErrorFactory.createArticleReaderException( - ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE); - } - - /** - * Parses the content within the contributor tags and adds the - * parsed info to the provided revision object. - * - * @param rev the revision object to store the parsed info in - * @param str the contributor data to be parsed - */ - protected void readContributor(Revision rev, String str) { - char[] contrChars = str.toCharArray(); - int size; - - StringBuilder buffer = null; - this.keywords.reset(); - - for (char curChar : contrChars) { - - if (buffer != null) { - buffer.append(curChar); - } - - if (this.keywords.check(curChar)) { - - switch (this.keywords.getValue()) { - - case KEY_START_ID: - case KEY_START_IP: - case KEY_START_USERNAME: - buffer = new StringBuilder(); - break; - - case KEY_END_IP: - size = buffer.length(); - buffer.delete(size - WikipediaXMLKeys.KEY_END_IP.getKeyword().length(), size); - // escape id string - rev.setContributorName(SQLEscape.escape(buffer.toString())); - rev.setContributorIsRegistered(false); - buffer = null; - break; - - case KEY_END_USERNAME: - size = buffer.length(); - buffer.delete(size - WikipediaXMLKeys.KEY_END_USERNAME.getKeyword().length(), size); - // escape id string - rev.setContributorName(SQLEscape.escape(buffer.toString())); - rev.setContributorIsRegistered(true); - buffer = null; - break; - - case KEY_END_ID: - size = buffer.length(); - buffer.delete(size - WikipediaXMLKeys.KEY_END_ID.getKeyword().length(), size); - String id = buffer.toString(); - if (!id.isEmpty()) { - rev.setContributorId(Integer.parseInt(buffer.toString())); + /** + * Parses the content within the contributor tags and adds the parsed info to the provided + * revision object. 
+ * + * @param rev + * the revision object to store the parsed info in + * @param str + * the contributor data to be parsed + */ + protected void readContributor(Revision rev, String str) + { + char[] contrChars = str.toCharArray(); + int size; + + StringBuilder buffer = null; + this.keywords.reset(); + + for (char curChar : contrChars) { + + if (buffer != null) { + buffer.append(curChar); + } + + if (this.keywords.check(curChar)) { + + switch (this.keywords.getValue()) { + + case KEY_START_ID: + case KEY_START_IP: + case KEY_START_USERNAME: + buffer = new StringBuilder(); + break; + + case KEY_END_IP: + size = buffer.length(); + buffer.delete(size - WikipediaXMLKeys.KEY_END_IP.getKeyword().length(), size); + // escape id string + rev.setContributorName(SQLEscape.escape(buffer.toString())); + rev.setContributorIsRegistered(false); + buffer = null; + break; + + case KEY_END_USERNAME: + size = buffer.length(); + buffer.delete(size - WikipediaXMLKeys.KEY_END_USERNAME.getKeyword().length(), + size); + // escape id string + rev.setContributorName(SQLEscape.escape(buffer.toString())); + rev.setContributorIsRegistered(true); + buffer = null; + break; + + case KEY_END_ID: + size = buffer.length(); + buffer.delete(size - WikipediaXMLKeys.KEY_END_ID.getKeyword().length(), size); + String id = buffer.toString(); + if (!id.isEmpty()) { + rev.setContributorId(Integer.parseInt(buffer.toString())); + } + buffer = null; + break; + } } - buffer = null; - break; } - } } - } - /** - * Returns the next RevisionTask. - * - * @return RevisionTask. - * @throws ArticleReaderException if the parsing of the input fails - */ - @Override - public Task next() throws ArticleReaderException { + /** + * Returns the next RevisionTask. + * + * @return RevisionTask. + * @throws ArticleReaderException + * if the parsing of the input fails + */ + @Override + public Task next() throws ArticleReaderException + { + + try { + this.keywords.reset(); - try { - this.keywords.reset(); + // if new article read header, otherwise use old one + if (this.lastTaskCompleted) { + this.lastTaskCompleted = false; - // if new article read header, otherwise use old one - if (this.lastTaskCompleted) { - this.lastTaskCompleted = false; + this.taskPartCounter = 1; + this.taskRevisionCounter = -1; - this.taskPartCounter = 1; - this.taskRevisionCounter = -1; + if (!readHeader()) { - if (!readHeader()) { + this.lastTaskCompleted = true; + return null; - this.lastTaskCompleted = true; - return null; + } + } + else { + this.taskPartCounter++; + } - } - } else { - this.taskPartCounter++; - } + Task task = new Task<>(this.taskHeader, this.taskPartCounter); + task.add(readRevision()); - Task task = new Task<>(this.taskHeader, - this.taskPartCounter); - task.add(readRevision()); + int r = read(); + while (r != -1) { + if (this.keywords.check((char) r)) { - int r = read(); - while (r != -1) { - if (this.keywords.check((char) r)) { + switch (this.keywords.getValue()) { - switch (this.keywords.getValue()) { + case KEY_START_REVISION: - case KEY_START_REVISION: + if (task.byteSize() >= LIMIT_TASK_SIZE_REVISIONS) { + this.lastTaskCompleted = false; - if (task.byteSize() >= LIMIT_TASK_SIZE_REVISIONS) { - this.lastTaskCompleted = false; + if (this.taskPartCounter == 1) { + task.setTaskType(TaskTypes.TASK_PARTIAL_FIRST); + } + else { + task.setTaskType(TaskTypes.TASK_PARTIAL); + } - if (this.taskPartCounter == 1) { - task.setTaskType(TaskTypes.TASK_PARTIAL_FIRST); - } else { - task.setTaskType(TaskTypes.TASK_PARTIAL); - } + return task; + } - return task; - } + 
task.add(readRevision()); + break; - task.add(readRevision()); - break; + case KEY_END_PAGE: - case KEY_END_PAGE: + this.lastTaskCompleted = true; + if (this.taskPartCounter > 1) { + task.setTaskType(TaskTypes.TASK_PARTIAL_LAST); + } - this.lastTaskCompleted = true; - if (this.taskPartCounter > 1) { - task.setTaskType(TaskTypes.TASK_PARTIAL_LAST); - } + return task; - return task; + default: + throw new IOException(); + } - default: - throw new IOException(); - } + this.keywords.reset(); + } - this.keywords.reset(); - } + r = read(); + } + + throw ErrorFactory.createArticleReaderException( + ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE); - r = read(); - } + } + catch (ArticleReaderException e) { + throw e; + } + catch (Exception e) { + throw new ArticleReaderException(e); + } + } - throw ErrorFactory.createArticleReaderException( - ErrorKeys.DELTA_CONSUMERS_TASK_READER_WIKIPEDIAXMLREADER_UNEXPECTED_END_OF_FILE); + /** + * Resets the task processing status of the ArticleReader. + *

+ * This method has to be called if the hasNext() or next() methods throw an exception. + */ + @Override + public void resetTaskCompleted() + { + this.lastTaskCompleted = true; + } - } catch (ArticleReaderException e) { - throw e; - } catch (Exception e) { - throw new ArticleReaderException(e); + /** + * Returns the number of bytes that the ArticleReader has processed. + * + * @return number of bytes (current position in the file / archive) + */ + @Override + public long getBytePosition() + { + return this.bytePosition; } - } - - /** - * Resets the task processing status of the ArticleReader. - *

- * This method has to be called if the hasNext() or next() methods throw an - * exception. - */ - @Override - public void resetTaskCompleted() { - this.lastTaskCompleted = true; - } - - /** - * Returns the number of bytes that the ArticleReader has processed. - * - * @return number of bytes (current position in the file / archive) - */ - @Override - public long getBytePosition() { - return this.bytePosition; - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/DiffCalculatorInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/DiffCalculatorInterface.java index 7c2c571e..0d991508 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/DiffCalculatorInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/DiffCalculatorInterface.java @@ -27,42 +27,44 @@ import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.Task; /** - * The DiffCalculatorInterface represents the interface to the diff processing - * unit. + * The DiffCalculatorInterface represents the interface to the diff processing unit. *

- * Please notice that there is no default method to return the generated diff. - * The current implementation uses the TaskTransmitterInterface (given as - * parameter of the constructor) to send the diffed data to the DiffProducer. + * Please notice that there is no default method to return the generated diff. The current + * implementation uses the TaskTransmitterInterface (given as parameter of the constructor) to send + * the diffed data to the DiffProducer. */ -public interface DiffCalculatorInterface { +public interface DiffCalculatorInterface +{ - /** - * This method process the given task to generate the diff. - * - * @param task RevisionTask - * @throws DiffException if the diff process fails - * @throws TimeoutException if the TaskTransmitter times out during the transmission of - * the task to the DiffProducer. - * @throws UnsupportedEncodingException if the CharacterSet defined in the configuration is not - * supported by JAVA. - */ - void process(final Task task) throws DiffException, TimeoutException, UnsupportedEncodingException; + /** + * This method process the given task to generate the diff. + * + * @param task + * RevisionTask + * @throws DiffException + * if the diff process fails + * @throws TimeoutException + * if the TaskTransmitter times out during the transmission of the task to the + * DiffProducer. + * @throws UnsupportedEncodingException + * if the CharacterSet defined in the configuration is not supported by JAVA. + */ + void process(final Task task) + throws DiffException, TimeoutException, UnsupportedEncodingException; - /** - * This method is used to delete all information concerning the partial task - * processing. - *

- * This method has to be called if the process method throws an exception. - */ - void reset(); + /** + * This method is used to delete all information concerning the partial task processing. + *

+ * This method has to be called if the process method throws an exception. + */ + void reset(); - - /** - * Close Stream of Transmitter - * - * @throws IOException - * @throws SQLException - */ - void closeTransmitter() throws IOException, SQLException; + /** + * Close Stream of Transmitter + * + * @throws IOException + * @throws SQLException + */ + void closeTransmitter() throws IOException, SQLException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/TaskTransmitterInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/TaskTransmitterInterface.java index 191f6f52..3e9f8779 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/TaskTransmitterInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/TaskTransmitterInterface.java @@ -25,40 +25,39 @@ import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.Diff; /** - * The TaskTransmitterInterface handles the transmission of DiffTasks to the - * DiffProducer. + * The TaskTransmitterInterface handles the transmission of DiffTasks to the DiffProducer. */ -public interface TaskTransmitterInterface { - - /** - * Sends the given task to the DiffProducer - FullTaskPool. - * - * @param result DiffTask of type TaskTypes.FULL_TASK or - * TaskTypes.PARTIAL_TASK_FIRST - * @throws TimeoutException if the TaskTransmitter times out during the transmission of - * the task to the DiffProducer. - */ - void transmitDiff(final Task result) - throws TimeoutException; - - /** - * Sends the given task to the DiffProducer - PartialTaskPool. - * - * @param result DiffTask of type TaskTypes.PARTIAL_TASK or - * TaskTypes.PARTIAL_TASK_LAST - * @throws TimeoutException if the TaskTransmitter times out during the transmission of - * the task to the DiffProducer. - */ - void transmitPartialDiff(final Task result) - throws TimeoutException; - - - /** - * Close stream - * - * @throws IOException - * @throws SQLException - */ - void close() throws IOException, SQLException; +public interface TaskTransmitterInterface +{ + + /** + * Sends the given task to the DiffProducer - FullTaskPool. + * + * @param result + * DiffTask of type TaskTypes.FULL_TASK or TaskTypes.PARTIAL_TASK_FIRST + * @throws TimeoutException + * if the TaskTransmitter times out during the transmission of the task to the + * DiffProducer. + */ + void transmitDiff(final Task result) throws TimeoutException; + + /** + * Sends the given task to the DiffProducer - PartialTaskPool. + * + * @param result + * DiffTask of type TaskTypes.PARTIAL_TASK or TaskTypes.PARTIAL_TASK_LAST + * @throws TimeoutException + * if the TaskTransmitter times out during the transmission of the task to the + * DiffProducer. 
+ */ + void transmitPartialDiff(final Task result) throws TimeoutException; + + /** + * Close stream + * + * @throws IOException + * @throws SQLException + */ + void close() throws IOException, SQLException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagement.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagement.java index f94ee67b..10376057 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagement.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagement.java @@ -31,289 +31,323 @@ import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.DiffPart; /** - * The BlockManagement class is used to calculate the diff operations using the - * blocks of the longest common substring search. + * The BlockManagement class is used to calculate the diff operations using the blocks of the + * longest common substring search. */ -public class BlockManagement implements BlockManagementInterface { +public class BlockManagement + implements BlockManagementInterface +{ + + /** + * Configuration parameter - Charset name of the input data + */ + private static String WIKIPEDIA_ENCODING; + + /** + * Temporary variable - Just in Time revision + */ + private StringBuilder version; + + /** + * Temporary variable - Diff + */ + private Diff diff; + + /** + * Temporary variable - Storage for intermediate blocks + */ + private Map bufferMap; + + /** + * Reference to the codec + */ + private RevisionCodecData codecData; + + /** + * (Constructor) Creates a BlockManagement object. + * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + */ + public BlockManagement() throws ConfigurationException + { + + ConfigurationManager config = ConfigurationManager.getInstance(); + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - /** - * Configuration parameter - Charset name of the input data - */ - private static String WIKIPEDIA_ENCODING; - - /** - * Temporary variable - Just in Time revision - */ - private StringBuilder version; - - /** - * Temporary variable - Diff - */ - private Diff diff; - - /** - * Temporary variable - Storage for intermediate blocks - */ - private Map bufferMap; - - /** - * Reference to the codec - */ - private RevisionCodecData codecData; - - /** - * (Constructor) Creates a BlockManagement object. 
- * - * @throws ConfigurationException if an error occurred while accessing the configuration - */ - public BlockManagement() throws ConfigurationException { - - ConfigurationManager config = ConfigurationManager.getInstance(); - WIKIPEDIA_ENCODING = (String) config.getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + } - } + @Override + public Diff manage(char[] revA, char[] revB, ArrayList queueA, + ArrayList queueB) + throws UnsupportedEncodingException + { - @Override - public Diff manage(char[] revA, char[] revB, ArrayList queueA, - ArrayList queueB) throws UnsupportedEncodingException { + this.diff = new Diff(); + this.codecData = new RevisionCodecData(); - this.diff = new Diff(); - this.codecData = new RevisionCodecData(); + this.bufferMap = new HashMap<>(); + this.version = new StringBuilder(); - this.bufferMap = new HashMap<>(); - this.version = new StringBuilder(); + DiffBlock curA = null, curB = null; + while (!queueA.isEmpty() || !queueB.isEmpty() || curB != null) { - DiffBlock curA = null, curB = null; - while (!queueA.isEmpty() || !queueB.isEmpty() || curB != null) { + if (!queueA.isEmpty() && curA == null) { + curA = queueA.remove(0); + } + if (!queueB.isEmpty() && curB == null) { + curB = queueB.remove(0); + } - if (!queueA.isEmpty() && curA == null) { - curA = queueA.remove(0); - } - if (!queueB.isEmpty() && curB == null) { - curB = queueB.remove(0); - } + if (curA != null && curB != null) { - if (curA != null && curB != null) { + if (curA.getId() == curB.getId()) { - if (curA.getId() == curB.getId()) { + if (curA.getId() == -1) { + replace(revA, revB, curA, curB); + } + else { + version.append(copy(revA, curA.getRevAStart(), curA.getRevAEnd())); + } - if (curA.getId() == -1) { - replace(revA, revB, curA, curB); - } else { - version.append(copy(revA, curA.getRevAStart(), - curA.getRevAEnd())); - } + curA = null; + curB = null; - curA = null; - curB = null; + } + else if (curA.getId() == -1) { - } else if (curA.getId() == -1) { + delete(curA); + curA = null; - delete(curA); - curA = null; + } + else if (curB.getId() == -1) { - } else if (curB.getId() == -1) { + insert(revB, curB); + curB = null; - insert(revB, curB); - curB = null; + } + else { - } else { + // Difference :( + if (bufferMap.containsKey(curB.getId())) { - // Difference :( - if (bufferMap.containsKey(curB.getId())) { + paste(curB); + curB = null; - paste(curB); - curB = null; + } + else { - } else { + cut(revA, curA); + curA = null; - cut(revA, curA); - curA = null; + // System.out.println("@TO CUT: " + curA.getId() + "\t<" + // + text + ">"); + } + } - // System.out.println("@TO CUT: " + curA.getId() + "\t<" - // + text + ">"); - } - } + } + else if (curA != null) { - } else if (curA != null) { + delete(curA); + curA = null; - delete(curA); - curA = null; + } + else if (curB != null) { - } else if (curB != null) { + // Difference :( + if (bufferMap.containsKey(curB.getId())) { - // Difference :( - if (bufferMap.containsKey(curB.getId())) { + paste(curB); + curB = null; - paste(curB); - curB = null; + } + else { - } else { + insert(revB, curB); + curB = null; + } - insert(revB, curB); - curB = null; + } + else { + System.err.println("INVALID CASE"); + System.exit(-1); + } } - } else { - System.err.println("INVALID CASE"); - System.exit(-1); - } - } - - diff.setCodecData(codecData); - return diff; - } - - /*-PRIVATE-METHODS----------------------------------------------------------*/ - - /** - * Copies the specified interval of characters for the array. 
- * - * @return specified interval - */ - private String copy(final char[] array, final int start, final int end) { - StringBuilder text = new StringBuilder(); - for (int j = start; j < end; j++) { - text.append(array[j]); + diff.setCodecData(codecData); + return diff; } - return text.toString(); - } - - /** - * Creates an insert operation. - * - * @param revB revision B - * @param curB Reference to the block B - * @throws UnsupportedEncodingException if the character encoding is unsupported - */ - private void insert(final char[] revB, final DiffBlock curB) throws UnsupportedEncodingException { - - String text = copy(revB, curB.getRevBStart(), curB.getRevBEnd()); - - // Insert (C S L T) - DiffPart action = new DiffPart(DiffAction.INSERT); - - // S - action.setStart(version.length()); - codecData.checkBlocksizeS(version.length()); - - // L T - action.setText(text); - codecData.checkBlocksizeL(text.getBytes(WIKIPEDIA_ENCODING).length); - - diff.add(action); + /*-PRIVATE-METHODS----------------------------------------------------------*/ + + /** + * Copies the specified interval of characters for the array. + * + * @return specified interval + */ + private String copy(final char[] array, final int start, final int end) + { + StringBuilder text = new StringBuilder(); + for (int j = start; j < end; j++) { + text.append(array[j]); + } - version.append(text); - } + return text.toString(); + } - /** - * Creates a delete operation. - * - * @param curA Reference to the block A - */ - private void delete(final DiffBlock curA) { + /** + * Creates an insert operation. + * + * @param revB + * revision B + * @param curB + * Reference to the block B + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + */ + private void insert(final char[] revB, final DiffBlock curB) throws UnsupportedEncodingException + { - // Delete (C S E) - DiffPart action = new DiffPart(DiffAction.DELETE); + String text = copy(revB, curB.getRevBStart(), curB.getRevBEnd()); - // S - action.setStart(version.length()); - codecData.checkBlocksizeS(version.length()); + // Insert (C S L T) + DiffPart action = new DiffPart(DiffAction.INSERT); - // E - action.setLength(curA.getRevAEnd() - curA.getRevAStart()); - codecData.checkBlocksizeE(action.getLength()); + // S + action.setStart(version.length()); + codecData.checkBlocksizeS(version.length()); - diff.add(action); - } + // L T + action.setText(text); + codecData.checkBlocksizeL(text.getBytes(WIKIPEDIA_ENCODING).length); - /** - * Creates a replace operation. - * - * @param revA Reference to revision A - * @param revB Reference to revision B - * @param curA Reference to current block A - * @param curB Reference to current block B - * @throws UnsupportedEncodingException if the character encoding is unsupported - */ - private void replace(final char[] revA, final char[] revB, final DiffBlock curA, final DiffBlock curB) - throws UnsupportedEncodingException { + diff.add(action); - // Replace (C S E L T) - String text = copy(revB, curB.getRevBStart(), curB.getRevBEnd()); + version.append(text); + } - DiffPart action = new DiffPart(DiffAction.REPLACE); + /** + * Creates a delete operation. 
+ * + * @param curA + * Reference to the block A + */ + private void delete(final DiffBlock curA) + { - // S - action.setStart(version.length()); - codecData.checkBlocksizeS(version.length()); + // Delete (C S E) + DiffPart action = new DiffPart(DiffAction.DELETE); - // E - action.setLength(curA.getRevAEnd() - curA.getRevAStart()); - codecData.checkBlocksizeE(action.getLength()); + // S + action.setStart(version.length()); + codecData.checkBlocksizeS(version.length()); - // L T - action.setText(text); - codecData.checkBlocksizeL(text.getBytes(WIKIPEDIA_ENCODING).length); + // E + action.setLength(curA.getRevAEnd() - curA.getRevAStart()); + codecData.checkBlocksizeE(action.getLength()); - diff.add(action); + diff.add(action); + } - version.append(text); - } + /** + * Creates a replace operation. + * + * @param revA + * Reference to revision A + * @param revB + * Reference to revision B + * @param curA + * Reference to current block A + * @param curB + * Reference to current block B + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + */ + private void replace(final char[] revA, final char[] revB, final DiffBlock curA, + final DiffBlock curB) + throws UnsupportedEncodingException + { + + // Replace (C S E L T) + String text = copy(revB, curB.getRevBStart(), curB.getRevBEnd()); + + DiffPart action = new DiffPart(DiffAction.REPLACE); + + // S + action.setStart(version.length()); + codecData.checkBlocksizeS(version.length()); + + // E + action.setLength(curA.getRevAEnd() - curA.getRevAStart()); + codecData.checkBlocksizeE(action.getLength()); + + // L T + action.setText(text); + codecData.checkBlocksizeL(text.getBytes(WIKIPEDIA_ENCODING).length); + + diff.add(action); + + version.append(text); + } - /** - * Creates a cut operation. - * - * @param revA Reference to revision A - * @param curA Reference to current block A - */ - private void cut(final char[] revA, final DiffBlock curA) { + /** + * Creates a cut operation. + * + * @param revA + * Reference to revision A + * @param curA + * Reference to current block A + */ + private void cut(final char[] revA, final DiffBlock curA) + { - String text = copy(revA, curA.getRevAStart(), curA.getRevAEnd()); + String text = copy(revA, curA.getRevAStart(), curA.getRevAEnd()); - // Cut (C S E B) - DiffPart action = new DiffPart(DiffAction.CUT); + // Cut (C S E B) + DiffPart action = new DiffPart(DiffAction.CUT); - // S - action.setStart(version.length()); - codecData.checkBlocksizeS(version.length()); + // S + action.setStart(version.length()); + codecData.checkBlocksizeS(version.length()); - // E - action.setLength(curA.getRevAEnd() - curA.getRevAStart()); - codecData.checkBlocksizeE(action.getLength()); + // E + action.setLength(curA.getRevAEnd() - curA.getRevAStart()); + codecData.checkBlocksizeE(action.getLength()); - // B - action.setText(Integer.toString(curA.getId())); - codecData.checkBlocksizeB(curA.getId()); + // B + action.setText(Integer.toString(curA.getId())); + codecData.checkBlocksizeB(curA.getId()); - diff.add(action); + diff.add(action); - bufferMap.put(curA.getId(), text); - } + bufferMap.put(curA.getId(), text); + } - /** - * Creates a paste operation. - * - * @param curB Reference to current block B - */ - private void paste(final DiffBlock curB) { + /** + * Creates a paste operation. 
+ * + * @param curB + * Reference to current block B + */ + private void paste(final DiffBlock curB) + { - String text = bufferMap.remove(curB.getId()); + String text = bufferMap.remove(curB.getId()); - // Paste (C S B) - DiffPart action = new DiffPart(DiffAction.PASTE); + // Paste (C S B) + DiffPart action = new DiffPart(DiffAction.PASTE); - // S - action.setStart(version.length()); - codecData.checkBlocksizeS(version.length()); + // S + action.setStart(version.length()); + codecData.checkBlocksizeS(version.length()); - // B - action.setText(Integer.toString(curB.getId())); - codecData.checkBlocksizeB(curB.getId()); + // B + action.setText(Integer.toString(curB.getId())); + codecData.checkBlocksizeB(curB.getId()); - diff.add(action); + diff.add(action); - version.append(text); - } + version.append(text); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagementInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagementInterface.java index 61423e85..1a1a7252 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagementInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/BlockManagementInterface.java @@ -25,19 +25,26 @@ /** * Interface of the BlockManagement */ -public interface BlockManagementInterface { +public interface BlockManagementInterface +{ - /** - * Uses the substring blocks to create the diff operations. - * - * @param revA revision A - * @param revB revision B - * @param queueA queue A - * @param queueB queue B - * @return Diff - * @throws UnsupportedEncodingException if the character encoding is unsupported - */ - Diff manage(final char[] revA, final char[] revB, final ArrayList queueA, final ArrayList queueB) - throws UnsupportedEncodingException; + /** + * Uses the substring blocks to create the diff operations. + * + * @param revA + * revision A + * @param revB + * revision B + * @param queueA + * queue A + * @param queueB + * queue B + * @return Diff + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + */ + Diff manage(final char[] revA, final char[] revB, final ArrayList queueA, + final ArrayList queueB) + throws UnsupportedEncodingException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffBlock.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffBlock.java index 566bb543..902cab8a 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffBlock.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffBlock.java @@ -20,132 +20,151 @@ /** * Contains the information for a block. Used for the Diff Calculation. 
*/ -public class DiffBlock implements Comparable { - - /** - * Block ID - */ - private final int id; - - /** - * Start position in revision A - */ - private final int revAStart; - - /** - * End position in revision A - */ - private final int revAEnd; - - /** - * Start position in revision B - */ - private final int revBStart; - - /** - * End position in revision B - */ - private final int revBEnd; - - /** - * Flag, indicating the sorting order TRUE sorting after the start position - * of revision A FALSE sorting after the start position of revision B - */ - private final boolean ab; - - /** - * (DiffBlock) Creates a new DiffBlock. - * - * @param id ID of the block - * @param revAStart start position of revision A - * @param revAEnd end position of revision A - * @param revBStart start position of revision B - * @param revBEnd end position of revision B - * @param ab sorting order flag - */ - public DiffBlock(final int id, final int revAStart, final int revAEnd, - final int revBStart, final int revBEnd, final boolean ab) { - this.id = id; - this.revAStart = revAStart; - this.revAEnd = revAEnd; - this.revBStart = revBStart; - this.revBEnd = revBEnd; - this.ab = ab; - } - - /** - * Compares the positions of both blocks. - * - * @param b Block - */ - public int compareTo(final DiffBlock b) { - if (ab) { - return this.revAStart - b.revAStart; - } else { - return this.revBStart - b.revBStart; +public class DiffBlock + implements Comparable +{ + + /** + * Block ID + */ + private final int id; + + /** + * Start position in revision A + */ + private final int revAStart; + + /** + * End position in revision A + */ + private final int revAEnd; + + /** + * Start position in revision B + */ + private final int revBStart; + + /** + * End position in revision B + */ + private final int revBEnd; + + /** + * Flag, indicating the sorting order TRUE sorting after the start position of revision A FALSE + * sorting after the start position of revision B + */ + private final boolean ab; + + /** + * (DiffBlock) Creates a new DiffBlock. + * + * @param id + * ID of the block + * @param revAStart + * start position of revision A + * @param revAEnd + * end position of revision A + * @param revBStart + * start position of revision B + * @param revBEnd + * end position of revision B + * @param ab + * sorting order flag + */ + public DiffBlock(final int id, final int revAStart, final int revAEnd, final int revBStart, + final int revBEnd, final boolean ab) + { + this.id = id; + this.revAStart = revAStart; + this.revAEnd = revAEnd; + this.revBStart = revBStart; + this.revBEnd = revBEnd; + this.ab = ab; + } + + /** + * Compares the positions of both blocks. + * + * @param b + * Block + */ + public int compareTo(final DiffBlock b) + { + if (ab) { + return this.revAStart - b.revAStart; + } + else { + return this.revBStart - b.revBStart; + } + } + + /** + * Returns whether the block is valid or not. + * + * @return TRUE if the block has a ID of the value -1 FALSE otherwise + */ + public boolean isUnknown() + { + return (id == -1); + } + + /** + * Returns the ID of this block. + * + * @return string representation + */ + public String toString() + { + return Integer.toString(id); + } + + /** + * Returns the ID of this block. + * + * @return ID of this block + */ + public int getId() + { + return id; + } + + /** + * Returns the end position of the block in revision A. 
+ * + * @return end position revision A + */ + public int getRevAEnd() + { + return revAEnd; + } + + /** + * Returns the start position of the block in revision A. + * + * @return start position revision A + */ + public int getRevAStart() + { + return revAStart; + } + + /** + * Returns the end position of the block in revision B. + * + * @return end position revision B + */ + public int getRevBEnd() + { + return revBEnd; + } + + /** + * Returns the start position of the block in revision B. + * + * @return start position revision B + */ + public int getRevBStart() + { + return revBStart; } - } - - /** - * Returns whether the block is valid or not. - * - * @return TRUE if the block has a ID of the value -1 FALSE otherwise - */ - public boolean isUnknown() { - return (id == -1); - } - - /** - * Returns the ID of this block. - * - * @return string representation - */ - public String toString() { - return Integer.toString(id); - } - - /** - * Returns the ID of this block. - * - * @return ID of this block - */ - public int getId() { - return id; - } - - /** - * Returns the end position of the block in revision A. - * - * @return end position revision A - */ - public int getRevAEnd() { - return revAEnd; - } - - /** - * Returns the start position of the block in revision A. - * - * @return start position revision A - */ - public int getRevAStart() { - return revAStart; - } - - /** - * Returns the end position of the block in revision B. - * - * @return end position revision B - */ - public int getRevBEnd() { - return revBEnd; - } - - /** - * Returns the start position of the block in revision B. - * - * @return start position revision B - */ - public int getRevBStart() { - return revBStart; - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffCalculator.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffCalculator.java index 07532457..1f133e84 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffCalculator.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/DiffCalculator.java @@ -47,681 +47,715 @@ /** * Calculates the Diff. 
*/ -public class DiffCalculator implements DiffCalculatorInterface { - - /** - * Configuration parameter - Flag, which indicates whether debug output is - * enabled or not - */ - private final boolean MODE_DEBUG_OUTPUT_ACTIVATED; - - /** - * Configuration parameter - Path for the DiffTool logger - */ - private final String LOGGING_PATH_DIFFTOOL; - - /** - * Configuration parameter - Path for the debug logger - */ - private final String LOGGING_PATH_DEBUG; - - /** - * Configuration parameter - Each x-th version is a full revision - */ - private final int COUNTER_FULL_REVISION; - - /** - * Configuration parameter - Maximum size of a diff statement - */ - private final long LIMIT_TASK_SIZE_DIFFS; - - /** - * Configuration parameter - Charset name of the input data - */ - private final String WIKIPEDIA_ENCODING; - - /** - * Configuration parameter - Flag, which indicates whether the verification - * of the diff is enabled or not - */ - private final boolean VERIFICATION_DIFF; - - /** - * Configuration parameter - Value of the minimum legal substring - */ - private final int VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING; - - /** - * Configuration parameter - Surrogate Mode - */ - private final SurrogateModes MODE_SURROGATES; - - /** - * Reference to the TransTransmitter - */ - private final TaskTransmitterInterface taskTransmitter; - - /** - * Reference to the BlockManager - */ - private final BlockManagementInterface blocks; - - @Override - public void closeTransmitter() throws IOException, SQLException { - this.taskTransmitter.close(); - } - - /** - * (Constructor) Creates a new DiffCalculator object. - * - * @param taskTransmitter Reference to the TaskTransmitter - * @throws ConfigurationException if an error occurred while accessing the configuration - */ - public DiffCalculator(final TaskTransmitterInterface taskTransmitter) throws ConfigurationException { - this.taskTransmitter = taskTransmitter; - this.blocks = new BlockManagement(); - - this.articleID = -1; - this.partCounter = 0; - - // Load config parameters - ConfigurationManager config = ConfigurationManager.getInstance(); - - MODE_DEBUG_OUTPUT_ACTIVATED = (Boolean) config.getConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT); - - LOGGING_PATH_DIFFTOOL = (String) config.getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); - LOGGING_PATH_DEBUG = (String) config.getConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG); - - COUNTER_FULL_REVISION = (Integer) config.getConfigParameter(ConfigurationKeys.COUNTER_FULL_REVISION); - LIMIT_TASK_SIZE_DIFFS = (Long) config.getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS); - WIKIPEDIA_ENCODING = (String) config.getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - VERIFICATION_DIFF = (Boolean) config.getConfigParameter(ConfigurationKeys.VERIFICATION_DIFF); - - VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING = (Integer) config - .getConfigParameter(ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING); - MODE_SURROGATES = (SurrogateModes) config - .getConfigParameter(ConfigurationKeys.MODE_SURROGATES); - } - - /*--------------------------------------------------------------------------*/ - - /** - * Temporary variable - ID of the currently processed article - */ - private int articleID; - - /** - * Temporary variable - Storage for the diffs - */ - private Task result; - - /** - * Temporary variable - Revision Counter - */ - private int revisionCounter; - - /** - * Temporary variable - Part Counter - */ - private int partCounter; - - /** - * Temporary variable - Diff Part - */ - 
private DiffPart part; - - /** - * Temporary variable - content - */ - private String text; - - /** - * Temporary variable - previous revision - */ - private char[] revPrevious; - - /** - * Temporary variable - current revision - */ - private char[] revCurrent; - - /** - * Temporary variable - temporary revision - */ - private char[] revTemp; - - /** - * Temporary variable - Block Counter - */ - private int blockCount; - - /** - * Temporary variable - Used to mark used characters of the previous - * revision - */ - private boolean[] revABlocked; - - /** - * Temporary variable - Used to mark used characters of the current revision - */ - private boolean[] revBBlocked; - - /** - * Temporary variable - Mapping of characters and their positions in the - * previous revision - */ - private HashMap> positions; - - /** - * Temporary variable - Queue for blocks of the previous revision - */ - private ArrayList queueA; - - /** - * Temporary variable - Queue for blocks of the current revision - */ - private ArrayList queueB; - - /** - * Temporary variable - size of the longest matching substring - */ - private int longestMatch_size; - - /** - * Temporary variable - start position of the longest matching substring - */ - private int longestMatch_start; - - /*--------------------------------------------------------------------------*/ - - /** - * Initializes the processing of a RevisionTask using a new DiffTask. - * - * @param task Reference to the DiffTask - */ - private void init(final Task task) { - this.partCounter++; - this.result = new Task<>(task.getHeader(), partCounter); - } - - /** - * Initializes the processing of a new RevisionTask. - * - * @param taskID Article ID - */ - protected void initNewTask(final int taskID) { - - this.articleID = taskID; - - this.partCounter = 0; - this.revisionCounter = 0; - - this.revPrevious = null; - this.revCurrent = null; - } - - /** - * Generates a FullRevision. - * - * @param revision Reference to the revision - * @return Diff, containing a FullRevision - * @throws UnsupportedEncodingException if the character encoding is unsupported - */ - private Diff generateFullRevision(final Revision revision) - throws UnsupportedEncodingException { - - Diff diff = new Diff(); - RevisionCodecData codecData = new RevisionCodecData(); - - // FullRevisionUncompressed (C L T) - part = new DiffPart(DiffAction.FULL_REVISION_UNCOMPRESSED); - - // L T - text = revision.getRevisionText(); - revCurrent = text.toCharArray(); - - part.setText(text); - codecData.checkBlocksizeL(text.getBytes(WIKIPEDIA_ENCODING).length); - - diff.add(part); - - diff.setCodecData(codecData); - return diff; - } - - /** - * Transmits a partial DiffTask. 
- * - * @param result Reference to the DiffTask - * @throws TimeoutException if a timeout occurred - */ - protected void transmitPartialTask(final Task result) - throws TimeoutException { - - if (this.partCounter == 1) { - - this.result.setTaskType(TaskTypes.TASK_PARTIAL_FIRST); - this.taskTransmitter.transmitDiff(result); - - } else { - - this.result.setTaskType(TaskTypes.TASK_PARTIAL); - this.taskTransmitter.transmitPartialDiff(result); +public class DiffCalculator + implements DiffCalculatorInterface +{ + + /** + * Configuration parameter - Flag, which indicates whether debug output is enabled or not + */ + private final boolean MODE_DEBUG_OUTPUT_ACTIVATED; + + /** + * Configuration parameter - Path for the DiffTool logger + */ + private final String LOGGING_PATH_DIFFTOOL; + + /** + * Configuration parameter - Path for the debug logger + */ + private final String LOGGING_PATH_DEBUG; + + /** + * Configuration parameter - Each x-th version is a full revision + */ + private final int COUNTER_FULL_REVISION; + + /** + * Configuration parameter - Maximum size of a diff statement + */ + private final long LIMIT_TASK_SIZE_DIFFS; + + /** + * Configuration parameter - Charset name of the input data + */ + private final String WIKIPEDIA_ENCODING; + + /** + * Configuration parameter - Flag, which indicates whether the verification of the diff is + * enabled or not + */ + private final boolean VERIFICATION_DIFF; + + /** + * Configuration parameter - Value of the minimum legal substring + */ + private final int VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING; + + /** + * Configuration parameter - Surrogate Mode + */ + private final SurrogateModes MODE_SURROGATES; + + /** + * Reference to the TransTransmitter + */ + private final TaskTransmitterInterface taskTransmitter; + + /** + * Reference to the BlockManager + */ + private final BlockManagementInterface blocks; + + @Override + public void closeTransmitter() throws IOException, SQLException + { + this.taskTransmitter.close(); } - } - - /** - * Transmits the DiffTask at the end of the RevisionTask processing. - * - * @param task Reference to the RevisionTask - * @param result Reference to the DiffTask - * @throws TimeoutException if a timeout occurred - */ - protected void transmitAtEndOfTask(final Task task, - final Task result) - throws TimeoutException { - - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { - - if (this.partCounter > 1) { - this.result.setTaskType(TaskTypes.TASK_PARTIAL_LAST); - this.taskTransmitter.transmitPartialDiff(result); - } else { - this.result.setTaskType(TaskTypes.TASK_FULL); - this.taskTransmitter.transmitDiff(result); - } - - this.result = null; + + /** + * (Constructor) Creates a new DiffCalculator object. 
+ * + * @param taskTransmitter + * Reference to the TaskTransmitter + * @throws ConfigurationException + * if an error occurred while accessing the configuration + */ + public DiffCalculator(final TaskTransmitterInterface taskTransmitter) + throws ConfigurationException + { + this.taskTransmitter = taskTransmitter; + this.blocks = new BlockManagement(); + + this.articleID = -1; + this.partCounter = 0; + + // Load config parameters + ConfigurationManager config = ConfigurationManager.getInstance(); + + MODE_DEBUG_OUTPUT_ACTIVATED = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT); + + LOGGING_PATH_DIFFTOOL = (String) config + .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); + LOGGING_PATH_DEBUG = (String) config + .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG); + + COUNTER_FULL_REVISION = (Integer) config + .getConfigParameter(ConfigurationKeys.COUNTER_FULL_REVISION); + LIMIT_TASK_SIZE_DIFFS = (Long) config + .getConfigParameter(ConfigurationKeys.LIMIT_TASK_SIZE_DIFFS); + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + VERIFICATION_DIFF = (Boolean) config + .getConfigParameter(ConfigurationKeys.VERIFICATION_DIFF); + + VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING = (Integer) config + .getConfigParameter(ConfigurationKeys.VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING); + MODE_SURROGATES = (SurrogateModes) config + .getConfigParameter(ConfigurationKeys.MODE_SURROGATES); } - } - - /** - * Calculates the diff for the given revision. - * - * @param revision Reference to a revision - * @return Diff - * @throws UnsupportedEncodingException if the character encoding is unsupported - */ - protected Diff processRevision(final Revision revision) - throws UnsupportedEncodingException { - - // ----------------------------------------------------// - // ** HERE IS THE POINT TO INCLUDE ADDITIONAL FILTERS // - // TO REMOVE FAULTY REVISIONS FROM FURTHER PROCESSING // - // ----------------------------------------------------// - - try { - if (revision.getRevisionText() == null) { - return null; - } - } catch (NullPointerException e) { - return null; + + /*--------------------------------------------------------------------------*/ + + /** + * Temporary variable - ID of the currently processed article + */ + private int articleID; + + /** + * Temporary variable - Storage for the diffs + */ + private Task result; + + /** + * Temporary variable - Revision Counter + */ + private int revisionCounter; + + /** + * Temporary variable - Part Counter + */ + private int partCounter; + + /** + * Temporary variable - Diff Part + */ + private DiffPart part; + + /** + * Temporary variable - content + */ + private String text; + + /** + * Temporary variable - previous revision + */ + private char[] revPrevious; + + /** + * Temporary variable - current revision + */ + private char[] revCurrent; + + /** + * Temporary variable - temporary revision + */ + private char[] revTemp; + + /** + * Temporary variable - Block Counter + */ + private int blockCount; + + /** + * Temporary variable - Used to mark used characters of the previous revision + */ + private boolean[] revABlocked; + + /** + * Temporary variable - Used to mark used characters of the current revision + */ + private boolean[] revBBlocked; + + /** + * Temporary variable - Mapping of characters and their positions in the previous revision + */ + private HashMap> positions; + + /** + * Temporary variable - Queue for blocks of the previous revision + */ + private ArrayList queueA; + 
+ /** + * Temporary variable - Queue for blocks of the current revision + */ + private ArrayList queueB; + + /** + * Temporary variable - size of the longest matching substring + */ + private int longestMatch_size; + + /** + * Temporary variable - start position of the longest matching substring + */ + private int longestMatch_start; + + /*--------------------------------------------------------------------------*/ + + /** + * Initializes the processing of a RevisionTask using a new DiffTask. + * + * @param task + * Reference to the DiffTask + */ + private void init(final Task task) + { + this.partCounter++; + this.result = new Task<>(task.getHeader(), partCounter); } - revTemp = revision.getRevisionText().toCharArray(); + /** + * Initializes the processing of a new RevisionTask. + * + * @param taskID + * Article ID + */ + protected void initNewTask(final int taskID) + { - if (MODE_SURROGATES == SurrogateModes.DISCARD_REVISION) { + this.articleID = taskID; - // Ignore Revision with surrogate characters - if (Surrogates.scan(revTemp)) { - return null; - } + this.partCounter = 0; + this.revisionCounter = 0; + + this.revPrevious = null; + this.revCurrent = null; } - Diff diff; + /** + * Generates a FullRevision. + * + * @param revision + * Reference to the revision + * @return Diff, containing a FullRevision + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + */ + private Diff generateFullRevision(final Revision revision) throws UnsupportedEncodingException + { - // Full revision - if (revisionCounter % COUNTER_FULL_REVISION == 0) { + Diff diff = new Diff(); + RevisionCodecData codecData = new RevisionCodecData(); - diff = generateFullRevision(revision); + // FullRevisionUncompressed (C L T) + part = new DiffPart(DiffAction.FULL_REVISION_UNCOMPRESSED); - // Diffed revision - } else { + // L T + text = revision.getRevisionText(); + revCurrent = text.toCharArray(); - diff = generateDiff(revPrevious, revTemp); + part.setText(text); + codecData.checkBlocksizeL(text.getBytes(WIKIPEDIA_ENCODING).length); - // if the current revision is identical to the last valid revision - if (diff.size() == 0) { - return null; - } - } + diff.add(part); - return diff; - } + diff.setCodecData(codecData); + return diff; + } - /* - * (non-Javadoc) - * - * @see - * de.tud.ukp.kulessa.delta.consumers.diff.calculation.DiffCalculatorInterface - * #process(de.tud.ukp.kulessa.delta.data.Task) - */ - public void process(final Task task) - throws DiffException, TimeoutException, UnsupportedEncodingException { - // this.startTime = System.currentTimeMillis(); - Revision revision; + /** + * Transmits a partial DiffTask. 
+ * + * @param result + * Reference to the DiffTask + * @throws TimeoutException + * if a timeout occurred + */ + protected void transmitPartialTask(final Task result) throws TimeoutException + { - // check if a new task was received - if (articleID != task.getHeader().getArticleId()) { + if (this.partCounter == 1) { - // init settings - initNewTask(task.getHeader().getArticleId()); - init(task); + this.result.setTaskType(TaskTypes.TASK_PARTIAL_FIRST); + this.taskTransmitter.transmitDiff(result); - // check if old task was complete - } else if (result == null) { + } + else { - init(task); + this.result.setTaskType(TaskTypes.TASK_PARTIAL); + this.taskTransmitter.transmitPartialDiff(result); + } } - Diff diff; - - // TODO: Chronological order hotfix - - // does not work for articles that are split across multiple tasks - ArrayList list = task.getContainer(); - Collections.sort(list); - - int i, rSize = list.size(); - - for (i = 0; i < rSize; i++) { - - if (result.byteSize() > LIMIT_TASK_SIZE_DIFFS) { - - transmitPartialTask(result); - init(task); - } - - // Store previous revision - revPrevious = revCurrent; - - // Process next revision - revision = list.get(i); - - diff = processRevision(revision); - - if (diff != null) { - - revCurrent = revTemp; - - // Add to result - revisionCounter++; - - diff.setRevisionCoutner(revisionCounter); - diff.setRevisionID(revision.getRevisionID()); - diff.setTimeStamp(revision.getTimeStamp()); - diff.setComment(revision.getComment()); - diff.setContributorName(revision.getContributorName()); - diff.setContributorId(revision.getContributorId()); - diff.setContributorIsRegistered(revision.contributorIsRegistered()); - diff.setMinor(revision.isMinor()); - - result.add(diff); - - // Verification - if (VERIFICATION_DIFF) { - String revC, revP; - try { - revC = String.valueOf(revCurrent); - revP = diff.buildRevision(revPrevious); - - /* - * WRONG LOCATION if (notEqual && MODE_SURROGATES == - * SurrogateModes.REPLACE) { - * - * // TODO: TEST: if (Surrogates.scan(revCurrent)) { - * - * char[] repCurrent = Surrogates.replace(revCurrent); - * char[] repPrevious = Surrogates.replace(revPrevious); - * - * revC = String.valueOf(repCurrent); revP = - * diff.buildRevision(repPrevious); - * - * notEqual = !revC.equals(revP); } } - */ - - if (!revC.equals(revP)) { - - if (MODE_DEBUG_OUTPUT_ACTIVATED) { - WikipediaXMLWriter writer = new WikipediaXMLWriter( - LOGGING_PATH_DIFFTOOL - + LOGGING_PATH_DEBUG - + task.getHeader() - .getArticleName() - + ".dbg"); - - writer.writeRevision(task); - writer.close(); - } - - throw ErrorFactory - .createDiffException( - ErrorKeys.DIFFTOOL_DIFFCONSUMER_DIFF_VERIFICATION_FAILED, - "Reconstruction of " - + task - + " failed at revision " - + revisionCounter + "."); + /** + * Transmits the DiffTask at the end of the RevisionTask processing. 
+ * + * @param task + * Reference to the RevisionTask + * @param result + * Reference to the DiffTask + * @throws TimeoutException + * if a timeout occurred + */ + protected void transmitAtEndOfTask(final Task task, final Task result) + throws TimeoutException + { + + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + + if (this.partCounter > 1) { + this.result.setTaskType(TaskTypes.TASK_PARTIAL_LAST); + this.taskTransmitter.transmitPartialDiff(result); + } + else { + this.result.setTaskType(TaskTypes.TASK_FULL); + this.taskTransmitter.transmitDiff(result); } - // Throw again - } catch (DiffException e) { - throw e; - - // Catch unexpected exceptions - } catch (Exception e) { - throw ErrorFactory - .createDiffException( - ErrorKeys.DIFFTOOL_DIFFCONSUMER_DIFF_VERIFICATION_FAILED, - "Reconstruction of " + task - + " failed at revision " - + revisionCounter + ".", e); - } + this.result = null; } - } } - transmitAtEndOfTask(task, result); - } - - /** - * Generates a Diff by using the CommonLongestSubstring search. - * - * @param revA previous revision - * @param revB current revision - * @return Diff - * @throws UnsupportedEncodingException if the character encoding is unsupported - */ - private Diff generateDiff(final char[] revA, final char[] revB) - throws UnsupportedEncodingException { - - blockCount = 0; - queueA = new ArrayList<>(); - queueB = new ArrayList<>(); - - revABlocked = new boolean[revA.length]; - revBBlocked = new boolean[revB.length]; - - int revAStartIndex = 0, revAEndIndex = revA.length - 1; - int revBStartIndex = 0, revBEndIndex = revB.length - 1; - - while (revAStartIndex <= revAEndIndex && revBStartIndex <= revBEndIndex - && revA[revAStartIndex] == revB[revBStartIndex]) { - - revABlocked[revAStartIndex] = true; - revBBlocked[revBStartIndex] = true; - revAStartIndex++; - revBStartIndex++; - } + /** + * Calculates the diff for the given revision. 
+ * + * @param revision + * Reference to a revision + * @return Diff + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + */ + protected Diff processRevision(final Revision revision) throws UnsupportedEncodingException + { + + // ----------------------------------------------------// + // ** HERE IS THE POINT TO INCLUDE ADDITIONAL FILTERS // + // TO REMOVE FAULTY REVISIONS FROM FURTHER PROCESSING // + // ----------------------------------------------------// + + try { + if (revision.getRevisionText() == null) { + return null; + } + } + catch (NullPointerException e) { + return null; + } - // First Block - if (revAStartIndex != 0) { - queueA.add(new DiffBlock(this.blockCount, 0, revAStartIndex, 0, - revBStartIndex, true)); - queueB.add(new DiffBlock(this.blockCount, 0, revAStartIndex, 0, - revBStartIndex, false)); - this.blockCount++; - } + revTemp = revision.getRevisionText().toCharArray(); - while (revAStartIndex < revAEndIndex && revBStartIndex < revBEndIndex - && revA[revAEndIndex] == revB[revBEndIndex]) { + if (MODE_SURROGATES == SurrogateModes.DISCARD_REVISION) { - revABlocked[revAEndIndex] = true; - revBBlocked[revBEndIndex] = true; - revAEndIndex--; - revBEndIndex--; - } + // Ignore Revision with surrogate characters + if (Surrogates.scan(revTemp)) { + return null; + } + } - // Last Block - if (revAEndIndex + 1 != revA.length) { - queueA.add(new DiffBlock(this.blockCount, revAEndIndex + 1, - revA.length, revBEndIndex + 1, revB.length, true)); - queueB.add(new DiffBlock(this.blockCount, revAEndIndex + 1, - revA.length, revBEndIndex + 1, revB.length, false)); - this.blockCount++; - } + Diff diff; - scan(revA, revAStartIndex, revAEndIndex); + // Full revision + if (revisionCounter % COUNTER_FULL_REVISION == 0) { - ArrayList list; - char c; + diff = generateFullRevision(revision); - int i = revBStartIndex; - while (i < revBEndIndex) { + // Diffed revision + } + else { - c = revB[i]; - list = positions.get(c); + diff = generateDiff(revPrevious, revTemp); - if (list != null && findLongestMatch(revA, list, revB, i)) { + // if the current revision is identical to the last valid revision + if (diff.size() == 0) { + return null; + } + } - i += longestMatch_size; - } else { - i++; - } + return diff; } - int j; - for (i = revAStartIndex; i <= revAEndIndex; i++) { - if (!revABlocked[i]) { - j = i; - while (i + 1 <= revAEndIndex && !revABlocked[++i]) { + /* + * (non-Javadoc) + * + * @see de.tud.ukp.kulessa.delta.consumers.diff.calculation.DiffCalculatorInterface + * #process(de.tud.ukp.kulessa.delta.data.Task) + */ + public void process(final Task task) + throws DiffException, TimeoutException, UnsupportedEncodingException + { + // this.startTime = System.currentTimeMillis(); + Revision revision; + + // check if a new task was received + if (articleID != task.getHeader().getArticleId()) { + + // init settings + initNewTask(task.getHeader().getArticleId()); + init(task); + + // check if old task was complete + } + else if (result == null) { + + init(task); } - if (i + 1 > revAEndIndex) { - i++; + Diff diff; + + // TODO: Chronological order hotfix - + // does not work for articles that are split across multiple tasks + ArrayList list = task.getContainer(); + Collections.sort(list); + + int i, rSize = list.size(); + + for (i = 0; i < rSize; i++) { + + if (result.byteSize() > LIMIT_TASK_SIZE_DIFFS) { + + transmitPartialTask(result); + init(task); + } + + // Store previous revision + revPrevious = revCurrent; + + // Process next revision + revision = list.get(i); + + 
diff = processRevision(revision); + + if (diff != null) { + + revCurrent = revTemp; + + // Add to result + revisionCounter++; + + diff.setRevisionCoutner(revisionCounter); + diff.setRevisionID(revision.getRevisionID()); + diff.setTimeStamp(revision.getTimeStamp()); + diff.setComment(revision.getComment()); + diff.setContributorName(revision.getContributorName()); + diff.setContributorId(revision.getContributorId()); + diff.setContributorIsRegistered(revision.contributorIsRegistered()); + diff.setMinor(revision.isMinor()); + + result.add(diff); + + // Verification + if (VERIFICATION_DIFF) { + String revC, revP; + try { + revC = String.valueOf(revCurrent); + revP = diff.buildRevision(revPrevious); + + /* + * WRONG LOCATION if (notEqual && MODE_SURROGATES == SurrogateModes.REPLACE) + * { + * + * // TODO: TEST: if (Surrogates.scan(revCurrent)) { + * + * char[] repCurrent = Surrogates.replace(revCurrent); char[] repPrevious = + * Surrogates.replace(revPrevious); + * + * revC = String.valueOf(repCurrent); revP = + * diff.buildRevision(repPrevious); + * + * notEqual = !revC.equals(revP); } } + */ + + if (!revC.equals(revP)) { + + if (MODE_DEBUG_OUTPUT_ACTIVATED) { + WikipediaXMLWriter writer = new WikipediaXMLWriter( + LOGGING_PATH_DIFFTOOL + LOGGING_PATH_DEBUG + + task.getHeader().getArticleName() + ".dbg"); + + writer.writeRevision(task); + writer.close(); + } + + throw ErrorFactory.createDiffException( + ErrorKeys.DIFFTOOL_DIFFCONSUMER_DIFF_VERIFICATION_FAILED, + "Reconstruction of " + task + " failed at revision " + + revisionCounter + "."); + } + + // Throw again + } + catch (DiffException e) { + throw e; + + // Catch unexpected exceptions + } + catch (Exception e) { + throw ErrorFactory.createDiffException( + ErrorKeys.DIFFTOOL_DIFFCONSUMER_DIFF_VERIFICATION_FAILED, + "Reconstruction of " + task + " failed at revision " + + revisionCounter + ".", + e); + } + } + } } - queueA.add(new DiffBlock(-1, j, i, -1, -1, true)); - } + transmitAtEndOfTask(task, result); } - for (i = revBStartIndex; i <= revBEndIndex; i++) { - if (!revBBlocked[i]) { - j = i; - while (i + 1 <= revBEndIndex && !revBBlocked[++i]) { + /** + * Generates a Diff by using the CommonLongestSubstring search. 
+ * + * @param revA + * previous revision + * @param revB + * current revision + * @return Diff + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + */ + private Diff generateDiff(final char[] revA, final char[] revB) + throws UnsupportedEncodingException + { + + blockCount = 0; + queueA = new ArrayList<>(); + queueB = new ArrayList<>(); + + revABlocked = new boolean[revA.length]; + revBBlocked = new boolean[revB.length]; + + int revAStartIndex = 0, revAEndIndex = revA.length - 1; + int revBStartIndex = 0, revBEndIndex = revB.length - 1; + + while (revAStartIndex <= revAEndIndex && revBStartIndex <= revBEndIndex + && revA[revAStartIndex] == revB[revBStartIndex]) { + + revABlocked[revAStartIndex] = true; + revBBlocked[revBStartIndex] = true; + revAStartIndex++; + revBStartIndex++; } - if (i + 1 > revBEndIndex) { - i++; + // First Block + if (revAStartIndex != 0) { + queueA.add(new DiffBlock(this.blockCount, 0, revAStartIndex, 0, revBStartIndex, true)); + queueB.add(new DiffBlock(this.blockCount, 0, revAStartIndex, 0, revBStartIndex, false)); + this.blockCount++; } - queueB.add(new DiffBlock(-1, -1, -1, j, i, false)); - } - } + while (revAStartIndex < revAEndIndex && revBStartIndex < revBEndIndex + && revA[revAEndIndex] == revB[revBEndIndex]) { - Collections.sort(queueA); - Collections.sort(queueB); + revABlocked[revAEndIndex] = true; + revBBlocked[revBEndIndex] = true; + revAEndIndex--; + revBEndIndex--; + } - return blocks.manage(revA, revB, queueA, queueB); - } + // Last Block + if (revAEndIndex + 1 != revA.length) { + queueA.add(new DiffBlock(this.blockCount, revAEndIndex + 1, revA.length, + revBEndIndex + 1, revB.length, true)); + queueB.add(new DiffBlock(this.blockCount, revAEndIndex + 1, revA.length, + revBEndIndex + 1, revB.length, false)); + this.blockCount++; + } - /** - * Scans the input and creates the character -> position mapping. 
- * - * @param input character array - * @param start start position - * @param end end position - */ - private void scan(final char[] input, final int start, final int end) { + scan(revA, revAStartIndex, revAEndIndex); - this.positions = new HashMap<>(); - ArrayList list; + ArrayList list; + char c; - char c; - for (int i = start; i < end; i++) { - c = input[i]; + int i = revBStartIndex; + while (i < revBEndIndex) { - list = positions.computeIfAbsent(c, k -> new ArrayList<>()); + c = revB[i]; + list = positions.get(c); - list.add(i); - } - } - - /** - * Searches the longest common substring - * - * @param revA current revision - * @param list list of start positions for this substring search - * @param revB previous revision - * @param index start index previous revision - * @return TRUE if a legal substring was found FALSE otherwise - */ - private boolean findLongestMatch(final char[] revA, - final ArrayList list, final char[] revB, final int index) { - - int match; - longestMatch_size = -1; - - int size = list.size(); - int revAsize = revA.length; - int revBsize = revB.length; - - int start, end, count; - for (int i = 0; i < size; i++) { - - start = list.get(i); - if (!revABlocked[start] && !revBBlocked[index + 1]) { - - count = index + 1; - end = start + 1; - - while (end < revAsize && count < revBsize - && revA[end] == revB[count] && !revABlocked[end] - && !revBBlocked[count]) { - end++; - count++; + if (list != null && findLongestMatch(revA, list, revB, i)) { + + i += longestMatch_size; + } + else { + i++; + } + } + + int j; + for (i = revAStartIndex; i <= revAEndIndex; i++) { + if (!revABlocked[i]) { + j = i; + while (i + 1 <= revAEndIndex && !revABlocked[++i]) { + } + + if (i + 1 > revAEndIndex) { + i++; + } + + queueA.add(new DiffBlock(-1, j, i, -1, -1, true)); + } } - match = end - start; - if (match > longestMatch_size) { - longestMatch_size = match; - longestMatch_start = start; + for (i = revBStartIndex; i <= revBEndIndex; i++) { + if (!revBBlocked[i]) { + j = i; + while (i + 1 <= revBEndIndex && !revBBlocked[++i]) { + } + + if (i + 1 > revBEndIndex) { + i++; + } + + queueB.add(new DiffBlock(-1, -1, -1, j, i, false)); + } } - } + + Collections.sort(queueA); + Collections.sort(queueB); + + return blocks.manage(revA, revB, queueA, queueB); } - if (longestMatch_size <= VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING) { - return false; + /** + * Scans the input and creates the character -> position mapping. 
+ * + * @param input + * character array + * @param start + * start position + * @param end + * end position + */ + private void scan(final char[] input, final int start, final int end) + { + + this.positions = new HashMap<>(); + ArrayList list; + + char c; + for (int i = start; i < end; i++) { + c = input[i]; + + list = positions.computeIfAbsent(c, k -> new ArrayList<>()); + + list.add(i); + } } - queueA.add(new DiffBlock(this.blockCount, longestMatch_start, - longestMatch_start + longestMatch_size, index, index - + longestMatch_size, true)); - queueB.add(new DiffBlock(this.blockCount, longestMatch_start, - longestMatch_start + longestMatch_size, index, index - + longestMatch_size, false)); + /** + * Searches the longest common substring + * + * @param revA + * current revision + * @param list + * list of start positions for this substring search + * @param revB + * previous revision + * @param index + * start index previous revision + * @return TRUE if a legal substring was found FALSE otherwise + */ + private boolean findLongestMatch(final char[] revA, final ArrayList list, + final char[] revB, final int index) + { + + int match; + longestMatch_size = -1; + + int size = list.size(); + int revAsize = revA.length; + int revBsize = revB.length; + + int start, end, count; + for (int i = 0; i < size; i++) { + + start = list.get(i); + if (!revABlocked[start] && !revBBlocked[index + 1]) { + + count = index + 1; + end = start + 1; + + while (end < revAsize && count < revBsize && revA[end] == revB[count] + && !revABlocked[end] && !revBBlocked[count]) { + end++; + count++; + } + + match = end - start; + if (match > longestMatch_size) { + longestMatch_size = match; + longestMatch_start = start; + } + } + } + + if (longestMatch_size <= VALUE_MINIMUM_LONGEST_COMMON_SUBSTRING) { + return false; + } + + queueA.add(new DiffBlock(this.blockCount, longestMatch_start, + longestMatch_start + longestMatch_size, index, index + longestMatch_size, true)); + queueB.add(new DiffBlock(this.blockCount, longestMatch_start, + longestMatch_start + longestMatch_size, index, index + longestMatch_size, false)); - blockCount++; + blockCount++; - for (int i = 0, j = longestMatch_start, k = index; i < longestMatch_size; i++, j++, k++) { - revABlocked[j] = true; - revBBlocked[k] = true; + for (int i = 0, j = longestMatch_start, k = index; i < longestMatch_size; i++, j++, k++) { + revABlocked[j] = true; + revBBlocked[k] = true; + } + + return true; } - return true; - } - - @Override - public void reset() { - this.result = null; - } + @Override + public void reset() + { + this.result = null; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/TimedDiffCalculator.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/TimedDiffCalculator.java index 51d8f912..d4363d11 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/TimedDiffCalculator.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/diff/calculation/TimedDiffCalculator.java @@ -32,139 +32,160 @@ /** * Calculates the Diff while collecting statistical data. 
*/ -public class TimedDiffCalculator extends DiffCalculator { - - /** - * Temporary variable - revision counter - */ - private int revisionCounter; - - /** - * Temporary variable - diff part counter - */ - private int diffPartCounter; - - /** - * Temporary variable - size of the diff - */ - private long diffedSize; - - /** - * Temporary variable - start time of the diff processing - */ - private long startTime; - - /** - * Temporary variable - time used for the diff processing - */ - private long processingTimeDiff; - - /** - * Temporary variable - number of ignored revisions - */ - private int ignoredRevisionsCounter; - - /** - * (Constructor) Creates a new DiffCalculator object. - * - * @param taskTransmitter Reference to the TaskTransmitter - * @throws ConfigurationException if an error occurred while accessing the configuration - */ - public TimedDiffCalculator(final TaskTransmitterInterface taskTransmitter) throws ConfigurationException { - super(taskTransmitter); - } - - /*--------------------------------------------------------------------------*/ - - /** - * Initializes the processing of a new RevisionTask. - * - * @param taskID Article ID - */ - protected void initNewTask(final int taskID) { - - super.initNewTask(taskID); - - this.processingTimeDiff = 0; - - this.revisionCounter = 0; - this.ignoredRevisionsCounter = 0; - - this.diffPartCounter = 0; - this.diffedSize = 0; - } - - /** - * Transmits a partial DiffTask. - * - * @param result Reference to the DiffTask - * @throws TimeoutException if a timeout occurred - */ - protected void transmitPartialTask(final Task result) throws TimeoutException { - - this.diffedSize += result.byteSize(); - this.processingTimeDiff += System.currentTimeMillis() - startTime; - - super.transmitPartialTask(result); - - startTime = System.currentTimeMillis(); - } - - /** - * Transmits the DiffTask at the end of the RevisionTask processing. - * - * @param task Reference to the RevisionTask - * @param result Reference to the DiffTask - * @throws TimeoutException if a timeout occurred - */ - protected void transmitAtEndOfTask(final Task task, final Task result) throws TimeoutException { - - this.processingTimeDiff += System.currentTimeMillis() - startTime; - - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { - - diffedSize += result.byteSize(); - - ArticleInformation info = result.getHeader(); - info.setRevisionCounter(revisionCounter); - info.setIgnoredRevisionsCounter(ignoredRevisionsCounter); - info.setDiffedSize(diffedSize); - info.setDiffPartCounter(diffPartCounter); - info.setProcessingTimeRead(task.getHeader().getProcessingTimeRead()); - info.setProcessingTimeDiff(processingTimeDiff); +public class TimedDiffCalculator + extends DiffCalculator +{ + + /** + * Temporary variable - revision counter + */ + private int revisionCounter; + + /** + * Temporary variable - diff part counter + */ + private int diffPartCounter; + + /** + * Temporary variable - size of the diff + */ + private long diffedSize; + + /** + * Temporary variable - start time of the diff processing + */ + private long startTime; + + /** + * Temporary variable - time used for the diff processing + */ + private long processingTimeDiff; + + /** + * Temporary variable - number of ignored revisions + */ + private int ignoredRevisionsCounter; + + /** + * (Constructor) Creates a new DiffCalculator object. 
+ * + * @param taskTransmitter + * Reference to the TaskTransmitter + * @throws ConfigurationException + * if an error occurred while accessing the configuration + */ + public TimedDiffCalculator(final TaskTransmitterInterface taskTransmitter) + throws ConfigurationException + { + super(taskTransmitter); } - super.transmitAtEndOfTask(task, result); - } - - /** - * Calculates the diff for the given revision. - * - * @param revision Reference to a revision - * @return Diff - * @throws UnsupportedEncodingException if the character encoding is unsupported - */ - protected Diff processRevision(final Revision revision) throws UnsupportedEncodingException { - - Diff diff = super.processRevision(revision); - if (diff == null) { - this.ignoredRevisionsCounter++; - } else { - this.revisionCounter++; - this.diffPartCounter += diff.size(); + /*--------------------------------------------------------------------------*/ + + /** + * Initializes the processing of a new RevisionTask. + * + * @param taskID + * Article ID + */ + protected void initNewTask(final int taskID) + { + + super.initNewTask(taskID); + + this.processingTimeDiff = 0; + + this.revisionCounter = 0; + this.ignoredRevisionsCounter = 0; + + this.diffPartCounter = 0; + this.diffedSize = 0; + } + + /** + * Transmits a partial DiffTask. + * + * @param result + * Reference to the DiffTask + * @throws TimeoutException + * if a timeout occurred + */ + protected void transmitPartialTask(final Task result) throws TimeoutException + { + + this.diffedSize += result.byteSize(); + this.processingTimeDiff += System.currentTimeMillis() - startTime; + + super.transmitPartialTask(result); + + startTime = System.currentTimeMillis(); } - return diff; - } + /** + * Transmits the DiffTask at the end of the RevisionTask processing. + * + * @param task + * Reference to the RevisionTask + * @param result + * Reference to the DiffTask + * @throws TimeoutException + * if a timeout occurred + */ + protected void transmitAtEndOfTask(final Task task, final Task result) + throws TimeoutException + { + + this.processingTimeDiff += System.currentTimeMillis() - startTime; + + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + + diffedSize += result.byteSize(); + + ArticleInformation info = result.getHeader(); + info.setRevisionCounter(revisionCounter); + info.setIgnoredRevisionsCounter(ignoredRevisionsCounter); + info.setDiffedSize(diffedSize); + info.setDiffPartCounter(diffPartCounter); + info.setProcessingTimeRead(task.getHeader().getProcessingTimeRead()); + info.setProcessingTimeDiff(processingTimeDiff); + } + + super.transmitAtEndOfTask(task, result); + } - /*--------------------------------------------------------------------------*/ + /** + * Calculates the diff for the given revision. 
+ * + * @param revision + * Reference to a revision + * @return Diff + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + */ + protected Diff processRevision(final Revision revision) throws UnsupportedEncodingException + { + + Diff diff = super.processRevision(revision); + if (diff == null) { + this.ignoredRevisionsCounter++; + } + else { + this.revisionCounter++; + this.diffPartCounter += diff.size(); + } + + return diff; + } + + /*--------------------------------------------------------------------------*/ - @Override - public void process(final Task task) - throws DiffException, TimeoutException, UnsupportedEncodingException { + @Override + public void process(final Task task) + throws DiffException, TimeoutException, UnsupportedEncodingException + { - this.startTime = System.currentTimeMillis(); - super.process(task); - } + this.startTime = System.currentTimeMillis(); + super.process(task); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/SQLEscape.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/SQLEscape.java index 6d54981a..5d1735bd 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/SQLEscape.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/SQLEscape.java @@ -18,69 +18,73 @@ package org.dkpro.jwpl.revisionmachine.difftool.consumer.dump; /** - * The single method {@link SQLEscape#escape(String)} removes all unwanted escape - * characters from a string to make is UNCOMPRESSED conform. Maybe not thread-save. + * The single method {@link SQLEscape#escape(String)} removes all unwanted escape characters from a + * string to make is UNCOMPRESSED conform. Maybe not thread-save. *

* Copied from the WikiMachine to avoid having to add dependency. */ -public class SQLEscape { - private SQLEscape() { +public class SQLEscape +{ + private SQLEscape() + { - } + } - /** - * @param str unescaped String - * @return String with escape characters - * @see SQLEscape - */ - public static String escape(String str) { - final int len = str.length(); + /** + * @param str + * unescaped String + * @return String with escape characters + * @see SQLEscape + */ + public static String escape(String str) + { + final int len = str.length(); - // maybe the StringBuffer would be safer? - StringBuilder sql = new StringBuilder(len * 2); + // maybe the StringBuffer would be safer? + StringBuilder sql = new StringBuilder(len * 2); - for (int i = 0; i < len; i++) { - char c = str.charAt(i); - switch (c) { - case '\u0000': - sql.append('\\').append('0'); - break; - case '\n': - sql.append('\\').append('n'); - break; - case '\t': - sql.append('\\').append('t'); - break; - case '\r': - sql.append('\\').append('r'); - break; - case '\u001a': - sql.append('\\').append('Z'); - break; - case '\'': - sql.append('\\').append('\''); - break; - case '\"': - sql.append('\\').append('"'); - break; - case '\b': - sql.append('\\').append('b'); - break; - case '\\': - sql.append('\\').append('\\'); - break; -// case '%': -// sql.append('[').append('%').append(']'); -// break; -// case '_': -// sql.append('[').append('_').append(']'); -// break; - default: - sql.append(c); - break; - } + for (int i = 0; i < len; i++) { + char c = str.charAt(i); + switch (c) { + case '\u0000': + sql.append('\\').append('0'); + break; + case '\n': + sql.append('\\').append('n'); + break; + case '\t': + sql.append('\\').append('t'); + break; + case '\r': + sql.append('\\').append('r'); + break; + case '\u001a': + sql.append('\\').append('Z'); + break; + case '\'': + sql.append('\\').append('\''); + break; + case '\"': + sql.append('\\').append('"'); + break; + case '\b': + sql.append('\\').append('b'); + break; + case '\\': + sql.append('\\').append('\\'); + break; + // case '%': + // sql.append('[').append('%').append(']'); + // break; + // case '_': + // sql.append('[').append('_').append(']'); + // break; + default: + sql.append(c); + break; + } + } + return sql.toString(); } - return sql.toString(); - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/WriterInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/WriterInterface.java index 12db8b66..c18d1243 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/WriterInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/WriterInterface.java @@ -28,27 +28,31 @@ /** * Represents the link to the output writer. */ -public interface WriterInterface { +public interface WriterInterface +{ - /** - * This method will process the given DiffTask and send him to the specified - * output. 
- * - * @param task DiffTask - * @throws ConfigurationException if problems occurred while initializing the components - * @throws IOException if problems occurred while writing the output (to file or - * archive) - * @throws SQLConsumerException if problems occurred while writing the output (to the sql - * producer database) - */ - void process(final Task task) throws ConfigurationException, IOException, SQLConsumerException; + /** + * This method will process the given DiffTask and send him to the specified output. + * + * @param task + * DiffTask + * @throws ConfigurationException + * if problems occurred while initializing the components + * @throws IOException + * if problems occurred while writing the output (to file or archive) + * @throws SQLConsumerException + * if problems occurred while writing the output (to the sql producer database) + */ + void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException; - /** - * This method will close the connection to the output. - * - * @throws IOException if problems occurred while closing the file or process. - * @throws SQLException if problems occurred while closing the connection to the - * database. - */ - void close() throws IOException, SQLException; + /** + * This method will close the connection to the output. + * + * @throws IOException + * if problems occurred while closing the file or process. + * @throws SQLException + * if problems occurred while closing the connection to the database. + */ + void close() throws IOException, SQLException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/DataFileEncoder.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/DataFileEncoder.java index 25b3d3b9..2652b29b 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/DataFileEncoder.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/DataFileEncoder.java @@ -33,110 +33,112 @@ /** * Alternative to the SQLEncoder - writes data files instead of UNCOMPRESSED dumps */ -public class DataFileEncoder { - - /** - * Reference to the RevisionApi - */ - private final RevisionEncoderInterface encoder; - - /** - * Last used ID of a full revision - */ - private int lastFullRevID = -1; - - /** - * (Constructor) Creates a new SQLEncoder object. - * - * @throws ConfigurationException if an error occurred while accessing the configuration - */ - public DataFileEncoder() throws ConfigurationException { - this.encoder = new RevisionEncoder(); - } - - /** - * Encodes the diff. 
- * - * @param task Reference to the DiffTask - * @param diff Diff to encode - * @return Base 64 encoded Diff - * @throws UnsupportedEncodingException if the character encoding is unsupported - * @throws EncodingException if the encoding failed - */ - protected String encodeDiff(final Task task, final Diff diff) - throws UnsupportedEncodingException, EncodingException { - - return encoder.encodeDiff(diff.getCodecData(), diff); - } - - public List encodeTask(final Task task) - throws UnsupportedEncodingException, DecodingException, EncodingException { - - // this.task = task; - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST) { - - this.lastFullRevID = -1; +public class DataFileEncoder +{ + + /** + * Reference to the RevisionApi + */ + private final RevisionEncoderInterface encoder; + + /** + * Last used ID of a full revision + */ + private int lastFullRevID = -1; + + /** + * (Constructor) Creates a new SQLEncoder object. + * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + */ + public DataFileEncoder() throws ConfigurationException + { + this.encoder = new RevisionEncoder(); } - int articleId = task.getHeader().getArticleId(); - Diff diff; - - ArrayList list = new ArrayList<>(); - - String tempData; - - int size = task.size(); - for (int i = 0; i < size; i++) { - - diff = task.get(i); - - if (diff.isFullRevision()) { - this.lastFullRevID = diff.getRevisionID(); - } - - /* - * prepare values that might be null - * because we don't want quotes if they are null - * - * Furthermore, escape quote-characters. Quotes are used as the "ENCLOSED BY" character - * in MySQL to mark begin and end of Strings - */ - - //prepare values that might be null - //because we don't want quotes if they are null - String comm = diff.getComment(); - String comment = comm == null ? "\\N" : "\"" + escape(comm) + "\""; - - Integer cId = diff.getContributorId(); - String contributorId = cId == null ? "\\N" : cId.toString(); - - String cName = diff.getContributorName(); - String contributorName = cName == null ? "\\N" : "\"" + escape(cName) + "\""; - - //Prepare the actual data item - tempData = "\\N," - + this.lastFullRevID + "," - + diff.getRevisionCounter() + "," - + diff.getRevisionID() + "," - + articleId + "," - + diff.getTimeStamp().getTime() + ",\"" - + encodeDiff(task, diff) + "\"," - + comment + "," - + (diff.isMinor() ? "1" : "0") + "," - + contributorName + "," - + contributorId + "," - + (diff.getContributorIsRegistered() ? "1" : "0"); - - //add item to the list - list.add(tempData); + /** + * Encodes the diff. 
+ * + * @param task + * Reference to the DiffTask + * @param diff + * Diff to encode + * @return Base 64 encoded Diff + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + * @throws EncodingException + * if the encoding failed + */ + protected String encodeDiff(final Task task, final Diff diff) + throws UnsupportedEncodingException, EncodingException + { + + return encoder.encodeDiff(diff.getCodecData(), diff); } - return list; - } + public List encodeTask(final Task task) + throws UnsupportedEncodingException, DecodingException, EncodingException + { - private String escape(String str) { - return str.replaceAll("\\\\", "\\\\\\\\").replaceAll("\"", "\\\\\""); - } + // this.task = task; + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST) { + + this.lastFullRevID = -1; + } + + int articleId = task.getHeader().getArticleId(); + Diff diff; + + ArrayList list = new ArrayList<>(); + + String tempData; + + int size = task.size(); + for (int i = 0; i < size; i++) { + + diff = task.get(i); + + if (diff.isFullRevision()) { + this.lastFullRevID = diff.getRevisionID(); + } + + /* + * prepare values that might be null because we don't want quotes if they are null + * + * Furthermore, escape quote-characters. Quotes are used as the "ENCLOSED BY" character + * in MySQL to mark begin and end of Strings + */ + + // prepare values that might be null + // because we don't want quotes if they are null + String comm = diff.getComment(); + String comment = comm == null ? "\\N" : "\"" + escape(comm) + "\""; + + Integer cId = diff.getContributorId(); + String contributorId = cId == null ? "\\N" : cId.toString(); + + String cName = diff.getContributorName(); + String contributorName = cName == null ? "\\N" : "\"" + escape(cName) + "\""; + + // Prepare the actual data item + tempData = "\\N," + this.lastFullRevID + "," + diff.getRevisionCounter() + "," + + diff.getRevisionID() + "," + articleId + "," + diff.getTimeStamp().getTime() + + ",\"" + encodeDiff(task, diff) + "\"," + comment + "," + + (diff.isMinor() ? "1" : "0") + "," + contributorName + "," + contributorId + + "," + (diff.getContributorIsRegistered() ? 
"1" : "0"); + + // add item to the list + list.add(tempData); + } + + return list; + } + + private String escape(String str) + { + return str.replaceAll("\\\\", "\\\\\\\\").replaceAll("\"", "\\\\\""); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoder.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoder.java index 09b760bd..16dd6ba7 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoder.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoder.java @@ -46,434 +46,451 @@ /** * This creates the SQL statements */ -public class SQLEncoder implements SQLEncoderInterface { - - /** - * UNCOMPRESSED Statement for tables containing binary encoded diff information - */ - private final String binaryTableRevision; - - /** - * Reference to the RevisionApi - */ - private final RevisionEncoderInterface encoder; - - /** - * Last used ID of a full revision - */ - private int lastFullRevID = -1; - - /** - * Configuration parameter - Maximum size of a sql statement - */ - private final long LIMIT_SQL_STATEMENT_SIZE; - - /** - * Reference to the logger - */ - private final Logger logger; - - /** - * Configuration parameter - Path for the debug logger - */ - private final String LOGGING_PATH_DEBUG; - - /** - * Configuration parameter - Path for the DiffTool logger - */ - private final String LOGGING_PATH_DIFFTOOL; - - /** - * Configuration parameter - Flag, which indicates whether debug output is - * enabled or not - */ - private final boolean MODE_DEBUG_OUTPUT_ACTIVATED; - - /** - * Configuration parameter - Surrogate Mode - */ - private final SurrogateModes MODE_SURROGATES; - - /** - * UNCOMPRESSED Statement for tables containing base 64 encoded diff information - */ - private final String tableRevision; - - /** - * Configuration parameter - Flag, which indicates whether the verification - * of the encoding is enabled or not - */ - private final boolean VERIFICATION_ENCODING; - - /** - * Configuration Parameter - Wikipedia Encoding - */ - private final String WIKIPEDIA_ENCODING; - - /** - * (Constructor) Creates a new SQLEncoder object. 
- * - * @param logger Reference to the logger - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - public SQLEncoder(final Logger logger) - throws ConfigurationException { - - this.logger = logger; - - // Load config parameters - ConfigurationManager config = ConfigurationManager.getInstance(); - - MODE_DEBUG_OUTPUT_ACTIVATED = (Boolean) config - .getConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT); - - VERIFICATION_ENCODING = (Boolean) config - .getConfigParameter(ConfigurationKeys.VERIFICATION_ENCODING); - - LOGGING_PATH_DIFFTOOL = (String) config - .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); - - LOGGING_PATH_DEBUG = (String) config - .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG); - - LIMIT_SQL_STATEMENT_SIZE = (Long) config - .getConfigParameter(ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET); - - MODE_SURROGATES = (SurrogateModes) config - .getConfigParameter(ConfigurationKeys.MODE_SURROGATES); - - WIKIPEDIA_ENCODING = (String) config - .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - - this.encoder = new RevisionEncoder(); - - tableRevision = "CREATE TABLE IF NOT EXISTS revisions (" - + "PrimaryKey INT UNSIGNED NOT NULL AUTO_INCREMENT, " - + "FullRevisionID INTEGER UNSIGNED NOT NULL, " - + "RevisionCounter INTEGER UNSIGNED NOT NULL, " - + "RevisionID INTEGER UNSIGNED NOT NULL, " - + "ArticleID INTEGER UNSIGNED NOT NULL, " - + "Timestamp BIGINT NOT NULL, " - + "Revision MEDIUMTEXT NOT NULL, " - + "Comment MEDIUMTEXT, " - + "Minor TINYINT NOT NULL, " - + "ContributorName TEXT NOT NULL, " - + "ContributorId INTEGER UNSIGNED, " - + "ContributorIsRegistered TINYINT NOT NULL, " - + "PRIMARY KEY(PrimaryKey)" - + ") TYPE = MyISAM DEFAULT CHARSET utf8 COLLATE utf8_general_ci;"; - - binaryTableRevision = "CREATE TABLE IF NOT EXISTS revisions (" - + "PrimaryKey INT UNSIGNED NOT NULL AUTO_INCREMENT, " - + "FullRevisionID INTEGER UNSIGNED NOT NULL, " - + "RevisionCounter INTEGER UNSIGNED NOT NULL, " - + "RevisionID INTEGER UNSIGNED NOT NULL, " - + "ArticleID INTEGER UNSIGNED NOT NULL, " - + "Timestamp BIGINT NOT NULL, " - + "Revision MEDIUMBLOB NOT NULL," - + "Comment MEDIUMTEXT, " - + "Minor TINYINT NOT NULL, " - + "ContributorName TEXT NOT NULL, " - + "ContributorId INTEGER UNSIGNED, " - + "ContributorIsRegistered TINYINT NOT NULL, " - + "PRIMARY KEY(PrimaryKey)" - + ") TYPE = MyISAM DEFAULT CHARSET utf8 COLLATE utf8_general_ci;"; - - } - - protected byte[] binaryDiff(final Task task, final Diff diff) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException { - - RevisionCodecData codecData = diff.getCodecData(); - byte[] encoding = encoder.binaryDiff(codecData, diff); - - if (VERIFICATION_ENCODING) { - RevisionDecoder decoder = new RevisionDecoder(WIKIPEDIA_ENCODING); - decoder.setInput(encoding); - Diff decDiff = decoder.decode(); - - verify(task, decDiff, diff); - } +public class SQLEncoder + implements SQLEncoderInterface +{ + + /** + * UNCOMPRESSED Statement for tables containing binary encoded diff information + */ + private final String binaryTableRevision; + + /** + * Reference to the RevisionApi + */ + private final RevisionEncoderInterface encoder; + + /** + * Last used ID of a full revision + */ + private int lastFullRevID = -1; + + /** + * Configuration parameter - Maximum size of a sql statement + */ + private final long LIMIT_SQL_STATEMENT_SIZE; 
+ + /** + * Reference to the logger + */ + private final Logger logger; + + /** + * Configuration parameter - Path for the debug logger + */ + private final String LOGGING_PATH_DEBUG; + + /** + * Configuration parameter - Path for the DiffTool logger + */ + private final String LOGGING_PATH_DIFFTOOL; + + /** + * Configuration parameter - Flag, which indicates whether debug output is enabled or not + */ + private final boolean MODE_DEBUG_OUTPUT_ACTIVATED; + + /** + * Configuration parameter - Surrogate Mode + */ + private final SurrogateModes MODE_SURROGATES; + + /** + * UNCOMPRESSED Statement for tables containing base 64 encoded diff information + */ + private final String tableRevision; + + /** + * Configuration parameter - Flag, which indicates whether the verification of the encoding is + * enabled or not + */ + private final boolean VERIFICATION_ENCODING; + + /** + * Configuration Parameter - Wikipedia Encoding + */ + private final String WIKIPEDIA_ENCODING; + + /** + * (Constructor) Creates a new SQLEncoder object. + * + * @param logger + * Reference to the logger + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + public SQLEncoder(final Logger logger) throws ConfigurationException + { + + this.logger = logger; + + // Load config parameters + ConfigurationManager config = ConfigurationManager.getInstance(); + + MODE_DEBUG_OUTPUT_ACTIVATED = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_DEBUG_OUTPUT); + + VERIFICATION_ENCODING = (Boolean) config + .getConfigParameter(ConfigurationKeys.VERIFICATION_ENCODING); + + LOGGING_PATH_DIFFTOOL = (String) config + .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DIFFTOOL); + + LOGGING_PATH_DEBUG = (String) config + .getConfigParameter(ConfigurationKeys.LOGGING_PATH_DEBUG); + + LIMIT_SQL_STATEMENT_SIZE = (Long) config + .getConfigParameter(ConfigurationKeys.LIMIT_SQLSERVER_MAX_ALLOWED_PACKET); + + MODE_SURROGATES = (SurrogateModes) config + .getConfigParameter(ConfigurationKeys.MODE_SURROGATES); + + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + + this.encoder = new RevisionEncoder(); + + tableRevision = "CREATE TABLE IF NOT EXISTS revisions (" + + "PrimaryKey INT UNSIGNED NOT NULL AUTO_INCREMENT, " + + "FullRevisionID INTEGER UNSIGNED NOT NULL, " + + "RevisionCounter INTEGER UNSIGNED NOT NULL, " + + "RevisionID INTEGER UNSIGNED NOT NULL, " + "ArticleID INTEGER UNSIGNED NOT NULL, " + + "Timestamp BIGINT NOT NULL, " + "Revision MEDIUMTEXT NOT NULL, " + + "Comment MEDIUMTEXT, " + "Minor TINYINT NOT NULL, " + + "ContributorName TEXT NOT NULL, " + "ContributorId INTEGER UNSIGNED, " + + "ContributorIsRegistered TINYINT NOT NULL, " + "PRIMARY KEY(PrimaryKey)" + + ") TYPE = MyISAM DEFAULT CHARSET utf8 COLLATE utf8_general_ci;"; + + binaryTableRevision = "CREATE TABLE IF NOT EXISTS revisions (" + + "PrimaryKey INT UNSIGNED NOT NULL AUTO_INCREMENT, " + + "FullRevisionID INTEGER UNSIGNED NOT NULL, " + + "RevisionCounter INTEGER UNSIGNED NOT NULL, " + + "RevisionID INTEGER UNSIGNED NOT NULL, " + "ArticleID INTEGER UNSIGNED NOT NULL, " + + "Timestamp BIGINT NOT NULL, " + "Revision MEDIUMBLOB NOT NULL," + + "Comment MEDIUMTEXT, " + "Minor TINYINT NOT NULL, " + + "ContributorName TEXT NOT NULL, " + "ContributorId INTEGER UNSIGNED, " + + "ContributorIsRegistered TINYINT NOT NULL, " + "PRIMARY KEY(PrimaryKey)" + + ") TYPE = MyISAM DEFAULT CHARSET utf8 COLLATE 
utf8_general_ci;"; - return encoding; - } + } - @Override - public SQLEncoding[] binaryTask(final Task task) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException { + protected byte[] binaryDiff(final Task task, final Diff diff) + throws ConfigurationException, UnsupportedEncodingException, DecodingException, + EncodingException, SQLConsumerException + { - // this.task = task; - if (task.getTaskType() == TaskTypes.TASK_FULL || task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST) { - this.lastFullRevID = -1; - } + RevisionCodecData codecData = diff.getCodecData(); + byte[] encoding = encoder.binaryDiff(codecData, diff); - int articleId = task.getHeader().getArticleId(); - Diff diff; + if (VERIFICATION_ENCODING) { + RevisionDecoder decoder = new RevisionDecoder(WIKIPEDIA_ENCODING); + decoder.setInput(encoding); + Diff decDiff = decoder.decode(); - ArrayList list = new ArrayList<>(); + verify(task, decDiff, diff); + } - SQLEncoding revisionsEncoding = new SQLEncoding(); - SQLEncoding usersEncoding = new SQLEncoding(); - revisionsEncoding.append("INSERT INTO revisions VALUES"); - usersEncoding.append("INSERT INTO users VALUES"); + return encoding; + } - byte[] tempBinaryData; - String tempData; + @Override + public SQLEncoding[] binaryTask(final Task task) + throws ConfigurationException, UnsupportedEncodingException, DecodingException, + EncodingException, SQLConsumerException + { - int size = task.size(); - for (int i = 0; i < size; i++) { - diff = task.get(i); + // this.task = task; + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST) { + this.lastFullRevID = -1; + } - /* - * Process revision table - */ - if (diff.isFullRevision()) { - this.lastFullRevID = diff.getRevisionID(); - } + int articleId = task.getHeader().getArticleId(); + Diff diff; - //prepare values that might be null - //because we don't want quotes if they are null - String comm = diff.getComment(); - String comment = comm == null ? null : "'" + comm + "'"; + ArrayList list = new ArrayList<>(); - Integer cId = diff.getContributorId(); - String contributorId = cId == null ? null : cId.toString(); + SQLEncoding revisionsEncoding = new SQLEncoding(); + SQLEncoding usersEncoding = new SQLEncoding(); + revisionsEncoding.append("INSERT INTO revisions VALUES"); + usersEncoding.append("INSERT INTO users VALUES"); - // save the query and binary data temporary - tempData = "(null, " + this.lastFullRevID + "," - + diff.getRevisionCounter() + "," + diff.getRevisionID() - + "," + articleId + "," + diff.getTimeStamp().getTime() - + ",?," + comment + "," + (diff.isMinor() ? "1" : "0") + "," + contributorId + "," + (diff.getContributorIsRegistered() ? 
"1" : "0") + ")"; - tempBinaryData = binaryDiff(task, diff); + byte[] tempBinaryData; + String tempData; - // if the limit would be reached start a new encoding - if ((revisionsEncoding.byteSize() + tempBinaryData.length + tempData.length() >= LIMIT_SQL_STATEMENT_SIZE) && (i != 0)) { - revisionsEncoding.append(";"); - list.add(revisionsEncoding); + int size = task.size(); + for (int i = 0; i < size; i++) { + diff = task.get(i); - revisionsEncoding = new SQLEncoding(); - revisionsEncoding.append("INSERT INTO revisions VALUES"); - } + /* + * Process revision table + */ + if (diff.isFullRevision()) { + this.lastFullRevID = diff.getRevisionID(); + } - if (revisionsEncoding.size() > 0) { - revisionsEncoding.append(","); - } - revisionsEncoding.append(tempData); - revisionsEncoding.addBinaryData(tempBinaryData); + // prepare values that might be null + // because we don't want quotes if they are null + String comm = diff.getComment(); + String comment = comm == null ? null : "'" + comm + "'"; + + Integer cId = diff.getContributorId(); + String contributorId = cId == null ? null : cId.toString(); + + // save the query and binary data temporary + tempData = "(null, " + this.lastFullRevID + "," + diff.getRevisionCounter() + "," + + diff.getRevisionID() + "," + articleId + "," + diff.getTimeStamp().getTime() + + ",?," + comment + "," + (diff.isMinor() ? "1" : "0") + "," + contributorId + + "," + (diff.getContributorIsRegistered() ? "1" : "0") + ")"; + tempBinaryData = binaryDiff(task, diff); + + // if the limit would be reached start a new encoding + if ((revisionsEncoding.byteSize() + tempBinaryData.length + + tempData.length() >= LIMIT_SQL_STATEMENT_SIZE) && (i != 0)) { + revisionsEncoding.append(";"); + list.add(revisionsEncoding); + + revisionsEncoding = new SQLEncoding(); + revisionsEncoding.append("INSERT INTO revisions VALUES"); + } - } + if (revisionsEncoding.size() > 0) { + revisionsEncoding.append(","); + } + revisionsEncoding.append(tempData); + revisionsEncoding.addBinaryData(tempBinaryData); - // Add the pending encoding - if (revisionsEncoding.size() > 0) { - revisionsEncoding.append(";"); - list.add(revisionsEncoding); - } + } + // Add the pending encoding + if (revisionsEncoding.size() > 0) { + revisionsEncoding.append(";"); + list.add(revisionsEncoding); + } - // Transform the list into an array - SQLEncoding[] queries = new SQLEncoding[list.size()]; - return list.toArray(queries); - } - - /** - * Encodes the diff. 
- * - * @param task Reference to the DiffTask - * @param diff Diff to encode - * @return Base 64 encoded Diff - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws UnsupportedEncodingException if the character encoding is unsupported - * @throws DecodingException if the decoding failed - * @throws EncodingException if the encoding failed - * @throws SQLConsumerException if an error occurred while encoding the diff - */ - protected String encodeDiff(final Task task, final Diff diff) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException { - - RevisionCodecData codecData = diff.getCodecData(); - String encoding = encoder.encodeDiff(codecData, diff); - - if (VERIFICATION_ENCODING) { - RevisionDecoder decoder = new RevisionDecoder(WIKIPEDIA_ENCODING); - decoder.setInput(encoding); - Diff decDiff = decoder.decode(); - - verify(task, decDiff, diff); + // Transform the list into an array + SQLEncoding[] queries = new SQLEncoding[list.size()]; + return list.toArray(queries); } - return encoding; - } + /** + * Encodes the diff. + * + * @param task + * Reference to the DiffTask + * @param diff + * Diff to encode + * @return Base 64 encoded Diff + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + * @throws DecodingException + * if the decoding failed + * @throws EncodingException + * if the encoding failed + * @throws SQLConsumerException + * if an error occurred while encoding the diff + */ + protected String encodeDiff(final Task task, final Diff diff) + throws ConfigurationException, UnsupportedEncodingException, DecodingException, + EncodingException, SQLConsumerException + { + + RevisionCodecData codecData = diff.getCodecData(); + String encoding = encoder.encodeDiff(codecData, diff); + + if (VERIFICATION_ENCODING) { + RevisionDecoder decoder = new RevisionDecoder(WIKIPEDIA_ENCODING); + decoder.setInput(encoding); + Diff decDiff = decoder.decode(); + + verify(task, decDiff, diff); + } - @Override - public SQLEncoding[] encodeTask(final Task task) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException { + return encoding; + } - // this.task = task; - if (task.getTaskType() == TaskTypes.TASK_FULL || task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST) { + @Override + public SQLEncoding[] encodeTask(final Task task) + throws ConfigurationException, UnsupportedEncodingException, DecodingException, + EncodingException, SQLConsumerException + { - this.lastFullRevID = -1; - } + // this.task = task; + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_FIRST) { - int articleId = task.getHeader().getArticleId(); - Diff diff; + this.lastFullRevID = -1; + } - ArrayList list = new ArrayList<>(); + int articleId = task.getHeader().getArticleId(); + Diff diff; - SQLEncoding revisionEncoding = new SQLEncoding(); - revisionEncoding.append("INSERT INTO revisions VALUES"); + ArrayList list = new ArrayList<>(); - String tempData; + SQLEncoding revisionEncoding = new SQLEncoding(); + revisionEncoding.append("INSERT INTO revisions VALUES"); - int size = task.size(); - for (int i = 0; i < size; i++) { + String tempData; - diff = task.get(i); + int size = task.size(); + for (int i = 0; i < size; i++) { - /* - * Process revision table - */ - if 
(diff.isFullRevision()) { - this.lastFullRevID = diff.getRevisionID(); - } + diff = task.get(i); - //prepare values that might be null - //because we don't want quotes if they are null - String comm = diff.getComment(); - String comment = comm == null ? null : "'" + comm + "'"; + /* + * Process revision table + */ + if (diff.isFullRevision()) { + this.lastFullRevID = diff.getRevisionID(); + } - Integer cId = diff.getContributorId(); - String contributorId = cId == null ? null : cId.toString(); + // prepare values that might be null + // because we don't want quotes if they are null + String comm = diff.getComment(); + String comment = comm == null ? null : "'" + comm + "'"; + + Integer cId = diff.getContributorId(); + String contributorId = cId == null ? null : cId.toString(); + + // save the query temporary + tempData = "(null," + this.lastFullRevID + "," + diff.getRevisionCounter() + "," + + diff.getRevisionID() + "," + articleId + "," + diff.getTimeStamp().getTime() + + ",'" + encodeDiff(task, diff) + "'," + comment + "," + + (diff.isMinor() ? "1" : "0") + ",'" + diff.getContributorName() + "'," + + contributorId + "," + (diff.getContributorIsRegistered() ? "1" : "0") + ")"; + + // if the limit would be reached start a new encoding + if ((revisionEncoding.byteSize() + tempData.length() >= LIMIT_SQL_STATEMENT_SIZE) + && (i != 0)) { + revisionEncoding.append(";"); + list.add(revisionEncoding); + + revisionEncoding = new SQLEncoding(); + revisionEncoding.append("INSERT INTO revisions VALUES"); + } - // save the query temporary - tempData = "(null," + this.lastFullRevID + "," - + diff.getRevisionCounter() + "," + diff.getRevisionID() - + "," + articleId + "," + diff.getTimeStamp().getTime() - + ",'" + encodeDiff(task, diff) + "'," + comment + "," + (diff.isMinor() ? "1" : "0") + ",'" + diff.getContributorName() + "'," + contributorId + "," + (diff.getContributorIsRegistered() ? "1" : "0") + ")"; + if (revisionEncoding.byteSize() > 30) { + revisionEncoding.append(","); + } + revisionEncoding.append(tempData); - // if the limit would be reached start a new encoding - if ((revisionEncoding.byteSize() + tempData.length() >= LIMIT_SQL_STATEMENT_SIZE) && (i != 0)) { - revisionEncoding.append(";"); - list.add(revisionEncoding); + } - revisionEncoding = new SQLEncoding(); - revisionEncoding.append("INSERT INTO revisions VALUES"); - } + // Add the pending encodings + if (revisionEncoding.byteSize() > 30) { + revisionEncoding.append(";"); + list.add(revisionEncoding); + } - if (revisionEncoding.byteSize() > 30) { - revisionEncoding.append(","); - } - revisionEncoding.append(tempData); + // Transform the list into an array + SQLEncoding[] queries = new SQLEncoding[list.size()]; + return list.toArray(queries); + } + @Override + public String[] getBinaryTable() + { + return new String[] { binaryTableRevision }; } - // Add the pending encodings - if (revisionEncoding.byteSize() > 30) { - revisionEncoding.append(";"); - list.add(revisionEncoding); + @Override + public String[] getTable() + { + return new String[] { tableRevision }; } - // Transform the list into an array - SQLEncoding[] queries = new SQLEncoding[list.size()]; - return list.toArray(queries); - } + /** + * Verifies that the decoded diff is identical to the original diff. 
+ * + * @param task + * DiffTask + * @param decodedDiff + * diff created from encoding the decoded diff information + * @param originalDiff + * original diff + * @throws SQLConsumerException + * if an error occurs + */ + private void verify(final Task task, final Diff decodedDiff, final Diff originalDiff) + throws SQLConsumerException + { - @Override - public String[] getBinaryTable() { - return new String[]{binaryTableRevision}; - } + String orig = originalDiff.toString(); + String deco = decodedDiff.toString(); - @Override - public String[] getTable() { - return new String[]{tableRevision}; - } + boolean notEqual = !orig.equals(deco); - /** - * Verifies that the decoded diff is identical to the original diff. - * - * @param task DiffTask - * @param decodedDiff diff created from encoding the decoded diff information - * @param originalDiff original diff - * @throws SQLConsumerException if an error occurs - */ - private void verify(final Task task, final Diff decodedDiff, final Diff originalDiff) - throws SQLConsumerException { + if (notEqual && MODE_SURROGATES == SurrogateModes.REPLACE) { - String orig = originalDiff.toString(); - String deco = decodedDiff.toString(); + char[] origDiff = orig.toCharArray(); - boolean notEqual = !orig.equals(deco); + // TODO: test + if (Surrogates.scan(origDiff)) { - if (notEqual && MODE_SURROGATES == SurrogateModes.REPLACE) { + String repDiff = new String(Surrogates.replace(origDiff)); + notEqual = !repDiff.equals(deco); + } + } - char[] origDiff = orig.toCharArray(); + if (notEqual) { - // TODO: test - if (Surrogates.scan(origDiff)) { + if (MODE_DEBUG_OUTPUT_ACTIVATED) { - String repDiff = new String(Surrogates.replace(origDiff)); - notEqual = !repDiff.equals(deco); - } - } + try { - if (notEqual) { + WikipediaXMLWriter writer = new WikipediaXMLWriter(LOGGING_PATH_DIFFTOOL + + LOGGING_PATH_DEBUG + task.getHeader().getArticleName() + ".dbg"); - if (MODE_DEBUG_OUTPUT_ACTIVATED) { + switch (task.getTaskType()) { + case TASK_FULL: + case TASK_PARTIAL_FIRST: + writer.writeDiff(task); + break; - try { + case TASK_PARTIAL: + case TASK_PARTIAL_LAST: { - WikipediaXMLWriter writer = new WikipediaXMLWriter(LOGGING_PATH_DIFFTOOL + LOGGING_PATH_DEBUG - + task.getHeader().getArticleName() + ".dbg"); + int revCount = originalDiff.getRevisionCounter(); + Diff d; + boolean fullRev = false; - switch (task.getTaskType()) { - case TASK_FULL: - case TASK_PARTIAL_FIRST: - writer.writeDiff(task); - break; + for (int diffCount = 0; !fullRev + && diffCount < originalDiff.size(); diffCount++) { - case TASK_PARTIAL: - case TASK_PARTIAL_LAST: { + d = task.get(diffCount); + if (d.getRevisionCounter() <= revCount && d.isFullRevision()) { + fullRev = true; + writer.writeDiff(task, diffCount); + } + } - int revCount = originalDiff.getRevisionCounter(); - Diff d; - boolean fullRev = false; + if (!fullRev) { + writer.writeDiffFile(task); + } - for (int diffCount = 0; !fullRev - && diffCount < originalDiff.size(); diffCount++) { + } + break; + default: + throw new IOException("Unknown TaskType"); + // TODO: Debug output + } - d = task.get(diffCount); - if (d.getRevisionCounter() <= revCount - && d.isFullRevision()) { - fullRev = true; - writer.writeDiff(task, diffCount); + writer.close(); + } + catch (IOException e) { + ConsumerLogMessages.logException(logger, e); } - } - - if (!fullRev) { - writer.writeDiffFile(task); - } - } - break; - default: - throw new IOException("Unknown TaskType"); - // TODO: Debug output - } - - writer.close(); - } catch (IOException e) { - 
ConsumerLogMessages.logException(logger, e); - } - } - throw ErrorFactory.createSQLConsumerException(ErrorKeys.DIFFTOOL_SQLCONSUMER_ENCODING_VERIFICATION_FAILED, - "Redecoding of " + task.getHeader().getArticleName() - + " failed at revision " + originalDiff.getRevisionCounter() + "."); + throw ErrorFactory.createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_ENCODING_VERIFICATION_FAILED, + "Redecoding of " + task.getHeader().getArticleName() + " failed at revision " + + originalDiff.getRevisionCounter() + "."); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoderInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoderInterface.java index 1ae55488..fc84c911 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoderInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoderInterface.java @@ -27,65 +27,74 @@ import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.Diff; /** - * The SQLEncoderInterface provides the link to the SQLEncoder who will define - * the formatting of the output. + * The SQLEncoderInterface provides the link to the SQLEncoder who will define the formatting of the + * output. */ -public interface SQLEncoderInterface { +public interface SQLEncoderInterface +{ - /** - * Returns the tables for textual output. - *
- * Each Array entry will contain a single sql command. - * - * @return sql command to create the tables - */ - String[] getTable(); + /** + * Returns the tables for textual output. + *
+ * Each Array entry will contain a single sql command. + * + * @return sql command to create the tables + */ + String[] getTable(); - /** - * Returns the tables for binary output. - *
- * Each Array entry will contain a single sql command. - * - * @return sql command to create the tables - */ - String[] getBinaryTable(); + /** + * Returns the tables for binary output. + *
+ * Each Array entry will contain a single sql command. + * + * @return sql command to create the tables + */ + String[] getBinaryTable(); - /** - * Returns the binary encoding of the given DiffTask. - *
- * Each Array entry will contain a single sql command. - * - * @param task DiffTask - * @return binary encoding of the task. - * @throws ConfigurationException if problems occurred while initializing the components - * @throws UnsupportedEncodingException if the CharacterSet defined in the configuration is not - * supported by JAVA. - * @throws DecodingException if the decoding process fails (during the verification - * process) - * @throws EncodingException if the encoding process fails - * @throws SQLConsumerException if the verification process fails - */ - SQLEncoding[] binaryTask(final Task task) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException; + /** + * Returns the binary encoding of the given DiffTask. + *
+ * Each Array entry will contain a single sql command. + * + * @param task + * DiffTask + * @return binary encoding of the task. + * @throws ConfigurationException + * if problems occurred while initializing the components + * @throws UnsupportedEncodingException + * if the CharacterSet defined in the configuration is not supported by JAVA. + * @throws DecodingException + * if the decoding process fails (during the verification process) + * @throws EncodingException + * if the encoding process fails + * @throws SQLConsumerException + * if the verification process fails + */ + SQLEncoding[] binaryTask(final Task task) + throws ConfigurationException, UnsupportedEncodingException, DecodingException, + EncodingException, SQLConsumerException; - /** - * Returns the textual encoding of the given DiffTask. - *
- * Each Array entry will contain a single sql command. - * - * @param task DiffTask - * @return binary encoding of the task. - * @throws ConfigurationException if problems occurred while initializing the components - * @throws UnsupportedEncodingException if the CharacterSet defined in the configuration is not - * supported by JAVA. - * @throws DecodingException if the decoding process fails (during the verification - * process) - * @throws EncodingException if the encoding process fails - * @throws SQLConsumerException if the verification process fails - */ - SQLEncoding[] encodeTask(final Task task) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException; + /** + * Returns the textual encoding of the given DiffTask. + *
+ * Each Array entry will contain a single sql command. + * + * @param task + * DiffTask + * @return binary encoding of the task. + * @throws ConfigurationException + * if problems occurred while initializing the components + * @throws UnsupportedEncodingException + * if the CharacterSet defined in the configuration is not supported by JAVA. + * @throws DecodingException + * if the decoding process fails (during the verification process) + * @throws EncodingException + * if the encoding process fails + * @throws SQLConsumerException + * if the verification process fails + */ + SQLEncoding[] encodeTask(final Task task) + throws ConfigurationException, UnsupportedEncodingException, DecodingException, + EncodingException, SQLConsumerException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoding.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoding.java index 757b84ac..2367a298 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoding.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/SQLEncoding.java @@ -23,109 +23,122 @@ /** * This class is used to store the sql statements. */ -public class SQLEncoding { - - /** - * UNCOMPRESSED Query - */ - private final StringBuilder query; - - /** - * List of binary data - */ - private final List list; - - /** - * Size of binary data - */ - private int binaryDataSize; - - /** - * (Constructor) Creates a new SQLEncoding object. - */ - public SQLEncoding() { - this.query = new StringBuilder(); - this.list = new ArrayList<>(); - this.binaryDataSize = 0; - } - - /** - * Appends textual content to the query. - * - * @param seq textual content - */ - public void append(final CharSequence seq) { - this.query.append(seq); - } - - /** - * Appends binary data to storage. - * - * @param bData binary data - */ - public void addBinaryData(final byte[] bData) { - this.binaryDataSize += bData.length; - this.list.add(bData); - } - - /** - * Returns the size of the query. - * - * @return size of the query - */ - public int byteSize() { - return this.binaryDataSize + this.query.length(); - } - - /** - * Returns the number of contained binary data parts. - * - * @return number of binary data parts - */ - public int size() { - return this.list.size(); - } - - /** - * Returns the specified binary data. - * - * @param index index of the binary data - * @return binary data - */ - public byte[] getBinaryData(final int index) { - return list.get(index); - } - - /** - * Returns the query. - * - * @return query - */ - public String getQuery() { - return query.toString(); - } - - /** - * Returns the string representation of this object. - * - * @return string representation - */ - public String toString() { - - try { - StringBuilder buffer = new StringBuilder(); - buffer.append(query).append("\r\n\r\n"); - - for (int i = 0; i < list.size(); i++) { - buffer.append(i).append("\t").append(list.get(i).length).append("\r\n"); - } - - return buffer.toString(); - - } catch (Exception e) { +public class SQLEncoding +{ + + /** + * UNCOMPRESSED Query + */ + private final StringBuilder query; + + /** + * List of binary data + */ + private final List list; + + /** + * Size of binary data + */ + private int binaryDataSize; + + /** + * (Constructor) Creates a new SQLEncoding object. 
+ */ + public SQLEncoding() + { + this.query = new StringBuilder(); + this.list = new ArrayList<>(); + this.binaryDataSize = 0; + } + + /** + * Appends textual content to the query. + * + * @param seq + * textual content + */ + public void append(final CharSequence seq) + { + this.query.append(seq); + } + + /** + * Appends binary data to storage. + * + * @param bData + * binary data + */ + public void addBinaryData(final byte[] bData) + { + this.binaryDataSize += bData.length; + this.list.add(bData); + } + + /** + * Returns the size of the query. + * + * @return size of the query + */ + public int byteSize() + { + return this.binaryDataSize + this.query.length(); + } + + /** + * Returns the number of contained binary data parts. + * + * @return number of binary data parts + */ + public int size() + { + return this.list.size(); + } + /** + * Returns the specified binary data. + * + * @param index + * index of the binary data + * @return binary data + */ + public byte[] getBinaryData(final int index) + { + return list.get(index); } - return "<" + list.size() + ">\r\n" + query; - } + /** + * Returns the query. + * + * @return query + */ + public String getQuery() + { + return query.toString(); + } + + /** + * Returns the string representation of this object. + * + * @return string representation + */ + public String toString() + { + + try { + StringBuilder buffer = new StringBuilder(); + buffer.append(query).append("\r\n\r\n"); + + for (int i = 0; i < list.size(); i++) { + buffer.append(i).append("\t").append(list.get(i).length).append("\r\n"); + } + + return buffer.toString(); + + } + catch (Exception e) { + + } + + return "<" + list.size() + ">\r\n" + query; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/TimedSQLEncoder.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/TimedSQLEncoder.java index 8704a0b8..dfaba63e 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/TimedSQLEncoder.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/codec/TimedSQLEncoder.java @@ -30,102 +30,120 @@ /** * This class encodes the diffs while collecting statistical information. */ -public class TimedSQLEncoder extends SQLEncoder { - - /** - * Temporary variable - used for storing the encoded size - */ - private long encodedSize; - - /** - * Temporary variable - used for storing the encoded sql size - */ - private long encodedSQLSize; - - /** - * (Constructor) Creates a new TimedSQLEncoder object. - * - * @param logger Reference to the logger - * @throws ConfigurationException if an error occurred while accessing the configuration - */ - public TimedSQLEncoder(final Logger logger) throws ConfigurationException { - super(logger); - } - - /*--------------------------------------------------------------------------*/ - - /** - * Initializes the encoding information. - */ - public void init() { - this.encodedSize = 0; - this.encodedSQLSize = 0; - } - - /** - * Returns the encoded size. - * - * @return encoded size - */ - public long getEncodedSize() { - return encodedSize; - } - - /** - * Returns the encoded sql size. - * - * @return encoded sql size - */ - public long getEncodedSQLSize() { - return encodedSQLSize; - } - - /*--------------------------------------------------------------------------*/ - - /** - * Encodes the diff. 
- * - * @param task Reference to the DiffTask - * @param diff Diff to encode - * @return Base 64 encoded Diff - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws UnsupportedEncodingException if the character encoding is unsupported - * @throws DecodingException if the decoding failed - * @throws EncodingException if the encoding failed - * @throws SQLConsumerException if an error occurred while encoding the diff - */ - protected String encodeDiff(final Task task, final Diff diff) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException { - - String encoding = super.encodeDiff(task, diff); - - this.encodedSize += encoding.length(); - - return encoding; - } - - @Override - protected byte[] binaryDiff(final Task task, final Diff diff) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException { - - byte[] encoding = super.binaryDiff(task, diff); - this.encodedSize += encoding.length; - return encoding; - } - - @Override - public SQLEncoding[] encodeTask(final Task task) - throws ConfigurationException, UnsupportedEncodingException, - DecodingException, EncodingException, SQLConsumerException { - - SQLEncoding[] encoding = super.encodeTask(task); - - for (SQLEncoding sql : encoding) { - this.encodedSQLSize += sql.byteSize(); +public class TimedSQLEncoder + extends SQLEncoder +{ + + /** + * Temporary variable - used for storing the encoded size + */ + private long encodedSize; + + /** + * Temporary variable - used for storing the encoded sql size + */ + private long encodedSQLSize; + + /** + * (Constructor) Creates a new TimedSQLEncoder object. + * + * @param logger + * Reference to the logger + * @throws ConfigurationException + * if an error occurred while accessing the configuration + */ + public TimedSQLEncoder(final Logger logger) throws ConfigurationException + { + super(logger); } - return encoding; - } + /*--------------------------------------------------------------------------*/ + + /** + * Initializes the encoding information. + */ + public void init() + { + this.encodedSize = 0; + this.encodedSQLSize = 0; + } + + /** + * Returns the encoded size. + * + * @return encoded size + */ + public long getEncodedSize() + { + return encodedSize; + } + + /** + * Returns the encoded sql size. + * + * @return encoded sql size + */ + public long getEncodedSQLSize() + { + return encodedSQLSize; + } + + /*--------------------------------------------------------------------------*/ + + /** + * Encodes the diff. 
+ * + * @param task + * Reference to the DiffTask + * @param diff + * Diff to encode + * @return Base 64 encoded Diff + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + * @throws DecodingException + * if the decoding failed + * @throws EncodingException + * if the encoding failed + * @throws SQLConsumerException + * if an error occurred while encoding the diff + */ + protected String encodeDiff(final Task task, final Diff diff) + throws ConfigurationException, UnsupportedEncodingException, DecodingException, + EncodingException, SQLConsumerException + { + + String encoding = super.encodeDiff(task, diff); + + this.encodedSize += encoding.length(); + + return encoding; + } + + @Override + protected byte[] binaryDiff(final Task task, final Diff diff) + throws ConfigurationException, UnsupportedEncodingException, DecodingException, + EncodingException, SQLConsumerException + { + + byte[] encoding = super.binaryDiff(task, diff); + this.encodedSize += encoding.length; + return encoding; + } + + @Override + public SQLEncoding[] encodeTask(final Task task) + throws ConfigurationException, UnsupportedEncodingException, DecodingException, + EncodingException, SQLConsumerException + { + + SQLEncoding[] encoding = super.encodeTask(task); + + for (SQLEncoding sql : encoding) { + this.encodedSQLSize += sql.byteSize(); + } + + return encoding; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileArchiveWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileArchiveWriter.java index 34155fbd..f511088a 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileArchiveWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileArchiveWriter.java @@ -40,168 +40,190 @@ /** * This class writes the output to an archive. */ -public class DataFileArchiveWriter implements WriterInterface { - - /** - * File counter - */ - private int counter; - - /** - * Configuration parameter - maximum size of an output archive - */ - private final long LIMIT_SQL_ARCHIVE_SIZE; - - /** - * Configuration parameter - Flag, that indicates whether the statistical - * output is enabled or not - */ - private final boolean MODE_STATISTICAL_OUTPUT; - - /** - * Reference to the output stream - */ - private OutputStream output; - - /** - * Name of the related sql consumer - used as prefix for the output - * filenames - */ - private String outputName; - - /** - * Configuration parameter - output path - */ - private final String PATH_OUTPUT_SQL_FILES; - - /** - * Reference to the output archive - */ - private File dataArchive; - - /** - * Reference to the SQLEncoder - */ - protected DataFileEncoder dataFileEncoder; - - /** - * Creates a new SQLArchiveWriter object. 
- * - * @throws ConfigurationException if an error occurred while accessing the configuration - */ - private DataFileArchiveWriter() throws ConfigurationException { - - // Load config parameters - ConfigurationManager config = ConfigurationManager.getInstance(); - - LIMIT_SQL_ARCHIVE_SIZE = (Long) config.getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); - PATH_OUTPUT_SQL_FILES = (String) config.getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); - MODE_STATISTICAL_OUTPUT = (Boolean) config.getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - - // Create sql file - counter = 0; - } - - - /** - * Creates a new SQLArchiveWriter object. - * - * @param outputName Name of the sql consumer - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - public DataFileArchiveWriter(final String outputName) - throws IOException, ConfigurationException, LoggingException { - - this(); - this.outputName = outputName; - - init(); - writeHeader(); - } - - /** - * This method will close the connection to the output. - * - * @throws IOException if problems occurred while closing the file or process. - */ - @Override - public void close() throws IOException { - this.output.close(); - this.output = null; - } - - /** - * Creates the sql encoder. - * - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - protected void init() throws ConfigurationException, LoggingException { - - this.dataFileEncoder = new DataFileEncoder(); - } - - /** - * This method will process the given DiffTask and send it to the specified - * output. - * - * @param task DiffTask - * @throws ConfigurationException if problems occurred while initializing the components - * @throws IOException if problems occurred while writing the output (to file or - * archive) - * @throws SQLConsumerException if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) throws ConfigurationException, IOException, SQLConsumerException { - - // this.startTime = System.currentTimeMillis(); - try { - List data = dataFileEncoder.encodeTask(task); - - for (String d : data) { - this.output.write((d + ";").getBytes()); - this.output.flush(); - } +public class DataFileArchiveWriter + implements WriterInterface +{ + + /** + * File counter + */ + private int counter; + + /** + * Configuration parameter - maximum size of an output archive + */ + private final long LIMIT_SQL_ARCHIVE_SIZE; + + /** + * Configuration parameter - Flag, that indicates whether the statistical output is enabled or + * not + */ + private final boolean MODE_STATISTICAL_OUTPUT; + + /** + * Reference to the output stream + */ + private OutputStream output; + + /** + * Name of the related sql consumer - used as prefix for the output filenames + */ + private String outputName; + + /** + * Configuration parameter - output path + */ + private final String PATH_OUTPUT_SQL_FILES; + + /** + * Reference to the output archive + */ + private File dataArchive; + + /** + * Reference to the SQLEncoder + */ + protected DataFileEncoder dataFileEncoder; + + /** + * Creates a new SQLArchiveWriter object. 
+ * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + */ + private DataFileArchiveWriter() throws ConfigurationException + { + + // Load config parameters + ConfigurationManager config = ConfigurationManager.getInstance(); + + LIMIT_SQL_ARCHIVE_SIZE = (Long) config + .getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); + PATH_OUTPUT_SQL_FILES = (String) config + .getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); + MODE_STATISTICAL_OUTPUT = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + + // Create sql file + counter = 0; + } - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + /** + * Creates a new SQLArchiveWriter object. + * + * @param outputName + * Name of the sql consumer + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + public DataFileArchiveWriter(final String outputName) + throws IOException, ConfigurationException, LoggingException + { + + this(); + this.outputName = outputName; + + init(); + writeHeader(); + } - if (this.dataArchive.length() > LIMIT_SQL_ARCHIVE_SIZE) { - writeHeader(); - } + /** + * This method will close the connection to the output. + * + * @throws IOException + * if problems occurred while closing the file or process. + */ + @Override + public void close() throws IOException + { + this.output.close(); + this.output = null; + } - if (!MODE_STATISTICAL_OUTPUT) { - System.out.println(task); - } + /** + * Creates the sql encoder. + * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + protected void init() throws ConfigurationException, LoggingException + { + + this.dataFileEncoder = new DataFileEncoder(); + } - } else { - System.out.println(task); - } + /** + * This method will process the given DiffTask and send it to the specified output. + * + * @param task + * DiffTask + * @throws ConfigurationException + * if problems occurred while initializing the components + * @throws IOException + * if problems occurred while writing the output (to file or archive) + * @throws SQLConsumerException + * if problems occurred while writing the output (to the sql producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException + { + + // this.startTime = System.currentTimeMillis(); + try { + List data = dataFileEncoder.encodeTask(task); + + for (String d : data) { + this.output.write((d + ";").getBytes()); + this.output.flush(); + } + + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + + if (this.dataArchive.length() > LIMIT_SQL_ARCHIVE_SIZE) { + writeHeader(); + } + + if (!MODE_STATISTICAL_OUTPUT) { + System.out.println(task); + } + + } + else { + System.out.println(task); + } - } catch (EncodingException | DecodingException e) { - throw ErrorFactory.createSQLConsumerException( - ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); - } - } - - /** - * Creates a new output file and writes the header information. 
- * - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws IOException if an error occurred while writing a file - */ - protected void writeHeader() throws ConfigurationException, IOException { - - if (this.output != null) { - close(); + } + catch (EncodingException | DecodingException e) { + throw ErrorFactory.createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); + } } - this.counter++; - String filePath = PATH_OUTPUT_SQL_FILES + this.outputName + "_" + counter; - this.output = OutputFactory.getOutputStream(filePath); - this.dataArchive = new File(filePath); - this.output.flush(); - } + /** + * Creates a new output file and writes the header information. + * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws IOException + * if an error occurred while writing a file + */ + protected void writeHeader() throws ConfigurationException, IOException + { + + if (this.output != null) { + close(); + } + this.counter++; + + String filePath = PATH_OUTPUT_SQL_FILES + this.outputName + "_" + counter; + this.output = OutputFactory.getOutputStream(filePath); + this.dataArchive = new File(filePath); + this.output.flush(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileWriter.java index 8402c0ed..0b43ce14 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/DataFileWriter.java @@ -44,170 +44,193 @@ /** * This class writes the output to a data file (not an sql file) */ -public class DataFileWriter implements WriterInterface { - - /** - * File counter - */ - private int fileCounter; - - /** - * Configuration parameter - maximum size of an output file - */ - private final long LIMIT_SQL_FILE_SIZE; - - /** - * Configuration parameter - Flag, that indicates whether the statistical - * output is enabled or not - */ - private final boolean MODE_STATISTICAL_OUTPUT; - - /** - * Name of the related sql consumer - used as prefix for the output - * filenames - */ - private String outputName; - - /** - * Configuration parameter - output path - */ - private final String PATH_OUTPUT_DATA_FILES; - - /** - * Reference to the DataFileEncoder - */ - protected DataFileEncoder dataFileEncoder; - - /** - * Reference to the output file - */ - private File dataFile; - - /** - * Reference to the file writer - */ - private Writer writer; - - private final String WIKIPEDIA_ENCODING; - - /** - * Creates a new SQLFileWriter object. 
- * - * @throws ConfigurationException if an error occurred while accessing the configuration - */ - private DataFileWriter() throws ConfigurationException { - - // Load config parameters - ConfigurationManager config = ConfigurationManager.getInstance(); - - LIMIT_SQL_FILE_SIZE = (Long) config.getConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE); - PATH_OUTPUT_DATA_FILES = (String) config.getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); - MODE_STATISTICAL_OUTPUT = (Boolean) config.getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - WIKIPEDIA_ENCODING = (String) config.getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - - // Create sql file - fileCounter = 0; - } - - - /** - * Creates a new SQLFileWriter object. - * - * @param outputName Name of the sql consumer - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - public DataFileWriter(final String outputName) - throws IOException, ConfigurationException, LoggingException { - - this(); - this.outputName = outputName; - - init(); - writeHeader(); - } - - /** - * This method will close the connection to the output. - * - * @throws IOException if problems occurred while closing the file or process. - */ - @Override - public void close() throws IOException { - this.writer.close(); - } - - /** - * Creates the sql encoder. - * - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - protected void init() throws ConfigurationException, LoggingException { - - this.dataFileEncoder = new DataFileEncoder(); - } - - /** - * This method will process the given DiffTask and send it to the specified - * output. - * - * @param task DiffTask - * @throws ConfigurationException if problems occurred while initializing the components - * @throws IOException if problems occurred while writing the output (to file or - * archive) - * @throws SQLConsumerException if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) throws ConfigurationException, IOException, SQLConsumerException { - - try { - List data = dataFileEncoder.encodeTask(task); - - for (String d : data) { - this.writer.write(d + ";"); - this.writer.flush(); - } +public class DataFileWriter + implements WriterInterface +{ + + /** + * File counter + */ + private int fileCounter; + + /** + * Configuration parameter - maximum size of an output file + */ + private final long LIMIT_SQL_FILE_SIZE; + + /** + * Configuration parameter - Flag, that indicates whether the statistical output is enabled or + * not + */ + private final boolean MODE_STATISTICAL_OUTPUT; + + /** + * Name of the related sql consumer - used as prefix for the output filenames + */ + private String outputName; + + /** + * Configuration parameter - output path + */ + private final String PATH_OUTPUT_DATA_FILES; + + /** + * Reference to the DataFileEncoder + */ + protected DataFileEncoder dataFileEncoder; + + /** + * Reference to the output file + */ + private File dataFile; + + /** + * Reference to the file writer + */ + private Writer writer; + + private final String WIKIPEDIA_ENCODING; + + /** + * Creates a new SQLFileWriter object. 
+ * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + */ + private DataFileWriter() throws ConfigurationException + { + + // Load config parameters + ConfigurationManager config = ConfigurationManager.getInstance(); + + LIMIT_SQL_FILE_SIZE = (Long) config + .getConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE); + PATH_OUTPUT_DATA_FILES = (String) config + .getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); + MODE_STATISTICAL_OUTPUT = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + + // Create sql file + fileCounter = 0; + } - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + /** + * Creates a new SQLFileWriter object. + * + * @param outputName + * Name of the sql consumer + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + public DataFileWriter(final String outputName) + throws IOException, ConfigurationException, LoggingException + { + + this(); + this.outputName = outputName; + + init(); + writeHeader(); + } - if (this.dataFile.length() > LIMIT_SQL_FILE_SIZE) { - writeHeader(); - } + /** + * This method will close the connection to the output. + * + * @throws IOException + * if problems occurred while closing the file or process. + */ + @Override + public void close() throws IOException + { + this.writer.close(); + } - if (!MODE_STATISTICAL_OUTPUT) { - System.out.println(task); - } + /** + * Creates the sql encoder. + * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + protected void init() throws ConfigurationException, LoggingException + { + + this.dataFileEncoder = new DataFileEncoder(); + } - } else { - System.out.println(task); - } + /** + * This method will process the given DiffTask and send it to the specified output. + * + * @param task + * DiffTask + * @throws ConfigurationException + * if problems occurred while initializing the components + * @throws IOException + * if problems occurred while writing the output (to file or archive) + * @throws SQLConsumerException + * if problems occurred while writing the output (to the sql producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException + { + + try { + List data = dataFileEncoder.encodeTask(task); + + for (String d : data) { + this.writer.write(d + ";"); + this.writer.flush(); + } + + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + + if (this.dataFile.length() > LIMIT_SQL_FILE_SIZE) { + writeHeader(); + } + + if (!MODE_STATISTICAL_OUTPUT) { + System.out.println(task); + } + + } + else { + System.out.println(task); + } - } catch (DecodingException | EncodingException e) { + } + catch (DecodingException | EncodingException e) { - throw ErrorFactory.createSQLConsumerException(ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); + throw ErrorFactory.createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); + } } - } - /** - * Creates a new output file and writes the header information. 
- * - * @throws IOException if an error occurred while writing a file - */ - protected void writeHeader() throws IOException { + /** + * Creates a new output file and writes the header information. + * + * @throws IOException + * if an error occurred while writing a file + */ + protected void writeHeader() throws IOException + { + + if (writer != null) { + writer.close(); + } - if (writer != null) { - writer.close(); + this.fileCounter++; + String filePath = PATH_OUTPUT_DATA_FILES + this.outputName + "_" + fileCounter + ".csv"; + this.dataFile = new File(filePath); + this.writer = new BufferedWriter(new OutputStreamWriter( + new BufferedOutputStream(new FileOutputStream(filePath)), WIKIPEDIA_ENCODING)); + this.writer.flush(); } - - this.fileCounter++; - String filePath = PATH_OUTPUT_DATA_FILES + this.outputName + "_" + fileCounter + ".csv"; - this.dataFile = new File(filePath); - this.writer = new BufferedWriter(new OutputStreamWriter(new BufferedOutputStream( - new FileOutputStream(filePath)), WIKIPEDIA_ENCODING)); - this.writer.flush(); - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/OutputFactory.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/OutputFactory.java index bf742030..58edd492 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/OutputFactory.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/OutputFactory.java @@ -28,71 +28,81 @@ import org.dkpro.jwpl.revisionmachine.difftool.config.ConfigurationManager; import org.dkpro.jwpl.revisionmachine.difftool.data.OutputType; -public class OutputFactory { +public class OutputFactory +{ - private static String PATH_PROGRAM_7ZIP = null; - private static OutputType MODE_OUTPUT = null; - private static ConfigurationManager config = null; + private static String PATH_PROGRAM_7ZIP = null; + private static OutputType MODE_OUTPUT = null; + private static ConfigurationManager config = null; - static { - try { - config = ConfigurationManager.getInstance(); - MODE_OUTPUT = (OutputType) config.getConfigParameter(ConfigurationKeys.MODE_OUTPUT); - } catch (ConfigurationException e) { - e.printStackTrace(); - System.exit(-1); + static { + try { + config = ConfigurationManager.getInstance(); + MODE_OUTPUT = (OutputType) config.getConfigParameter(ConfigurationKeys.MODE_OUTPUT); + } + catch (ConfigurationException e) { + e.printStackTrace(); + System.exit(-1); + } } - } - private static OutputStream compressWith7Zip(final String archivePath) - throws ConfigurationException { + private static OutputStream compressWith7Zip(final String archivePath) + throws ConfigurationException + { - PATH_PROGRAM_7ZIP = (String) config.getConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP); + PATH_PROGRAM_7ZIP = (String) config.getConfigParameter(ConfigurationKeys.PATH_PROGRAM_7ZIP); - if (PATH_PROGRAM_7ZIP == null) { - throw ErrorFactory.createConfigurationException(ErrorKeys.CONFIGURATION_PARAMETER_UNDEFINED); - } + if (PATH_PROGRAM_7ZIP == null) { + throw ErrorFactory + .createConfigurationException(ErrorKeys.CONFIGURATION_PARAMETER_UNDEFINED); + } - try { - Runtime runtime = Runtime.getRuntime(); - Process p = runtime.exec(PATH_PROGRAM_7ZIP + " a -t7z -si " + archivePath); - return p.getOutputStream(); + try { + Runtime runtime = Runtime.getRuntime(); + Process p = runtime.exec(PATH_PROGRAM_7ZIP + " a -t7z -si 
" + archivePath); + return p.getOutputStream(); - } catch (Exception e) { - throw new RuntimeException(e); + } + catch (Exception e) { + throw new RuntimeException(e); + } } - } - private static OutputStream compressWithBZip2(final String archivePath) { + private static OutputStream compressWithBZip2(final String archivePath) + { - OutputStream output = null; - try { - output = new Bzip2Archiver().getCompressionStream(archivePath); - } catch (IOException e) { - e.printStackTrace(); + OutputStream output = null; + try { + output = new Bzip2Archiver().getCompressionStream(archivePath); + } + catch (IOException e) { + e.printStackTrace(); + } + return output; } - return output; - } - public static OutputStream getOutputStream(final String archivePath) - throws ConfigurationException { + public static OutputStream getOutputStream(final String archivePath) + throws ConfigurationException + { - switch (MODE_OUTPUT) { - case SEVENZIP: - if ((Boolean) config.getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT)) { - return compressWith7Zip(archivePath + ".csv.7z"); - } else { - return compressWith7Zip(archivePath + ".sql.7z"); - } - case BZIP2: - if ((Boolean) config.getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT)) { - return compressWithBZip2(archivePath + ".csv.bz2"); - } else { - return compressWithBZip2(archivePath + ".sql.bz2"); + switch (MODE_OUTPUT) { + case SEVENZIP: + if ((Boolean) config.getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT)) { + return compressWith7Zip(archivePath + ".csv.7z"); + } + else { + return compressWith7Zip(archivePath + ".sql.7z"); + } + case BZIP2: + if ((Boolean) config.getConfigParameter(ConfigurationKeys.MODE_DATAFILE_OUTPUT)) { + return compressWithBZip2(archivePath + ".csv.bz2"); + } + else { + return compressWithBZip2(archivePath + ".sql.bz2"); + } + default: + throw ErrorFactory.createConfigurationException( + ErrorKeys.DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE); } - default: - throw ErrorFactory - .createConfigurationException(ErrorKeys.DELTA_CONSUMERS_SQL_WRITER_OUTPUTFACTORY_ILLEGAL_OUTPUTMODE_VALUE); } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLArchiveWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLArchiveWriter.java index 30af5626..f9ce478a 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLArchiveWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLArchiveWriter.java @@ -43,200 +43,224 @@ /** * This class writes the output to an archive. 
*/ -public class SQLArchiveWriter implements WriterInterface { - - /** - * File counter - */ - private int counter; - - /** - * Configuration parameter - maximum size of an output archive - */ - private final long LIMIT_SQL_ARCHIVE_SIZE; - - /** - * Reference to the logger - */ - protected Logger logger; - - /** - * Configuration parameter - Flag, that indicates whether the statistical - * output is enabled or not - */ - private final boolean MODE_STATISTICAL_OUTPUT; - - /** - * Reference to the output stream - */ - private OutputStream output; - - /** - * Name of the related sql consumer - used as prefix for the output - * filenames - */ - private String outputName; - - /** - * Configuration parameter - output path - */ - private final String PATH_OUTPUT_SQL_FILES; - - /** - * Reference to the output archive - */ - private File sqlArchive; - - /** - * Reference to the SQLEncoder - */ - protected SQLEncoderInterface sqlEncoder; - - /** - * Configuration parameter - Charset name of the input data - */ - private final String WIKIPEDIA_ENCODING; - - /** - * Creates a new SQLArchiveWriter object. - * - * @throws ConfigurationException if an error occurred while accessing the configuration - */ - private SQLArchiveWriter() throws ConfigurationException { - - // Load config parameters - ConfigurationManager config = ConfigurationManager.getInstance(); - - LIMIT_SQL_ARCHIVE_SIZE = (Long) config.getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); - PATH_OUTPUT_SQL_FILES = (String) config.getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); - MODE_STATISTICAL_OUTPUT = (Boolean) config.getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - WIKIPEDIA_ENCODING = (String) config.getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - - // Create sql file - counter = 0; - } - - - /** - * Creates a new SQLArchiveWriter object. - * - * @param outputName Name of the sql consumer - * @param logger Reference to a logger - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - public SQLArchiveWriter(final String outputName, final Logger logger) - throws IOException, ConfigurationException, LoggingException { - - this(); - - this.outputName = outputName; - this.logger = logger; - - init(); - writeHeader(); - } - - /** - * This method will close the connection to the output. - * - * @throws IOException if problems occurred while closing the file or process. - */ - @Override - public void close() throws IOException { - this.output.close(); - this.output = null; - } - - /** - * Creates the sql encoder. - * - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - protected void init() throws ConfigurationException, LoggingException { - - this.sqlEncoder = new SQLEncoder(logger); - } - - /** - * This method will process the given DiffTask and send it to the specified - * output. 
- * - * @param task DiffTask - * @throws ConfigurationException if problems occurred while initializing the components - * @throws IOException if problems occurred while writing the output (to file or - * archive) - * @throws SQLConsumerException if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) throws ConfigurationException, IOException, SQLConsumerException { - - // this.startTime = System.currentTimeMillis(); - try { - SQLEncoding[] encoding = this.sqlEncoder.encodeTask(task); - - String s; - for (SQLEncoding sql : encoding) { - s = sql.getQuery() + "\r\n"; - this.output.write(s.getBytes(WIKIPEDIA_ENCODING)); - this.output.flush(); - } +public class SQLArchiveWriter + implements WriterInterface +{ + + /** + * File counter + */ + private int counter; + + /** + * Configuration parameter - maximum size of an output archive + */ + private final long LIMIT_SQL_ARCHIVE_SIZE; + + /** + * Reference to the logger + */ + protected Logger logger; + + /** + * Configuration parameter - Flag, that indicates whether the statistical output is enabled or + * not + */ + private final boolean MODE_STATISTICAL_OUTPUT; + + /** + * Reference to the output stream + */ + private OutputStream output; + + /** + * Name of the related sql consumer - used as prefix for the output filenames + */ + private String outputName; + + /** + * Configuration parameter - output path + */ + private final String PATH_OUTPUT_SQL_FILES; + + /** + * Reference to the output archive + */ + private File sqlArchive; + + /** + * Reference to the SQLEncoder + */ + protected SQLEncoderInterface sqlEncoder; + + /** + * Configuration parameter - Charset name of the input data + */ + private final String WIKIPEDIA_ENCODING; + + /** + * Creates a new SQLArchiveWriter object. + * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + */ + private SQLArchiveWriter() throws ConfigurationException + { + + // Load config parameters + ConfigurationManager config = ConfigurationManager.getInstance(); + + LIMIT_SQL_ARCHIVE_SIZE = (Long) config + .getConfigParameter(ConfigurationKeys.LIMIT_SQL_ARCHIVE_SIZE); + PATH_OUTPUT_SQL_FILES = (String) config + .getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); + MODE_STATISTICAL_OUTPUT = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + + // Create sql file + counter = 0; + } - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + /** + * Creates a new SQLArchiveWriter object. + * + * @param outputName + * Name of the sql consumer + * @param logger + * Reference to a logger + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + public SQLArchiveWriter(final String outputName, final Logger logger) + throws IOException, ConfigurationException, LoggingException + { + + this(); + + this.outputName = outputName; + this.logger = logger; + + init(); + writeHeader(); + } - if (this.sqlArchive.length() > LIMIT_SQL_ARCHIVE_SIZE) { - writeHeader(); - } + /** + * This method will close the connection to the output. + * + * @throws IOException + * if problems occurred while closing the file or process. 
+ */ + @Override + public void close() throws IOException + { + this.output.close(); + this.output = null; + } - if (!MODE_STATISTICAL_OUTPUT) { - System.out.println(task); - } + /** + * Creates the sql encoder. + * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + protected void init() throws ConfigurationException, LoggingException + { + + this.sqlEncoder = new SQLEncoder(logger); + } - } else { - System.out.println(task); - } + /** + * This method will process the given DiffTask and send it to the specified output. + * + * @param task + * DiffTask + * @throws ConfigurationException + * if problems occurred while initializing the components + * @throws IOException + * if problems occurred while writing the output (to file or archive) + * @throws SQLConsumerException + * if problems occurred while writing the output (to the sql producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException + { + + // this.startTime = System.currentTimeMillis(); + try { + SQLEncoding[] encoding = this.sqlEncoder.encodeTask(task); + + String s; + for (SQLEncoding sql : encoding) { + s = sql.getQuery() + "\r\n"; + this.output.write(s.getBytes(WIKIPEDIA_ENCODING)); + this.output.flush(); + } + + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + + if (this.sqlArchive.length() > LIMIT_SQL_ARCHIVE_SIZE) { + writeHeader(); + } + + if (!MODE_STATISTICAL_OUTPUT) { + System.out.println(task); + } + + } + else { + System.out.println(task); + } - } catch (DecodingException | EncodingException e) { + } + catch (DecodingException | EncodingException e) { - throw ErrorFactory.createSQLConsumerException( - ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); + throw ErrorFactory.createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); - } - } - - /** - * Creates a new output file and writes the header information. - * - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws IOException if an error occurred while writing a file - */ - protected void writeHeader() throws ConfigurationException, IOException { - - if (this.output != null) { - close(); + } } - this.counter++; + /** + * Creates a new output file and writes the header information. 
+ * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws IOException + * if an error occurred while writing a file + */ + protected void writeHeader() throws ConfigurationException, IOException + { + + if (this.output != null) { + close(); + } - String filePath = PATH_OUTPUT_SQL_FILES + this.outputName + "_" + counter; + this.counter++; - this.output = OutputFactory.getOutputStream(filePath); + String filePath = PATH_OUTPUT_SQL_FILES + this.outputName + "_" + counter; - // System.out.println(filePath); - SQLConsumerLogMessages.logFileCreation(logger, filePath); + this.output = OutputFactory.getOutputStream(filePath); - this.sqlArchive = new File(filePath); + // System.out.println(filePath); + SQLConsumerLogMessages.logFileCreation(logger, filePath); - String[] revTable = this.sqlEncoder.getTable(); - for (String sTable : revTable) { - String curLine = sTable + "\r\n"; - byte[] bytes = curLine.getBytes(WIKIPEDIA_ENCODING); - this.output.write(bytes); - } + this.sqlArchive = new File(filePath); - this.output.flush(); - } + String[] revTable = this.sqlEncoder.getTable(); + for (String sTable : revTable) { + String curLine = sTable + "\r\n"; + byte[] bytes = curLine.getBytes(WIKIPEDIA_ENCODING); + this.output.write(bytes); + } + + this.output.flush(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLDatabaseWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLDatabaseWriter.java index 420553f2..3b4a1cb5 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLDatabaseWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLDatabaseWriter.java @@ -43,143 +43,166 @@ /** * This class writes the output to a database. */ -public class SQLDatabaseWriter implements WriterInterface { - - /** - * Reference to the database connection - */ - private Connection connection; - - /** - * Reference to the logger - */ - protected final Logger logger; - - /** - * Reference to the sql encoder - */ - protected SQLEncoderInterface sqlEncoder; - - /** - * (Constructor) Creates a new SQLDatabaseWriter object. 
- * - * @param logger Reference to the logger - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - public SQLDatabaseWriter(final Logger logger) throws ConfigurationException, LoggingException { - - this.logger = logger; - - ConfigurationManager config = ConfigurationManager.getInstance(); - - String host = (String) config.getConfigParameter(ConfigurationKeys.SQL_HOST); - String user = (String) config.getConfigParameter(ConfigurationKeys.SQL_USERNAME); - String password = (String) config.getConfigParameter(ConfigurationKeys.SQL_PASSWORD); - String sTable = (String) config.getConfigParameter(ConfigurationKeys.SQL_DATABASE); - - try { - String driverDB = "com.mysql.jdbc.Driver"; - Class.forName(driverDB); - - this.connection = DriverManager.getConnection("jdbc:mysql://" - + host + "/" + sTable, user, password); +public class SQLDatabaseWriter + implements WriterInterface +{ + + /** + * Reference to the database connection + */ + private Connection connection; + + /** + * Reference to the logger + */ + protected final Logger logger; + + /** + * Reference to the sql encoder + */ + protected SQLEncoderInterface sqlEncoder; + + /** + * (Constructor) Creates a new SQLDatabaseWriter object. + * + * @param logger + * Reference to the logger + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + public SQLDatabaseWriter(final Logger logger) throws ConfigurationException, LoggingException + { + + this.logger = logger; + + ConfigurationManager config = ConfigurationManager.getInstance(); + + String host = (String) config.getConfigParameter(ConfigurationKeys.SQL_HOST); + String user = (String) config.getConfigParameter(ConfigurationKeys.SQL_USERNAME); + String password = (String) config.getConfigParameter(ConfigurationKeys.SQL_PASSWORD); + String sTable = (String) config.getConfigParameter(ConfigurationKeys.SQL_DATABASE); + + try { + String driverDB = "com.mysql.jdbc.Driver"; + Class.forName(driverDB); + + this.connection = DriverManager.getConnection("jdbc:mysql://" + host + "/" + sTable, + user, password); + + init(); + writeHeader(); + + } + catch (ClassNotFoundException | SQLException e) { + throw new ConfigurationException(e); + } + } - init(); - writeHeader(); + /** + * This method will close the connection to the output. + * + * @throws SQLException + * if problems occurred while closing the connection to the database. + */ + @Override + public void close() throws SQLException + { + this.connection.close(); + this.connection = null; + } - } catch (ClassNotFoundException | SQLException e) { - throw new ConfigurationException(e); + /** + * Creates the sql encoder. + * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + protected void init() throws ConfigurationException, LoggingException + { + this.sqlEncoder = new SQLEncoder(logger); } - } - - /** - * This method will close the connection to the output. - * - * @throws SQLException if problems occurred while closing the connection to the - * database. - */ - @Override - public void close() throws SQLException { - this.connection.close(); - this.connection = null; - } - - /** - * Creates the sql encoder. 
- * - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - protected void init() throws ConfigurationException, LoggingException { - this.sqlEncoder = new SQLEncoder(logger); - } - - /** - * This method will process the given DiffTask and send it to the specified - * output. - * - * @param task DiffTask - * @throws ConfigurationException if problems occurred while initializing the components - * @throws IOException if problems occurred while writing the output (to file or - * archive) - * @throws SQLConsumerException if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) throws ConfigurationException, IOException, SQLConsumerException { - - int i = -1; - SQLEncoding[] queries = null; - - try { - queries = sqlEncoder.encodeTask(task); - - Statement query; - int size = queries.length; - for (i = 0; i < size; i++) { - - query = connection.createStatement(); - query.executeUpdate(queries[i].getQuery()); - query.close(); - } - // System.out.println(task.toString()); - - } catch (SQLException e) { - - String q; - if (queries == null || queries.length <= i || queries[i] == null) { - q = ""; - } else { - q = queries[i].toString(); - } - throw ErrorFactory.createSQLConsumerException(ErrorKeys.DIFFTOOL_SQLCONSUMER_DATABASEWRITER_EXCEPTION, q, e); - } catch (DecodingException e) { - throw ErrorFactory.createSQLConsumerException(ErrorKeys.DIFFTOOL_SQLCONSUMER_DATABASEWRITER_EXCEPTION, e); - } catch (EncodingException e) { - throw ErrorFactory.createSQLConsumerException(ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); + + /** + * This method will process the given DiffTask and send it to the specified output. + * + * @param task + * DiffTask + * @throws ConfigurationException + * if problems occurred while initializing the components + * @throws IOException + * if problems occurred while writing the output (to file or archive) + * @throws SQLConsumerException + * if problems occurred while writing the output (to the sql producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException + { + + int i = -1; + SQLEncoding[] queries = null; + + try { + queries = sqlEncoder.encodeTask(task); + + Statement query; + int size = queries.length; + for (i = 0; i < size; i++) { + + query = connection.createStatement(); + query.executeUpdate(queries[i].getQuery()); + query.close(); + } + // System.out.println(task.toString()); + + } + catch (SQLException e) { + + String q; + if (queries == null || queries.length <= i || queries[i] == null) { + q = ""; + } + else { + q = queries[i].toString(); + } + throw ErrorFactory.createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_DATABASEWRITER_EXCEPTION, q, e); + } + catch (DecodingException e) { + throw ErrorFactory.createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_DATABASEWRITER_EXCEPTION, e); + } + catch (EncodingException e) { + throw ErrorFactory.createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); + } } - } - /** - * Retrieves the encoded sql orders and executes them. - * - * @throws SQLException if an error occurred while accessing the database - */ - private void writeHeader() throws SQLException { + /** + * Retrieves the encoded sql orders and executes them. 
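Side note: the per-task execution loop in SQLDatabaseWriter.process() above could also be written with try-with-resources so the Statement is closed even when executeUpdate() fails. This is only an illustrative fragment, not part of the patch; sqlEncoder, connection and task are the same fields and parameter as in the method above:

    import java.sql.Statement;

    for (SQLEncoding sql : sqlEncoder.encodeTask(task)) {
        try (Statement stmt = connection.createStatement()) {
            stmt.executeUpdate(sql.getQuery());
        }
    }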
+ * + * @throws SQLException + * if an error occurred while accessing the database + */ + private void writeHeader() throws SQLException + { - Statement query; - String[] revTableHeaderQueries; + Statement query; + String[] revTableHeaderQueries; - revTableHeaderQueries = sqlEncoder.getTable(); + revTableHeaderQueries = sqlEncoder.getTable(); - //commit revision table header - for (String revTableHeaderQuery : revTableHeaderQueries) { - query = connection.createStatement(); + // commit revision table header + for (String revTableHeaderQuery : revTableHeaderQueries) { + query = connection.createStatement(); - query.executeUpdate(revTableHeaderQuery); - query.close(); - } + query.executeUpdate(revTableHeaderQuery); + query.close(); + } - } + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLFileWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLFileWriter.java index a79694b8..80cbf270 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLFileWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/SQLFileWriter.java @@ -47,190 +47,212 @@ /** * This class writes the output to a file. */ -public class SQLFileWriter implements WriterInterface { - - /** - * File counter - */ - private int fileCounter; - - /** - * Configuration parameter - maximum size of an output file - */ - private final long LIMIT_SQL_FILE_SIZE; - - /** - * Reference to the logger - */ - protected Logger logger; - - /** - * Configuration parameter - Flag, that indicates whether the statistical - * output is enabled or not - */ - private final boolean MODE_STATISTICAL_OUTPUT; - - /** - * Name of the related sql consumer - used as prefix for the output - * filenames - */ - private String outputName; - - /** - * Configuration parameter - output path - */ - private final String PATH_OUTPUT_SQL_FILES; - - /** - * Reference to the SQLEncoder - */ - protected SQLEncoderInterface sqlEncoder; - - /** - * Reference to the output file - */ - private File sqlFile; - - /** - * Reference to the file writer - */ - private Writer writer; - - private final String WIKIPEDIA_ENCODING; - - /** - * Creates a new SQLFileWriter object. - * - * @throws ConfigurationException if an error occurred while accessing the configuration - */ - private SQLFileWriter() throws ConfigurationException { - - // Load config parameters - ConfigurationManager config = ConfigurationManager.getInstance(); - - LIMIT_SQL_FILE_SIZE = (Long) config.getConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE); - PATH_OUTPUT_SQL_FILES = (String) config.getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); - MODE_STATISTICAL_OUTPUT = (Boolean) config.getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); - WIKIPEDIA_ENCODING = (String) config.getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - - // Create sql file - fileCounter = 0; - } - - - /** - * Creates a new SQLFileWriter object. 
- * - * @param outputName Name of the sql consumer - * @param logger Reference to a logger - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - public SQLFileWriter(final String outputName, final Logger logger) - throws IOException, ConfigurationException, LoggingException { - - this(); - - this.outputName = outputName; - this.logger = logger; - - init(); - writeHeader(); - } - - /** - * This method will close the connection to the output. - * - * @throws IOException if problems occurred while closing the file or process. - */ - @Override - public void close() throws IOException { - this.writer.close(); - } - - /** - * Creates the sql encoder. - * - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - protected void init() throws ConfigurationException, LoggingException { - - this.sqlEncoder = new SQLEncoder(logger); - } - - /** - * This method will process the given DiffTask and send it to the specified - * output. - * - * @param task DiffTask - * @throws ConfigurationException if problems occurred while initializing the components - * @throws IOException if problems occurred while writing the output (to file or - * archive) - * @throws SQLConsumerException if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) throws ConfigurationException, IOException, SQLConsumerException { - - try { - SQLEncoding[] encoding = sqlEncoder.encodeTask(task); - - for (SQLEncoding sql : encoding) { - this.writer.write(sql.getQuery() + "\r\n"); - this.writer.flush(); - } +public class SQLFileWriter + implements WriterInterface +{ + + /** + * File counter + */ + private int fileCounter; + + /** + * Configuration parameter - maximum size of an output file + */ + private final long LIMIT_SQL_FILE_SIZE; + + /** + * Reference to the logger + */ + protected Logger logger; + + /** + * Configuration parameter - Flag, that indicates whether the statistical output is enabled or + * not + */ + private final boolean MODE_STATISTICAL_OUTPUT; + + /** + * Name of the related sql consumer - used as prefix for the output filenames + */ + private String outputName; + + /** + * Configuration parameter - output path + */ + private final String PATH_OUTPUT_SQL_FILES; + + /** + * Reference to the SQLEncoder + */ + protected SQLEncoderInterface sqlEncoder; + + /** + * Reference to the output file + */ + private File sqlFile; + + /** + * Reference to the file writer + */ + private Writer writer; + + private final String WIKIPEDIA_ENCODING; + + /** + * Creates a new SQLFileWriter object. 
+ * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + */ + private SQLFileWriter() throws ConfigurationException + { + + // Load config parameters + ConfigurationManager config = ConfigurationManager.getInstance(); + + LIMIT_SQL_FILE_SIZE = (Long) config + .getConfigParameter(ConfigurationKeys.LIMIT_SQL_FILE_SIZE); + PATH_OUTPUT_SQL_FILES = (String) config + .getConfigParameter(ConfigurationKeys.PATH_OUTPUT_SQL_FILES); + MODE_STATISTICAL_OUTPUT = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_STATISTICAL_OUTPUT); + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + + // Create sql file + fileCounter = 0; + } - if (task.getTaskType() == TaskTypes.TASK_FULL - || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + /** + * Creates a new SQLFileWriter object. + * + * @param outputName + * Name of the sql consumer + * @param logger + * Reference to a logger + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + public SQLFileWriter(final String outputName, final Logger logger) + throws IOException, ConfigurationException, LoggingException + { + + this(); + + this.outputName = outputName; + this.logger = logger; + + init(); + writeHeader(); + } - if (this.sqlFile.length() > LIMIT_SQL_FILE_SIZE) { - writeHeader(); - } + /** + * This method will close the connection to the output. + * + * @throws IOException + * if problems occurred while closing the file or process. + */ + @Override + public void close() throws IOException + { + this.writer.close(); + } - if (!MODE_STATISTICAL_OUTPUT) { - System.out.println(task); - } + /** + * Creates the sql encoder. + * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + protected void init() throws ConfigurationException, LoggingException + { + + this.sqlEncoder = new SQLEncoder(logger); + } - } else { - System.out.println(task); - } + /** + * This method will process the given DiffTask and send it to the specified output. 
+ * + * @param task + * DiffTask + * @throws ConfigurationException + * if problems occurred while initializing the components + * @throws IOException + * if problems occurred while writing the output (to file or archive) + * @throws SQLConsumerException + * if problems occurred while writing the output (to the sql producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException + { + + try { + SQLEncoding[] encoding = sqlEncoder.encodeTask(task); + + for (SQLEncoding sql : encoding) { + this.writer.write(sql.getQuery() + "\r\n"); + this.writer.flush(); + } + + if (task.getTaskType() == TaskTypes.TASK_FULL + || task.getTaskType() == TaskTypes.TASK_PARTIAL_LAST) { + + if (this.sqlFile.length() > LIMIT_SQL_FILE_SIZE) { + writeHeader(); + } + + if (!MODE_STATISTICAL_OUTPUT) { + System.out.println(task); + } + + } + else { + System.out.println(task); + } - } catch (DecodingException | EncodingException e) { - throw ErrorFactory.createSQLConsumerException(ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); + } + catch (DecodingException | EncodingException e) { + throw ErrorFactory.createSQLConsumerException( + ErrorKeys.DIFFTOOL_SQLCONSUMER_FILEWRITER_EXCEPTION, e); + } } - } - - /** - * Creates a new output file and writes the header information. - * - * @throws IOException if an error occurred while writing a file - */ - protected void writeHeader() throws IOException { - if (writer != null) { - writer.close(); - } + /** + * Creates a new output file and writes the header information. + * + * @throws IOException + * if an error occurred while writing a file + */ + protected void writeHeader() throws IOException + { + + if (writer != null) { + writer.close(); + } - this.fileCounter++; - String filePath = PATH_OUTPUT_SQL_FILES + this.outputName + "_" - + fileCounter + ".sql"; + this.fileCounter++; + String filePath = PATH_OUTPUT_SQL_FILES + this.outputName + "_" + fileCounter + ".sql"; - SQLConsumerLogMessages.logFileCreation(logger, filePath); + SQLConsumerLogMessages.logFileCreation(logger, filePath); - this.sqlFile = new File(filePath); + this.sqlFile = new File(filePath); - this.writer = new BufferedWriter(new OutputStreamWriter(new BufferedOutputStream( - new FileOutputStream(filePath)), WIKIPEDIA_ENCODING)); + this.writer = new BufferedWriter(new OutputStreamWriter( + new BufferedOutputStream(new FileOutputStream(filePath)), WIKIPEDIA_ENCODING)); + String[] revTable = this.sqlEncoder.getTable(); - String[] revTable = this.sqlEncoder.getTable(); + for (String sTable : revTable) { + this.writer.write(sTable + "\r\n"); + } - for (String sTable : revTable) { - this.writer.write(sTable + "\r\n"); + this.writer.flush(); } - - this.writer.flush(); - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLArchiveWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLArchiveWriter.java index 4ce53048..aead3c75 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLArchiveWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLArchiveWriter.java @@ -32,113 +32,126 @@ import org.slf4j.event.Level; /** - * This class writes the output to an archive while collecting statistical - * information. 
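For orientation, a minimal usage sketch for the file writer shown above (hypothetical names; it assumes the ConfigurationManager has been initialised and that a DiffTool logger and a task source already exist):

    SQLFileWriter writer = new SQLFileWriter("revisions", logger);
    try {
        for (Task task : tasks) {
            // rolls over to <PATH_OUTPUT_SQL_FILES>revisions_2.sql, _3.sql, ...
            // once the current file grows past LIMIT_SQL_FILE_SIZE
            writer.process(task);
        }
    }
    finally {
        writer.close();
    }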
+ * This class writes the output to an archive while collecting statistical information. */ -public class TimedSQLArchiveWriter extends SQLArchiveWriter { - - /** - * Reference to the logger - */ - private final Logger outputLogger; - - /** - * Temporary variable - used for storing the time needed to encode a task - */ - private long processingTimeSQL; - - /** - * Reference to the sql encoder - */ - private TimedSQLEncoder sqlEncoder; - - - /** - * (Constructor) Creates a new TimedSQLFileWriter object. - * - * @param outputName Name of the sql consumer - * @param logger Reference to a logger - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - public TimedSQLArchiveWriter(final String outputName, final Logger logger) - throws IOException, ConfigurationException, LoggingException { - - super(outputName, logger); - this.outputLogger = LoggingFactory.getLogger(LoggingFactory.NAME_ARTICLE_OUTPUT_LOGGER); - } - - /*--------------------------------------------------------------------------*/ - - /** - * Creates the sql encoder. - * - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - @Override - protected void init() throws ConfigurationException, LoggingException { - - this.sqlEncoder = new TimedSQLEncoder(logger); - super.sqlEncoder = this.sqlEncoder; - } - - /*--------------------------------------------------------------------------*/ - - /** - * This method will process the given DiffTask and send him to the specified - * output. - * - * @param task DiffTask - * @throws ConfigurationException if problems occurred while initializing the components - * @throws IOException if problems occurred while writing the output (to file or - * archive) - * @throws SQLConsumerException if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) - throws ConfigurationException, IOException, SQLConsumerException { - - long startTime = System.currentTimeMillis(); - - TaskTypes type = task.getTaskType(); - - if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_FIRST) { - - this.sqlEncoder.init(); - this.processingTimeSQL = 0; +public class TimedSQLArchiveWriter + extends SQLArchiveWriter +{ + + /** + * Reference to the logger + */ + private final Logger outputLogger; + + /** + * Temporary variable - used for storing the time needed to encode a task + */ + private long processingTimeSQL; + + /** + * Reference to the sql encoder + */ + private TimedSQLEncoder sqlEncoder; + + /** + * (Constructor) Creates a new TimedSQLFileWriter object. + * + * @param outputName + * Name of the sql consumer + * @param logger + * Reference to a logger + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + public TimedSQLArchiveWriter(final String outputName, final Logger logger) + throws IOException, ConfigurationException, LoggingException + { + + super(outputName, logger); + this.outputLogger = LoggingFactory.getLogger(LoggingFactory.NAME_ARTICLE_OUTPUT_LOGGER); } - super.process(task); + /*--------------------------------------------------------------------------*/ + + /** + * Creates the sql encoder. 
+ * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + @Override + protected void init() throws ConfigurationException, LoggingException + { + + this.sqlEncoder = new TimedSQLEncoder(logger); + super.sqlEncoder = this.sqlEncoder; + } + + /*--------------------------------------------------------------------------*/ + + /** + * This method will process the given DiffTask and send him to the specified output. + * + * @param task + * DiffTask + * @throws ConfigurationException + * if problems occurred while initializing the components + * @throws IOException + * if problems occurred while writing the output (to file or archive) + * @throws SQLConsumerException + * if problems occurred while writing the output (to the sql producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException + { + + long startTime = System.currentTimeMillis(); + + TaskTypes type = task.getTaskType(); - this.processingTimeSQL += System.currentTimeMillis() - startTime; + if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_FIRST) { - if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_LAST) { + this.sqlEncoder.init(); + this.processingTimeSQL = 0; + } - ArticleInformation info = task.getHeader(); - info.setEncodedSize(this.sqlEncoder.getEncodedSize()); - info.setEncodedSQLSize(this.sqlEncoder.getEncodedSQLSize()); - info.setExitingTime(System.currentTimeMillis()); - info.setProcessingTimeSQL(processingTimeSQL); + super.process(task); - String succesReport = info.toString(); - this.outputLogger.logMessage(Level.INFO, "\r\n" + succesReport); + this.processingTimeSQL += System.currentTimeMillis() - startTime; + + if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_LAST) { + + ArticleInformation info = task.getHeader(); + info.setEncodedSize(this.sqlEncoder.getEncodedSize()); + info.setEncodedSQLSize(this.sqlEncoder.getEncodedSQLSize()); + info.setExitingTime(System.currentTimeMillis()); + info.setProcessingTimeSQL(processingTimeSQL); + + String succesReport = info.toString(); + this.outputLogger.logMessage(Level.INFO, "\r\n" + succesReport); + } } - } - - /** - * This method will close the connection to the output. - * - * @throws IOException if problems occurred while closing the file or process. - */ - @Override - public void close() throws IOException { - try { - super.close(); - } finally { - this.outputLogger.flush(); - //this.outputLogger.close(); + + /** + * This method will close the connection to the output. + * + * @throws IOException + * if problems occurred while closing the file or process. 
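The timed writers all follow the wrapping pattern of the process() method above. Worth noting: a TASK_FULL task is both the first and the last part of an article, so it resets the counters and emits the report within the same call. A condensed fragment of the pattern (names as in the classes above):

    long start = System.currentTimeMillis();
    if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_FIRST) {
        sqlEncoder.init();              // a new article begins: reset encoder statistics
        processingTimeSQL = 0;
    }
    super.process(task);                // the actual writing is done by the parent class
    processingTimeSQL += System.currentTimeMillis() - start;
    if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_LAST) {
        // article complete: copy sizes and timings into task.getHeader() and log the report
    }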
+ */ + @Override + public void close() throws IOException + { + try { + super.close(); + } + finally { + this.outputLogger.flush(); + // this.outputLogger.close(); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLDatabaseWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLDatabaseWriter.java index 9931df62..7afd347e 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLDatabaseWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLDatabaseWriter.java @@ -33,112 +33,123 @@ import org.slf4j.event.Level; /** - * This class writes the output to a database while collecting statistical - * information. + * This class writes the output to a database while collecting statistical information. */ -public class TimedSQLDatabaseWriter extends SQLDatabaseWriter { - - /** - * Reference to the logger - */ - private final Logger outputLogger; - - /** - * Temporary variable - used for storing the time needed to encode a task - */ - private long processingTimeSQL; - - /** - * Reference to the sql encoder - */ - private TimedSQLEncoder sqlEncoder; - - - /** - * Creates a new TimedSQLDatabaseWriter object. - * - * @param logger Reference to the logger - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - public TimedSQLDatabaseWriter(final Logger logger) - throws ConfigurationException, LoggingException { - - super(logger); - this.outputLogger = LoggingFactory - .getLogger(LoggingFactory.NAME_ARTICLE_OUTPUT_LOGGER); - } - - /*--------------------------------------------------------------------------*/ - - /** - * Creates the sql encoder. - * - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - @Override - protected void init() throws ConfigurationException, LoggingException { - - this.sqlEncoder = new TimedSQLEncoder(logger); - super.sqlEncoder = this.sqlEncoder; - } - - /*--------------------------------------------------------------------------*/ - - /** - * This method will process the given DiffTask and send him to the specified - * output. 
- * - * @param task DiffTask - * @throws ConfigurationException if problems occurred while initializing the components - * @throws IOException if problems occurred while writing the output (to file or - * archive) - * @throws SQLConsumerException if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) throws ConfigurationException, IOException, SQLConsumerException { - - long startTime = System.currentTimeMillis(); - - TaskTypes type = task.getTaskType(); - - if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_FIRST) { - - this.sqlEncoder.init(); - this.processingTimeSQL = 0; +public class TimedSQLDatabaseWriter + extends SQLDatabaseWriter +{ + + /** + * Reference to the logger + */ + private final Logger outputLogger; + + /** + * Temporary variable - used for storing the time needed to encode a task + */ + private long processingTimeSQL; + + /** + * Reference to the sql encoder + */ + private TimedSQLEncoder sqlEncoder; + + /** + * Creates a new TimedSQLDatabaseWriter object. + * + * @param logger + * Reference to the logger + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + public TimedSQLDatabaseWriter(final Logger logger) + throws ConfigurationException, LoggingException + { + + super(logger); + this.outputLogger = LoggingFactory.getLogger(LoggingFactory.NAME_ARTICLE_OUTPUT_LOGGER); } - super.process(task); + /*--------------------------------------------------------------------------*/ + + /** + * Creates the sql encoder. + * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + @Override + protected void init() throws ConfigurationException, LoggingException + { + + this.sqlEncoder = new TimedSQLEncoder(logger); + super.sqlEncoder = this.sqlEncoder; + } + + /*--------------------------------------------------------------------------*/ + + /** + * This method will process the given DiffTask and send him to the specified output. 
+ * + * @param task + * DiffTask + * @throws ConfigurationException + * if problems occurred while initializing the components + * @throws IOException + * if problems occurred while writing the output (to file or archive) + * @throws SQLConsumerException + * if problems occurred while writing the output (to the sql producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException + { + + long startTime = System.currentTimeMillis(); + + TaskTypes type = task.getTaskType(); - this.processingTimeSQL += System.currentTimeMillis() - startTime; + if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_FIRST) { - if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_LAST) { + this.sqlEncoder.init(); + this.processingTimeSQL = 0; + } - ArticleInformation info = task.getHeader(); - info.setEncodedSize(this.sqlEncoder.getEncodedSize()); - info.setEncodedSQLSize(this.sqlEncoder.getEncodedSQLSize()); - info.setExitingTime(System.currentTimeMillis()); - info.setProcessingTimeSQL(processingTimeSQL); + super.process(task); - String successReport = info.toString(); - this.outputLogger.logMessage(Level.INFO, "\r\n" + successReport); + this.processingTimeSQL += System.currentTimeMillis() - startTime; + + if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_LAST) { + + ArticleInformation info = task.getHeader(); + info.setEncodedSize(this.sqlEncoder.getEncodedSize()); + info.setEncodedSQLSize(this.sqlEncoder.getEncodedSQLSize()); + info.setExitingTime(System.currentTimeMillis()); + info.setProcessingTimeSQL(processingTimeSQL); + + String successReport = info.toString(); + this.outputLogger.logMessage(Level.INFO, "\r\n" + successReport); + } } - } - - /** - * This method will close the connection to the output. - * - * @throws SQLException if problems occurred while closing the connection to the - * database. - */ - @Override - public void close() throws SQLException { - try { - super.close(); - } finally { - this.outputLogger.close(); + + /** + * This method will close the connection to the output. + * + * @throws SQLException + * if problems occurred while closing the connection to the database. + */ + @Override + public void close() throws SQLException + { + try { + super.close(); + } + finally { + this.outputLogger.close(); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLFileWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLFileWriter.java index 6cdd1e65..c8d7eb3e 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLFileWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/consumer/dump/writer/TimedSQLFileWriter.java @@ -32,109 +32,123 @@ import org.slf4j.event.Level; /** - * This class writes the output to a file while collecting statistical - * information. + * This class writes the output to a file while collecting statistical information. 
*/ -public class TimedSQLFileWriter extends SQLFileWriter { - - /** - * Reference to the logger - */ - private final Logger outputLogger; - - /** - * Temporary variable - used for storing the time needed to encode a task - */ - private long processingTimeSQL; - - /** - * Reference to the sql encoder - */ - private TimedSQLEncoder sqlEncoder; - - /** - * (Constructor) Creates a new TimedSQLFileWriter object. - * - * @param outputName Name of the sql consumer - * @param logger Reference to a logger - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - public TimedSQLFileWriter(final String outputName, final Logger logger) - throws IOException, ConfigurationException, LoggingException { - - super(outputName, logger); - this.outputLogger = LoggingFactory - .getLogger(LoggingFactory.NAME_ARTICLE_OUTPUT_LOGGER); - } - - /*--------------------------------------------------------------------------*/ - - /** - * Creates the sql encoder. - * - * @throws ConfigurationException if an error occurred while accessing the configuration - * @throws LoggingException if an error occurred while accessing the logger - */ - @Override - protected void init() throws ConfigurationException, LoggingException { - - this.sqlEncoder = new TimedSQLEncoder(logger); - super.sqlEncoder = this.sqlEncoder; - } - - /** - * This method will process the given DiffTask and send him to the specified - * output. - * - * @param task DiffTask - * @throws ConfigurationException if problems occurred while initializing the components - * @throws IOException if problems occurred while writing the output (to file or - * archive) - * @throws SQLConsumerException if problems occurred while writing the output (to the sql - * producer database) - */ - @Override - public void process(final Task task) throws ConfigurationException, IOException, SQLConsumerException { - - long startTime = System.currentTimeMillis(); - - TaskTypes type = task.getTaskType(); - - if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_FIRST) { - - this.sqlEncoder.init(); - this.processingTimeSQL = 0; +public class TimedSQLFileWriter + extends SQLFileWriter +{ + + /** + * Reference to the logger + */ + private final Logger outputLogger; + + /** + * Temporary variable - used for storing the time needed to encode a task + */ + private long processingTimeSQL; + + /** + * Reference to the sql encoder + */ + private TimedSQLEncoder sqlEncoder; + + /** + * (Constructor) Creates a new TimedSQLFileWriter object. + * + * @param outputName + * Name of the sql consumer + * @param logger + * Reference to a logger + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + public TimedSQLFileWriter(final String outputName, final Logger logger) + throws IOException, ConfigurationException, LoggingException + { + + super(outputName, logger); + this.outputLogger = LoggingFactory.getLogger(LoggingFactory.NAME_ARTICLE_OUTPUT_LOGGER); } - super.process(task); + /*--------------------------------------------------------------------------*/ + + /** + * Creates the sql encoder. 
+ * + * @throws ConfigurationException + * if an error occurred while accessing the configuration + * @throws LoggingException + * if an error occurred while accessing the logger + */ + @Override + protected void init() throws ConfigurationException, LoggingException + { + + this.sqlEncoder = new TimedSQLEncoder(logger); + super.sqlEncoder = this.sqlEncoder; + } + + /** + * This method will process the given DiffTask and send him to the specified output. + * + * @param task + * DiffTask + * @throws ConfigurationException + * if problems occurred while initializing the components + * @throws IOException + * if problems occurred while writing the output (to file or archive) + * @throws SQLConsumerException + * if problems occurred while writing the output (to the sql producer database) + */ + @Override + public void process(final Task task) + throws ConfigurationException, IOException, SQLConsumerException + { + + long startTime = System.currentTimeMillis(); + + TaskTypes type = task.getTaskType(); - this.processingTimeSQL += System.currentTimeMillis() - startTime; + if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_FIRST) { - if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_LAST) { + this.sqlEncoder.init(); + this.processingTimeSQL = 0; + } - ArticleInformation info = task.getHeader(); - info.setEncodedSize(this.sqlEncoder.getEncodedSize()); - info.setEncodedSQLSize(this.sqlEncoder.getEncodedSQLSize()); - info.setExitingTime(System.currentTimeMillis()); - info.setProcessingTimeSQL(processingTimeSQL); + super.process(task); - String successReport = info.toString(); - this.outputLogger.logMessage(Level.INFO, "\r\n" + successReport); + this.processingTimeSQL += System.currentTimeMillis() - startTime; + + if (type == TaskTypes.TASK_FULL || type == TaskTypes.TASK_PARTIAL_LAST) { + + ArticleInformation info = task.getHeader(); + info.setEncodedSize(this.sqlEncoder.getEncodedSize()); + info.setEncodedSQLSize(this.sqlEncoder.getEncodedSQLSize()); + info.setExitingTime(System.currentTimeMillis()); + info.setProcessingTimeSQL(processingTimeSQL); + + String successReport = info.toString(); + this.outputLogger.logMessage(Level.INFO, "\r\n" + successReport); + } } - } - - /** - * This method will close the connection to the output. - * - * @throws IOException if problems occurred while closing the file or process. - */ - @Override - public void close() throws IOException { - try { - super.close(); - } finally { - this.outputLogger.close(); + + /** + * This method will close the connection to the output. + * + * @throws IOException + * if problems occurred while closing the file or process. + */ + @Override + public void close() throws IOException + { + try { + super.close(); + } + finally { + this.outputLogger.close(); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/OutputType.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/OutputType.java index 1783d40f..14553198 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/OutputType.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/OutputType.java @@ -20,55 +20,62 @@ /** * This Enumerator list the possible output values. 
*/ -public enum OutputType { +public enum OutputType +{ - /** - * The output will consist of a single or multiple sql files - */ - UNCOMPRESSED, + /** + * The output will consist of a single or multiple sql files + */ + UNCOMPRESSED, - /** - * The output will consist of a single or multiple 7z archives - */ - SEVENZIP, + /** + * The output will consist of a single or multiple 7z archives + */ + SEVENZIP, - /** - * The output will consist of a single or multiple bzip2 archives - */ - BZIP2, + /** + * The output will consist of a single or multiple bzip2 archives + */ + BZIP2, - /** - * The output will consist of a single or multiple alternate archives - */ - ALTERNATE, + /** + * The output will consist of a single or multiple alternate archives + */ + ALTERNATE, - /** - * The output will be directly written into a database - */ - DATABASE; + /** + * The output will be directly written into a database + */ + DATABASE; - /** - * Parses the given string. - * - * @param s string - * @return OutputTypes - */ - public static OutputType parse(final String s) { + /** + * Parses the given string. + * + * @param s + * string + * @return OutputTypes + */ + public static OutputType parse(final String s) + { - String t = s.toUpperCase(); + String t = s.toUpperCase(); - if (t.equals("UNCOMPRESSED")) { - return OutputType.UNCOMPRESSED; - } else if (t.equals("SEVENZIP")) { - return OutputType.SEVENZIP; - } else if (t.equals("BZIP2")) { - return OutputType.BZIP2; - } else if (t.equals("DATABASE")) { - return OutputType.DATABASE; - } else if (t.equals("ALTERNATE")) { - return OutputType.ALTERNATE; - } + if (t.equals("UNCOMPRESSED")) { + return OutputType.UNCOMPRESSED; + } + else if (t.equals("SEVENZIP")) { + return OutputType.SEVENZIP; + } + else if (t.equals("BZIP2")) { + return OutputType.BZIP2; + } + else if (t.equals("DATABASE")) { + return OutputType.DATABASE; + } + else if (t.equals("ALTERNATE")) { + return OutputType.ALTERNATE; + } - throw new IllegalArgumentException("Unknown OutputType : " + s); - } + throw new IllegalArgumentException("Unknown OutputType : " + s); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/SurrogateModes.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/SurrogateModes.java index 350ffccc..e36c4938 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/SurrogateModes.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/SurrogateModes.java @@ -20,60 +20,64 @@ /** * This Enumerator lists the different method of how to handle surrogates. *
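A short usage sketch for OutputType.parse() above (the input strings are hypothetical configuration values):

    OutputType a = OutputType.parse("bzip2");      // OutputType.BZIP2 (lookup is case-insensitive)
    OutputType b = OutputType.parse("DATABASE");   // OutputType.DATABASE
    OutputType c = OutputType.parse("zip");        // throws IllegalArgumentException("Unknown OutputType : zip")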

- * TODO: The surrogate mode implementations need a work over. - * TODO Add documentation for surrogates + * TODO: The surrogate mode implementations need a work over. TODO Add documentation for surrogates */ -public enum SurrogateModes { +public enum SurrogateModes +{ - /** - * Replace the surrogate - * TODO COULD BE FAULTY. CHECK BEFORE USING!!! DISABLED FOR NOW! - */ - REPLACE, + /** + * Replace the surrogate TODO COULD BE FAULTY. CHECK BEFORE USING!!! DISABLED FOR NOW! + */ + REPLACE, - /** - * Throw an error if a surrogate is detected - * TODO COULD BE FAULTY. CHECK BEFORE USING!!! DISABLED FOR NOW! - */ - THROW_ERROR, + /** + * Throw an error if a surrogate is detected TODO COULD BE FAULTY. CHECK BEFORE USING!!! + * DISABLED FOR NOW! + */ + THROW_ERROR, - /** - * Discard the rest of the article after a surrogate is detected - * TODO COULD BE FAULTY. CHECK BEFORE USING!!! DISABLED FOR NOW! - */ - DISCARD_REST, + /** + * Discard the rest of the article after a surrogate is detected TODO COULD BE FAULTY. CHECK + * BEFORE USING!!! DISABLED FOR NOW! + */ + DISCARD_REST, - /** - * Discard revisions which contain surrogates (java default setting) - */ - DISCARD_REVISION; + /** + * Discard revisions which contain surrogates (java default setting) + */ + DISCARD_REVISION; - /** - * Parses the given string. - * - * @param s string - * @return SurrogateModes - */ - public static SurrogateModes parse(final String s) { + /** + * Parses the given string. + * + * @param s + * string + * @return SurrogateModes + */ + public static SurrogateModes parse(final String s) + { - String t = s.toUpperCase(); + String t = s.toUpperCase(); - if (t.equals("REPLACE")) { - // return REPLACE; - throw new UnsupportedOperationException( - "This mode is currently not supported. Please check the implementation first. For now, you can use the default mode DISCARD_REVISION"); - } else if (t.equals("THROW_ERROR")) { - // return THROW_ERROR; - throw new UnsupportedOperationException( - "This mode is currently not supported. Please check the implementation first. For now, you can use the default mode DISCARD_REVISION"); - } else if (t.equals("DISCARD_REST")) { - // return DISCARD_REST; - throw new UnsupportedOperationException( - "This mode is currently not supported. Please check the implementation first. For now, you can use the default mode DISCARD_REVISION"); - } else if (t.equals("DISCARD_REVISION")) { - return DISCARD_REVISION; - } + if (t.equals("REPLACE")) { + // return REPLACE; + throw new UnsupportedOperationException( + "This mode is currently not supported. Please check the implementation first. For now, you can use the default mode DISCARD_REVISION"); + } + else if (t.equals("THROW_ERROR")) { + // return THROW_ERROR; + throw new UnsupportedOperationException( + "This mode is currently not supported. Please check the implementation first. For now, you can use the default mode DISCARD_REVISION"); + } + else if (t.equals("DISCARD_REST")) { + // return DISCARD_REST; + throw new UnsupportedOperationException( + "This mode is currently not supported. Please check the implementation first. 
For now, you can use the default mode DISCARD_REVISION"); + } + else if (t.equals("DISCARD_REVISION")) { + return DISCARD_REVISION; + } - throw new IllegalArgumentException("Unknown SurrogateModes : " + s); - } + throw new IllegalArgumentException("Unknown SurrogateModes : " + s); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveDescription.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveDescription.java index 39408902..637602f6 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveDescription.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveDescription.java @@ -22,76 +22,86 @@ *
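As the TODO notes above indicate, DISCARD_REVISION is the only mode that can currently be configured; the other constants parse but are rejected. A sketch:

    SurrogateModes mode = SurrogateModes.parse("discard_revision");  // SurrogateModes.DISCARD_REVISION
    SurrogateModes bad  = SurrogateModes.parse("replace");           // throws UnsupportedOperationException
    SurrogateModes err  = SurrogateModes.parse("foo");               // throws IllegalArgumentException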

* TODO: The start position is currently unused. */ -public class ArchiveDescription { +public class ArchiveDescription +{ - /** - * Path to the archive - */ - private final String path; + /** + * Path to the archive + */ + private final String path; - /** - * Start position - */ - private long startPosition; + /** + * Start position + */ + private long startPosition; - /** - * InputType - */ - private final InputType type; + /** + * InputType + */ + private final InputType type; - /** - * (Constructor) Creates a new ArchiveDescription - * - * @param type InputType - * @param path Path - */ - public ArchiveDescription(final InputType type, final String path) { - this.type = type; - this.path = path; - } + /** + * (Constructor) Creates a new ArchiveDescription + * + * @param type + * InputType + * @param path + * Path + */ + public ArchiveDescription(final InputType type, final String path) + { + this.type = type; + this.path = path; + } - /** - * Returns the path. - * - * @return path - */ - public String getPath() { - return this.path; - } + /** + * Returns the path. + * + * @return path + */ + public String getPath() + { + return this.path; + } - /** - * Returns the start position. - * - * @return start position - */ - public long getStartPosition() { - return startPosition; - } + /** + * Returns the start position. + * + * @return start position + */ + public long getStartPosition() + { + return startPosition; + } - /** - * Returns the InputType. - * - * @return InputType - */ - public InputType getType() { - return this.type; - } + /** + * Returns the InputType. + * + * @return InputType + */ + public InputType getType() + { + return this.type; + } - /** - * Sets the start position. - * - * @param startPosition start position - */ - public void setStartPosition(final long startPosition) { - this.startPosition = startPosition; - } + /** + * Sets the start position. + * + * @param startPosition + * start position + */ + public void setStartPosition(final long startPosition) + { + this.startPosition = startPosition; + } - /** - * Returns the string representation of this object. - * - * @return [InputType, path] - */ - public String toString() { - return "[" + this.getType() + ", " + this.getPath() + "]"; - } + /** + * Returns the string representation of this object. + * + * @return [InputType, path] + */ + public String toString() + { + return "[" + this.getType() + ", " + this.getPath() + "]"; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveManager.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveManager.java index 42aeaa42..d4755c18 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveManager.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/ArchiveManager.java @@ -27,63 +27,70 @@ * * @version 0.5.0 */ -public class ArchiveManager { +public class ArchiveManager +{ - /** - * List of available archives - */ - private final List archives; + /** + * List of available archives + */ + private final List archives; - /** - * (Constructor) Creates the ArchiveManager. - * - * @throws ConfigurationException if an error occurs while accessing the configuration - */ - public ArchiveManager() throws ConfigurationException { + /** + * (Constructor) Creates the ArchiveManager. 
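A minimal construction example for ArchiveDescription (the dump path is hypothetical; the InputType tells the reader which decompression to apply):

    ArchiveDescription archive =
            new ArchiveDescription(InputType.BZIP2, "dumps/enwiki-pages-meta-history1.xml.bz2");
    System.out.println(archive);   // prints: [BZIP2, dumps/enwiki-pages-meta-history1.xml.bz2]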
+ * + * @throws ConfigurationException + * if an error occurs while accessing the configuration + */ + public ArchiveManager() throws ConfigurationException + { - ConfigurationManager config = ConfigurationManager.getInstance(); - this.archives = config.getArchiveList(); - } + ConfigurationManager config = ConfigurationManager.getInstance(); + this.archives = config.getArchiveList(); + } - /** - * Returns whether an archive is available or not. - * - * @return TRUE | FALSE - */ - public boolean hasArchive() { - return !this.archives.isEmpty(); - } + /** + * Returns whether an archive is available or not. + * + * @return TRUE | FALSE + */ + public boolean hasArchive() + { + return !this.archives.isEmpty(); + } - /** - * Returns an archive. - * - * @return ArchiveDescription or NULL if no archive is available - */ - public synchronized ArchiveDescription getArchive() { + /** + * Returns an archive. + * + * @return ArchiveDescription or NULL if no archive is available + */ + public synchronized ArchiveDescription getArchive() + { - if (!this.archives.isEmpty()) { + if (!this.archives.isEmpty()) { - return this.archives.remove(0); - } + return this.archives.remove(0); + } - return null; - } + return null; + } - /** - * Returns the number of remaining archives. - * - * @return number of available archives - */ - public int size() { - return this.archives.size(); - } + /** + * Returns the number of remaining archives. + * + * @return number of available archives + */ + public int size() + { + return this.archives.size(); + } - /** - * Returns the string representation of the ArchiveManager's content. - * - * @return [ number of archives ] - */ - public String toString() { - return "ArchiveManager:\t[" + this.size() + "]"; - } + /** + * Returns the string representation of the ArchiveManager's content. + * + * @return [ number of archives ] + */ + public String toString() + { + return "ArchiveManager:\t[" + this.size() + "]"; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/InputType.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/InputType.java index caebbb1d..7375ec99 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/InputType.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/archive/InputType.java @@ -20,43 +20,48 @@ /** * This class represents an enumeration of the input type. */ -public enum InputType { - - /** - * Uncompressed XML Input - */ - XML, - - /** - * SevenZip Compressed XML Input - */ - SEVENZIP, - - /** - * BZip2 Compressed XML Input - */ - BZIP2; - - /** - * Parses the string representation to the related InputType. - * - * @param s String representation of the InputType. - * @return InputType Enumerator - * @throws IllegalArgumentException if the parsed String does not match with one of the - * enumerators - */ - public static InputType parse(final String s) { - - String t = s.toUpperCase(); - - if (t.equals("XML")) { - return XML; - } else if (t.equals("SEVENZIP")) { - return SEVENZIP; - } else if (t.equals("BZIP2")) { - return BZIP2; - } +public enum InputType +{ + + /** + * Uncompressed XML Input + */ + XML, + + /** + * SevenZip Compressed XML Input + */ + SEVENZIP, + + /** + * BZip2 Compressed XML Input + */ + BZIP2; - throw new IllegalArgumentException("Unknown InputType : " + s); - } + /** + * Parses the string representation to the related InputType. 
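Since getArchive() removes the head of the configured list and returns null once the list is empty, a consumer can simply drain the manager; getArchive() is synchronized, so several reader threads may poll the same instance. A sketch, assuming the ConfigurationManager has already been loaded:

    ArchiveManager manager = new ArchiveManager();
    ArchiveDescription next;
    while ((next = manager.getArchive()) != null) {
        // hand `next` to an archive reader
    }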
+ * + * @param s + * String representation of the InputType. + * @return InputType Enumerator + * @throws IllegalArgumentException + * if the parsed String does not match with one of the enumerators + */ + public static InputType parse(final String s) + { + + String t = s.toUpperCase(); + + if (t.equals("XML")) { + return XML; + } + else if (t.equals("SEVENZIP")) { + return SEVENZIP; + } + else if (t.equals("BZIP2")) { + return BZIP2; + } + + throw new IllegalArgumentException("Unknown InputType : " + s); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitReader.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitReader.java index dbbc18e4..d7c0e93a 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitReader.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitReader.java @@ -24,124 +24,129 @@ /** * The BitReader buffers a byte-array. */ -public class BitReader { - - /** - * Current index in the byte array - */ - private int inputIndex; - - /** - * Byte input array - */ - private final byte[] input; - - /** - * Buffer used to store a single byte - */ - private int buffer; - - /** - * Length of the bits in the buffer that have not been read yet - */ - private int bufferLength; - - /** - * Constructor of the BitReader - * - * @param input byte input array - */ - public BitReader(final byte[] input) { - this.input = input; - - this.buffer = 0; - this.bufferLength = -1; - this.inputIndex = 0; - } - - /** - * Reads the next bit from the input. - * - * @return 0 or 1 - * @throws DecodingException if the decoding failed - */ - public int readBit() - throws DecodingException { - - if (bufferLength == -1) { - buffer = readByte(); - if (buffer == -1) { - return -1; - } - - bufferLength = 7; +public class BitReader +{ + + /** + * Current index in the byte array + */ + private int inputIndex; + + /** + * Byte input array + */ + private final byte[] input; + + /** + * Buffer used to store a single byte + */ + private int buffer; + + /** + * Length of the bits in the buffer that have not been read yet + */ + private int bufferLength; + + /** + * Constructor of the BitReader + * + * @param input + * byte input array + */ + public BitReader(final byte[] input) + { + this.input = input; + + this.buffer = 0; + this.bufferLength = -1; + this.inputIndex = 0; } - return (buffer >> bufferLength--) & 1; - } - - /** - * Reads the next length-bits from the input. - *

- * The maximum value of bits that could be read is 31. (Maximum value of a - * positive number that could be stored in an integer without any - * conversion.) - * - * @param length number of bits to read - * @return content as integer value or -1 if the end of the stream has been - * reached - * @throws DecodingException if the decoding failed - */ - public int read(final int length) - throws DecodingException { - - if (length > 31) { - throw ErrorFactory.createDecodingException( - ErrorKeys.DIFFTOOL_DECODING_VALUE_OUT_OF_RANGE, - "more than maximum length: " + length); + /** + * Reads the next bit from the input. + * + * @return 0 or 1 + * @throws DecodingException + * if the decoding failed + */ + public int readBit() throws DecodingException + { + + if (bufferLength == -1) { + buffer = readByte(); + if (buffer == -1) { + return -1; + } + + bufferLength = 7; + } + + return (buffer >> bufferLength--) & 1; } - int v, b = 0; - for (int i = length - 1; i >= 0; i--) { - v = readBit(); - if (v == -1) { - if (i != length - 1) { - throw ErrorFactory - .createDecodingException(ErrorKeys.DIFFTOOL_DECODING_UNEXPECTED_END_OF_STREAM); + /** + * Reads the next length-bits from the input. + *
+ * The maximum value of bits that could be read is 31. (Maximum value of a positive number that + * could be stored in an integer without any conversion.) + * + * @param length + * number of bits to read + * @return content as integer value or -1 if the end of the stream has been reached + * @throws DecodingException + * if the decoding failed + */ + public int read(final int length) throws DecodingException + { + + if (length > 31) { + throw ErrorFactory.createDecodingException( + ErrorKeys.DIFFTOOL_DECODING_VALUE_OUT_OF_RANGE, + "more than maximum length: " + length); + } + + int v, b = 0; + for (int i = length - 1; i >= 0; i--) { + v = readBit(); + if (v == -1) { + if (i != length - 1) { + throw ErrorFactory.createDecodingException( + ErrorKeys.DIFFTOOL_DECODING_UNEXPECTED_END_OF_STREAM); + } + + return -1; + } + b |= v << i; } - return -1; - } - b |= v << i; + return b; } - return b; - } - - /** - * Resets the buffer. - */ - public void skip() { - this.buffer = 0; - this.bufferLength = -1; - } - - /** - * Reads the next character in the input Note: The current content of the - * buffer will be deleted. This method should only be used for reading the - * textual content of the diff-part. - * - * @return the next character in the string - * @throws DecodingException if the decoding failed - */ - public int readByte() - throws DecodingException { - - skip(); - if (input == null || inputIndex >= input.length) { - return -1; + /** + * Resets the buffer. + */ + public void skip() + { + this.buffer = 0; + this.bufferLength = -1; } - return 0xFF & input[inputIndex++]; - } + /** + * Reads the next character in the input Note: The current content of the buffer will be + * deleted. This method should only be used for reading the textual content of the diff-part. + * + * @return the next character in the string + * @throws DecodingException + * if the decoding failed + */ + public int readByte() throws DecodingException + { + + skip(); + if (input == null || inputIndex >= input.length) { + return -1; + } + + return 0xFF & input[inputIndex++]; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitWriter.java index 52960051..e9bde6b4 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/BitWriter.java @@ -24,147 +24,160 @@ import org.dkpro.jwpl.revisionmachine.common.exceptions.ErrorKeys; /** - * The BitWriter buffers bit that will be written byte-by-byte to an output - * stream. + * The BitWriter buffers bit that will be written byte-by-byte to an output stream. */ -public class BitWriter { - - /** - * Output buffer - */ - private final ByteArrayOutputStream stream; - - /** - * Buffer to store the bits - */ - private int buffer; - - /** - * Number of stored bits - */ - private byte bufferLength = 0; - - /** - * Constructor Creates a BitWriter with a byte buffer of the given length. - * - * @param length Length of the byte buffer - */ - public BitWriter(final int length) { - this.stream = new ByteArrayOutputStream(length); - } - - /** - * Constructor Creates a BitWriter with a standard buffer. - */ - public BitWriter() { - this.stream = new ByteArrayOutputStream(); - } - - /** - * Writes a byte to the buffer. 
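A short usage sketch of the BitReader API shown above may help: read(length) assembles bits most-significant-bit first and returns -1 once the input is exhausted. The input byte 0xB4 is an arbitrary illustration, not a value taken from the DiffTool:

    import org.dkpro.jwpl.revisionmachine.difftool.data.codec.BitReader;

    public class BitReaderSketch {
        public static void main(String[] args) throws Exception {
            // 0xB4 = 1011 0100
            BitReader r = new BitReader(new byte[] { (byte) 0xB4 });
            System.out.println(r.read(5));  // 10110 -> 22
            System.out.println(r.read(3));  // 100   -> 4
            System.out.println(r.read(3));  // input exhausted -> -1
        }
    }
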
- * - * @param val an integer representing a full byte - * @throws EncodingException if the value is out range - */ - private void write(final int val) - throws EncodingException { - - if (val < 0 || val > 255) { - throw ErrorFactory.createEncodingException( - ErrorKeys.DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, - "byte value out of range: " + val); +public class BitWriter +{ + + /** + * Output buffer + */ + private final ByteArrayOutputStream stream; + + /** + * Buffer to store the bits + */ + private int buffer; + + /** + * Number of stored bits + */ + private byte bufferLength = 0; + + /** + * Constructor Creates a BitWriter with a byte buffer of the given length. + * + * @param length + * Length of the byte buffer + */ + public BitWriter(final int length) + { + this.stream = new ByteArrayOutputStream(length); } - this.stream.write(val); - } - - /** - * Writes a single bit to the buffer. - * - * @param bit 0 or 1 - * @throws EncodingException if the input is neither 0 nor 1. - */ - public void writeBit(final int bit) - throws EncodingException { - - if (bit != 0 && bit != 1) { - throw ErrorFactory.createEncodingException( - ErrorKeys.DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, - "bit value out of range: " + bit); + /** + * Constructor Creates a BitWriter with a standard buffer. + */ + public BitWriter() + { + this.stream = new ByteArrayOutputStream(); } - this.buffer |= bit << (7 - this.bufferLength); - this.bufferLength++; - - if (bufferLength == 8) { - - write(buffer); - - this.bufferLength = 0; - this.buffer = 0; + /** + * Writes a byte to the buffer. + * + * @param val + * an integer representing a full byte + * @throws EncodingException + * if the value is out range + */ + private void write(final int val) throws EncodingException + { + + if (val < 0 || val > 255) { + throw ErrorFactory.createEncodingException( + ErrorKeys.DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, + "byte value out of range: " + val); + } + + this.stream.write(val); } - } - - /** - * Writes a positive integer to the buffer. - * - * @param length the number of bits to write - * @param value an integer value - * @throws EncodingException if the length of the input is more than 31 bits. - */ - public void writeValue(final int length, final int value) - throws EncodingException { - if (length > 31) { - throw ErrorFactory.createEncodingException( - ErrorKeys.DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, - "more than maximum length: " + value); + + /** + * Writes a single bit to the buffer. + * + * @param bit + * 0 or 1 + * @throws EncodingException + * if the input is neither 0 nor 1. + */ + public void writeBit(final int bit) throws EncodingException + { + + if (bit != 0 && bit != 1) { + throw ErrorFactory.createEncodingException( + ErrorKeys.DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, + "bit value out of range: " + bit); + } + + this.buffer |= bit << (7 - this.bufferLength); + this.bufferLength++; + + if (bufferLength == 8) { + + write(buffer); + + this.bufferLength = 0; + this.buffer = 0; + } } - for (int i = length - 1; i >= 0; i--) { - writeBit((value >> i) & 1); + /** + * Writes a positive integer to the buffer. + * + * @param length + * the number of bits to write + * @param value + * an integer value + * @throws EncodingException + * if the length of the input is more than 31 bits. 
+ */ + public void writeValue(final int length, final int value) throws EncodingException + { + if (length > 31) { + throw ErrorFactory.createEncodingException( + ErrorKeys.DIFFTOOL_ENCODING_VALUE_OUT_OF_RANGE, + "more than maximum length: " + value); + } + + for (int i = length - 1; i >= 0; i--) { + writeBit((value >> i) & 1); + } } - } - - /** - * Writes the byte array to the buffer. The currently used buffer will be - * filled with zero bits before it is written in front of the byte-array. - * - * @param bText byte array - * @throws EncodingException if the writing fails - */ - public void write(final byte[] bText) - throws EncodingException { - - writeFillBits(); - - int l = bText.length; - for (int i = 0; i < l; i++) { - write(0xFF & bText[i]); + + /** + * Writes the byte array to the buffer. The currently used buffer will be filled with zero bits + * before it is written in front of the byte-array. + * + * @param bText + * byte array + * @throws EncodingException + * if the writing fails + */ + public void write(final byte[] bText) throws EncodingException + { + + writeFillBits(); + + int l = bText.length; + for (int i = 0; i < l; i++) { + write(0xFF & bText[i]); + } } - } - - /** - * The currently used buffer will be filled with zero bits before it is - * written in the buffer. - * - * @throws EncodingException if the writing fails - */ - public void writeFillBits() - throws EncodingException { - - while (this.bufferLength != 0) { - writeBit(0); + + /** + * The currently used buffer will be filled with zero bits before it is written in the buffer. + * + * @throws EncodingException + * if the writing fails + */ + public void writeFillBits() throws EncodingException + { + + while (this.bufferLength != 0) { + writeBit(0); + } + + this.buffer = 0; } - this.buffer = 0; - } - - /** - * Returns the content of the buffer as byte-array. - * - * @return byte-array - */ - public byte[] toByteArray() { - return this.stream.toByteArray(); - } + /** + * Returns the content of the buffer as byte-array. + * + * @return byte-array + */ + public byte[] toByteArray() + { + return this.stream.toByteArray(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionCodecData.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionCodecData.java index fd2dfd32..6c823bc8 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionCodecData.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionCodecData.java @@ -18,239 +18,251 @@ package org.dkpro.jwpl.revisionmachine.difftool.data.codec; /** - * The RevisionCodecData class contains all necessary information to encode the - * diff information. + * The RevisionCodecData class contains all necessary information to encode the diff information. *
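Since BitWriter is the mirror image of BitReader, a minimal round-trip sketch illustrates how the two cooperate; the 3-bit/9-bit split is arbitrary and only chosen for the example:

    import org.dkpro.jwpl.revisionmachine.difftool.data.codec.BitReader;
    import org.dkpro.jwpl.revisionmachine.difftool.data.codec.BitWriter;

    public class BitRoundTripSketch {
        public static void main(String[] args) throws Exception {
            BitWriter w = new BitWriter();
            w.writeValue(3, 5);       // writes 101
            w.writeValue(9, 300);     // writes 100101100
            w.writeFillBits();        // pads the last byte with zero bits

            byte[] packed = w.toByteArray();  // 2 bytes for 12 payload bits

            BitReader r = new BitReader(packed);
            System.out.println(r.read(3));    // 5
            System.out.println(r.read(9));    // 300
        }
    }

Note that writeFillBits() (or a write(byte[]), which calls it) has to run before toByteArray(); otherwise the trailing partial byte never leaves the bit buffer.
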
- * Block C 3bit operation value Block S start position Block E length (end - * position = start position + length) Block B block id Block L length of the t - * block Block T block containing L bytes data + * Block C 3bit operation value Block S start position Block E length (end position = start position + * + length) Block B block id Block L length of the t block Block T block containing L bytes data */ -public class RevisionCodecData { - - /** - * Maximum size of a S block - */ - private int blocksize_S; - - /** - * Maximum size of a E block - */ - private int blocksize_E; - - /** - * Maximum size of an B block - */ - private int blocksize_B; - - /** - * Maximum size of an L block - */ - private int blocksize_L; - - /** - * Number of C blocks - */ - private int countC; - - /** - * Number of S blocks - */ - private int countS; - - /** - * Number of E blocks - */ - private int countE; - - /** - * Number of B blocks - */ - private int countB; - - /** - * Number of L blocks - */ - private int countL; - - /** - * Number of T blocks - */ - private int countT; - - /** - * Whether the information has already been converted or not - */ - private boolean converted; - - /** - * Constructor Creates a new RevisionCodecData object. - */ - public RevisionCodecData() { - this.converted = false; - this.blocksize_S = 0; - this.blocksize_E = 0; - this.blocksize_B = 0; - this.blocksize_L = 0; - } - - /** - * Gathers the information about an s block. - * - * @param value start position - */ - public void checkBlocksizeS(final int value) { - if (value > blocksize_S) { - this.blocksize_S = value; +public class RevisionCodecData +{ + + /** + * Maximum size of a S block + */ + private int blocksize_S; + + /** + * Maximum size of a E block + */ + private int blocksize_E; + + /** + * Maximum size of an B block + */ + private int blocksize_B; + + /** + * Maximum size of an L block + */ + private int blocksize_L; + + /** + * Number of C blocks + */ + private int countC; + + /** + * Number of S blocks + */ + private int countS; + + /** + * Number of E blocks + */ + private int countE; + + /** + * Number of B blocks + */ + private int countB; + + /** + * Number of L blocks + */ + private int countL; + + /** + * Number of T blocks + */ + private int countT; + + /** + * Whether the information has already been converted or not + */ + private boolean converted; + + /** + * Constructor Creates a new RevisionCodecData object. + */ + public RevisionCodecData() + { + this.converted = false; + this.blocksize_S = 0; + this.blocksize_E = 0; + this.blocksize_B = 0; + this.blocksize_L = 0; } - this.countS++; - this.countC++; - } - - /** - * Gathers the information about an e block. - * - * @param value length of the diff-block - */ - public void checkBlocksizeE(final int value) { - if (value > blocksize_E) { - this.blocksize_E = value; + + /** + * Gathers the information about an s block. + * + * @param value + * start position + */ + public void checkBlocksizeS(final int value) + { + if (value > blocksize_S) { + this.blocksize_S = value; + } + this.countS++; + this.countC++; + } + + /** + * Gathers the information about an e block. + * + * @param value + * length of the diff-block + */ + public void checkBlocksizeE(final int value) + { + if (value > blocksize_E) { + this.blocksize_E = value; + } + this.countE++; } - this.countE++; - } - - /** - * Gathers the information about a b block. 
- * - * @param value block id - */ - public void checkBlocksizeB(final int value) { - if (value > blocksize_B) { - this.blocksize_B = value; + + /** + * Gathers the information about a b block. + * + * @param value + * block id + */ + public void checkBlocksizeB(final int value) + { + if (value > blocksize_B) { + this.blocksize_B = value; + } + this.countB++; } - this.countB++; - } - - /** - * Gathers the information about an l block. - * - * @param value length of the text block - */ - public void checkBlocksizeL(final int value) { - if (value > blocksize_L) { - this.blocksize_L = value; + + /** + * Gathers the information about an l block. + * + * @param value + * length of the text block + */ + public void checkBlocksizeL(final int value) + { + if (value > blocksize_L) { + this.blocksize_L = value; + } + this.countL++; + this.countT += value; } - this.countL++; - this.countT += value; - } - - /** - * Converts the input information into their log2 values. If an operation is - * contained in the diff, the minimum number of bits used to encode this - * block is 1 byte. - * - * @return number of bytes needed to encode the associated diff - */ - public int totalSizeInBits() { - - if (converted) { - - return 24 + this.countC * 3 + this.countS * blocksize_S - + this.countE * blocksize_E + this.countB * blocksize_B - + this.countL * blocksize_L + this.countT * 8; + + /** + * Converts the input information into their log2 values. If an operation is contained in the + * diff, the minimum number of bits used to encode this block is 1 byte. + * + * @return number of bytes needed to encode the associated diff + */ + public int totalSizeInBits() + { + + if (converted) { + + return 24 + this.countC * 3 + this.countS * blocksize_S + this.countE * blocksize_E + + this.countB * blocksize_B + this.countL * blocksize_L + this.countT * 8; + } + + converted = true; + // System.out.println(this.toString()); + + if (this.blocksize_B > 0) { + this.blocksize_B = (int) Math.ceil(Math.log(blocksize_B + 1) / Math.log(2.)); + } + else if (this.countB > 0) { + this.blocksize_B = 1; + } + + if (this.blocksize_E > 0) { + this.blocksize_E = (int) Math.ceil(Math.log(blocksize_E + 1) / Math.log(2.)); + } + else if (this.countE > 0) { + this.blocksize_E = 1; + } + + if (this.blocksize_L > 0) { + this.blocksize_L = (int) Math.ceil(Math.log(blocksize_L + 1) / Math.log(2.)); + } + else if (this.countL > 0) { + this.blocksize_L = 1; + } + + if (this.blocksize_S > 0) { + this.blocksize_S = (int) Math.ceil(Math.log(blocksize_S + 1) / Math.log(2.)); + } + else if (this.countS > 0) { + this.blocksize_S = 1; + } + + return 24 + this.countC * 3 + this.countS * blocksize_S + this.countE * blocksize_E + + this.countB * blocksize_B + this.countL * blocksize_L + this.countT * 8; } - converted = true; - // System.out.println(this.toString()); + /** + * Returns the number of bits used to encode a B block. This method is intended to be used after + * the conversion. + * + * @return block bit-length + */ + public int getBlocksizeB() + { + return this.blocksize_B; + } - if (this.blocksize_B > 0) { - this.blocksize_B = (int) Math.ceil(Math.log(blocksize_B + 1) - / Math.log(2.)); - } else if (this.countB > 0) { - this.blocksize_B = 1; + /** + * Returns the number of bits used to encode a E block. This method is intended to be used after + * the conversion. 
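A worked example of the sizing logic above: each checkBlocksize* call records the largest value seen for its block type and counts blocks, and totalSizeInBits() then replaces each maximum by the number of bits needed to store it, ceil(log2(max + 1)). The concrete values below are purely illustrative:

    import org.dkpro.jwpl.revisionmachine.difftool.data.codec.RevisionCodecData;

    public class CodecDataSketch {
        public static void main(String[] args) {
            RevisionCodecData codec = new RevisionCodecData();
            codec.checkBlocksizeS(300);  // e.g. an INSERT starting at offset 300 ...
            codec.checkBlocksizeL(42);   // ... carrying 42 bytes of text

            // S blocks need ceil(log2(301)) = 9 bits, L blocks ceil(log2(43)) = 6 bits.
            // Total: 24 (codec header) + 1*3 (C) + 1*9 (S) + 1*6 (L) + 42*8 (T) = 378 bits.
            System.out.println(codec.totalSizeInBits());  // 378
            System.out.println(codec.getBlocksizeS());    // 9
            System.out.println(codec.getBlocksizeL());    // 6
        }
    }

Despite the Javadoc's wording ("number of bytes"), the value returned by totalSizeInBits() is a bit count, as the method name suggests.
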
+ * + * @return block bit-length + */ + public int getBlocksizeE() + { + return this.blocksize_E; } - if (this.blocksize_E > 0) { - this.blocksize_E = (int) Math.ceil(Math.log(blocksize_E + 1) - / Math.log(2.)); - } else if (this.countE > 0) { - this.blocksize_E = 1; + /** + * Returns the number of bits used to encode a L block. This method is intended to be used after + * the conversion. + * + * @return block bit-length + */ + public int getBlocksizeL() + { + return this.blocksize_L; } - if (this.blocksize_L > 0) { - this.blocksize_L = (int) Math.ceil(Math.log(blocksize_L + 1) - / Math.log(2.)); - } else if (this.countL > 0) { - this.blocksize_L = 1; + /** + * Returns the number of bits used to encode a S block. This method is intended to be used after + * the conversion. + * + * @return block bit-length + */ + public int getBlocksizeS() + { + return this.blocksize_S; } - if (this.blocksize_S > 0) { - this.blocksize_S = (int) Math.ceil(Math.log(blocksize_S + 1) - / Math.log(2.)); - } else if (this.countS > 0) { - this.blocksize_S = 1; + /** + * String representation of the revision codec data. + * + * @return string representation + */ + public String toString() + { + return this.blocksize_S + " " + this.blocksize_E + " " + this.blocksize_B + " " + + this.blocksize_L; } - return 24 + this.countC * 3 + this.countS * blocksize_S + this.countE - * blocksize_E + this.countB * blocksize_B + this.countL - * blocksize_L + this.countT * 8; - } - - /** - * Returns the number of bits used to encode a B block. This method is - * intended to be used after the conversion. - * - * @return block bit-length - */ - public int getBlocksizeB() { - return this.blocksize_B; - } - - /** - * Returns the number of bits used to encode a E block. This method is - * intended to be used after the conversion. - * - * @return block bit-length - */ - public int getBlocksizeE() { - return this.blocksize_E; - } - - /** - * Returns the number of bits used to encode a L block. This method is - * intended to be used after the conversion. - * - * @return block bit-length - */ - public int getBlocksizeL() { - return this.blocksize_L; - } - - /** - * Returns the number of bits used to encode a S block. This method is - * intended to be used after the conversion. - * - * @return block bit-length - */ - public int getBlocksizeS() { - return this.blocksize_S; - } - - /** - * String representation of the revision codec data. - * - * @return string representation - */ - public String toString() { - return this.blocksize_S + " " + this.blocksize_E + " " - + this.blocksize_B + " " + this.blocksize_L; - } - - /** - * Whether the information has already converted to the log2 basis or not. - * - * @return conversion information - */ - public boolean isConverted() { - return this.converted; - } + /** + * Whether the information has already converted to the log2 basis or not. 
+ * + * @return conversion information + */ + public boolean isConverted() + { + return this.converted; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionDecoder.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionDecoder.java index d395201d..eaedd171 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionDecoder.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionDecoder.java @@ -34,485 +34,532 @@ import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.DiffPart; /** - * The RevisionDecoder class contains methods to decode an encoded diff - * information. + * The RevisionDecoder class contains methods to decode an encoded diff information. */ -public class RevisionDecoder { - - /** - * Reference to the BitReader - */ - private BitReader r; - - /** - * Configuration Parameter - Wikipedia Encoding - */ - private final String WIKIPEDIA_ENCODING; - - /** - * Creates a new RevisionDecoder object. - * - * @throws ConfigurationException if an error occurs while accessing the configuration - * parameters - */ - private RevisionDecoder() throws ConfigurationException { - - // Load config parameters - ConfigurationManager config = ConfigurationManager.getInstance(); - - WIKIPEDIA_ENCODING = (String) config.getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - } - - /** - * Creates a new RevisionDecoder object. - * - * @param wikipediaEncoding Character encoding - */ - public RevisionDecoder(final String wikipediaEncoding) { - - WIKIPEDIA_ENCODING = wikipediaEncoding; - } - - /** - * Creates a new RevisionDecoder object. - * - * @param input binary encoded diff - * @throws ConfigurationException if an error occurs while accessing the configuration - * parameters - */ - public RevisionDecoder(final byte[] input) - throws ConfigurationException { - - this(); - if (input[0] == -128) { - r = new BitReader(inflateInput(input, 1)); - } else { - r = new BitReader(input); +public class RevisionDecoder +{ + + /** + * Reference to the BitReader + */ + private BitReader r; + + /** + * Configuration Parameter - Wikipedia Encoding + */ + private final String WIKIPEDIA_ENCODING; + + /** + * Creates a new RevisionDecoder object. + * + * @throws ConfigurationException + * if an error occurs while accessing the configuration parameters + */ + private RevisionDecoder() throws ConfigurationException + { + + // Load config parameters + ConfigurationManager config = ConfigurationManager.getInstance(); + + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); } - } - /** - * Decodes the information and returns the Diff. - * - * @return Diff - * @throws UnsupportedEncodingException if the character encoding is unsupported - * @throws DecodingException if the decoding failed - */ - public Diff decode() throws UnsupportedEncodingException, DecodingException { + /** + * Creates a new RevisionDecoder object. + * + * @param wikipediaEncoding + * Character encoding + */ + public RevisionDecoder(final String wikipediaEncoding) + { - int header = r.read(3); - if (DiffAction.parse(header) != DiffAction.DECODER_DATA) { + WIKIPEDIA_ENCODING = wikipediaEncoding; + } - throw new DecodingException("Invalid codecData code: " + header); + /** + * Creates a new RevisionDecoder object. 
+ * + * @param input + * binary encoded diff + * @throws ConfigurationException + * if an error occurs while accessing the configuration parameters + */ + public RevisionDecoder(final byte[] input) throws ConfigurationException + { + + this(); + if (input[0] == -128) { + r = new BitReader(inflateInput(input, 1)); + } + else { + r = new BitReader(input); + } } - int blockSize_C = 3; - int blockSize_S = r.read(5); - int blockSize_E = r.read(5); - int blockSize_B = r.read(5); - int blockSize_L = r.read(5); - r.read(1); + /** + * Decodes the information and returns the Diff. + * + * @return Diff + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + * @throws DecodingException + * if the decoding failed + */ + public Diff decode() throws UnsupportedEncodingException, DecodingException + { + + int header = r.read(3); + if (DiffAction.parse(header) != DiffAction.DECODER_DATA) { + + throw new DecodingException("Invalid codecData code: " + header); + } - if (blockSize_S < 0 || blockSize_S > 31) { - throw new DecodingException("blockSize_S out of range: " - + blockSize_S); - } - if (blockSize_E < 0 || blockSize_E > 31) { - throw new DecodingException("blockSize_E out of range: " - + blockSize_E); - } - if (blockSize_B < 0 || blockSize_B > 31) { - throw new DecodingException("blockSize_B out of range: " - + blockSize_B); - } - if (blockSize_L < 0 || blockSize_L > 31) { - throw new DecodingException("blockSize_L out of range: " - + blockSize_L); - } + int blockSize_C = 3; + int blockSize_S = r.read(5); + int blockSize_E = r.read(5); + int blockSize_B = r.read(5); + int blockSize_L = r.read(5); + r.read(1); - return decode(blockSize_C, blockSize_S, blockSize_E, blockSize_B, - blockSize_L); - } - - /** - * Decodes the information, after the codec was successfully decoded, and - * returns the Diff. 
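The decode() method above fixes the layout of the codec header: a 3-bit DECODER_DATA code, four 5-bit block widths (S, E, B, L) and one fill bit, i.e. 3 + 4*5 + 1 = 24 bits, which is where the constant 24 in RevisionCodecData.totalSizeInBits() comes from. A sketch of reading that header by hand; the byte array is assumed to hold an already inflated, binary-encoded diff:

    import org.dkpro.jwpl.revisionmachine.difftool.data.codec.BitReader;

    public class CodecHeaderSketch {
        static void dumpHeader(byte[] encoded) throws Exception {
            BitReader r = new BitReader(encoded);
            int action = r.read(3);  // expected to parse to DiffAction.DECODER_DATA
            int sBits = r.read(5);   // width of S blocks (start positions)
            int eBits = r.read(5);   // width of E blocks (lengths)
            int bBits = r.read(5);   // width of B blocks (block ids)
            int lBits = r.read(5);   // width of L blocks (text lengths)
            r.read(1);               // fill bit
            System.out.printf("action=%d S=%d E=%d B=%d L=%d%n",
                    action, sBits, eBits, bBits, lBits);
        }
    }
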
- * - * @param blockSize_C length of a C block - * @param blockSize_S length of a S block - * @param blockSize_E length of a E block - * @param blockSize_B length of a B block - * @param blockSize_L length of a L block - * @return Diff - * @throws UnsupportedEncodingException if the character encoding is unsupported - * @throws DecodingException if the decoding failed - */ - private Diff decode(final int blockSize_C, final int blockSize_S, - final int blockSize_E, final int blockSize_B, final int blockSize_L) - throws UnsupportedEncodingException, DecodingException { - - int code = r.read(blockSize_C); - Diff diff = new Diff(); - - while (code != -1) { - // System.out.print(code + "\t"); - - switch (DiffAction.parse(code)) { - case FULL_REVISION_UNCOMPRESSED: - diff.add(decodeFullRevision(blockSize_L)); - break; - case INSERT: - diff.add(decodeAdd(blockSize_S, blockSize_L)); - break; - case DELETE: - diff.add(decodeDelete(blockSize_S, blockSize_E)); - break; - case REPLACE: - diff.add(decodeReplace(blockSize_S, blockSize_E, blockSize_L)); - break; - case CUT: - diff.add(decodeCut(blockSize_S, blockSize_E, blockSize_B)); - break; - case PASTE: - diff.add(decodePaste(blockSize_S, blockSize_B, r)); - break; - default: - throw new DecodingException("Invalid block_c code: " + code); - } - - // System.out.println(); - code = r.read(blockSize_C); - } + if (blockSize_S < 0 || blockSize_S > 31) { + throw new DecodingException("blockSize_S out of range: " + blockSize_S); + } + if (blockSize_E < 0 || blockSize_E > 31) { + throw new DecodingException("blockSize_E out of range: " + blockSize_E); + } + if (blockSize_B < 0 || blockSize_B > 31) { + throw new DecodingException("blockSize_B out of range: " + blockSize_B); + } + if (blockSize_L < 0 || blockSize_L > 31) { + throw new DecodingException("blockSize_L out of range: " + blockSize_L); + } - return diff; - } - - /** - * Decodes an Add operation. - * - * @param blockSize_S length of a S block - * @param blockSize_L length of a L block - * @return DiffPart, Add operation - * @throws UnsupportedEncodingException if the character encoding is unsupported - * @throws DecodingException if the decoding failed - */ - private DiffPart decodeAdd(final int blockSize_S, final int blockSize_L) - throws UnsupportedEncodingException, DecodingException { - - if (blockSize_S < 1 || blockSize_L < 1) { - throw new DecodingException("Invalid value for blockSize_S: " - + blockSize_S + " or blockSize_L: " + blockSize_L); + return decode(blockSize_C, blockSize_S, blockSize_E, blockSize_B, blockSize_L); } - int s = r.read(blockSize_S); - int l = r.read(blockSize_L); + /** + * Decodes the information, after the codec was successfully decoded, and returns the Diff. 
+ * + * @param blockSize_C + * length of a C block + * @param blockSize_S + * length of a S block + * @param blockSize_E + * length of a E block + * @param blockSize_B + * length of a B block + * @param blockSize_L + * length of a L block + * @return Diff + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + * @throws DecodingException + * if the decoding failed + */ + private Diff decode(final int blockSize_C, final int blockSize_S, final int blockSize_E, + final int blockSize_B, final int blockSize_L) + throws UnsupportedEncodingException, DecodingException + { + + int code = r.read(blockSize_C); + Diff diff = new Diff(); + + while (code != -1) { + // System.out.print(code + "\t"); + + switch (DiffAction.parse(code)) { + case FULL_REVISION_UNCOMPRESSED: + diff.add(decodeFullRevision(blockSize_L)); + break; + case INSERT: + diff.add(decodeAdd(blockSize_S, blockSize_L)); + break; + case DELETE: + diff.add(decodeDelete(blockSize_S, blockSize_E)); + break; + case REPLACE: + diff.add(decodeReplace(blockSize_S, blockSize_E, blockSize_L)); + break; + case CUT: + diff.add(decodeCut(blockSize_S, blockSize_E, blockSize_B)); + break; + case PASTE: + diff.add(decodePaste(blockSize_S, blockSize_B, r)); + break; + default: + throw new DecodingException("Invalid block_c code: " + code); + } + + // System.out.println(); + code = r.read(blockSize_C); + } - ByteArrayOutputStream output = new ByteArrayOutputStream(); - for (int i = 0; i < l; i++) { - output.write(r.readByte()); + return diff; } - DiffPart part = new DiffPart(DiffAction.INSERT); - part.setStart(s); - part.setText(output.toString(WIKIPEDIA_ENCODING)); - - return part; - } - - /** - * Decodes a Cut operation. - * - * @param blockSize_S length of a S block - * @param blockSize_E length of a E block - * @param blockSize_B length of a B block - * @return DiffPart, Cut operation - * @throws DecodingException if the decoding failed - */ - private DiffPart decodeCut(final int blockSize_S, final int blockSize_E, - final int blockSize_B) - throws DecodingException { - - if (blockSize_S < 1 || blockSize_E < 1 || blockSize_B < 1) { - throw new DecodingException("Invalid value for blockSize_S: " - + blockSize_S + ", blockSize_E: " + blockSize_E - + " or blockSize_B: " + blockSize_B); - } + /** + * Decodes an Add operation. + * + * @param blockSize_S + * length of a S block + * @param blockSize_L + * length of a L block + * @return DiffPart, Add operation + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + * @throws DecodingException + * if the decoding failed + */ + private DiffPart decodeAdd(final int blockSize_S, final int blockSize_L) + throws UnsupportedEncodingException, DecodingException + { + + if (blockSize_S < 1 || blockSize_L < 1) { + throw new DecodingException("Invalid value for blockSize_S: " + blockSize_S + + " or blockSize_L: " + blockSize_L); + } - int s = r.read(blockSize_S); - int e = r.read(blockSize_E); - int b = r.read(blockSize_B); - - DiffPart part = new DiffPart(DiffAction.CUT); - part.setStart(s); - part.setLength(e); - part.setText(Integer.toString(b)); - - r.skip(); - - return part; - } - - /** - * Decodes a Delete operation. 
- * - * @param blockSize_S length of a S block - * @param blockSize_E length of a E block - * @return DiffPart, Delete operation - * @throws DecodingException if the decoding failed - */ - private DiffPart decodeDelete(final int blockSize_S, final int blockSize_E) - throws DecodingException { - - if (blockSize_S < 1 || blockSize_E < 1) { - throw new DecodingException("Invalid value for blockSize_S: " - + blockSize_S + " or blockSize_E: " + blockSize_E); - } + int s = r.read(blockSize_S); + int l = r.read(blockSize_L); - int s = r.read(blockSize_S); - int e = r.read(blockSize_E); + ByteArrayOutputStream output = new ByteArrayOutputStream(); + for (int i = 0; i < l; i++) { + output.write(r.readByte()); + } - DiffPart part = new DiffPart(DiffAction.DELETE); - part.setStart(s); - part.setLength(e); + DiffPart part = new DiffPart(DiffAction.INSERT); + part.setStart(s); + part.setText(output.toString(WIKIPEDIA_ENCODING)); - r.skip(); + return part; + } - return part; - } + /** + * Decodes a Cut operation. + * + * @param blockSize_S + * length of a S block + * @param blockSize_E + * length of a E block + * @param blockSize_B + * length of a B block + * @return DiffPart, Cut operation + * @throws DecodingException + * if the decoding failed + */ + private DiffPart decodeCut(final int blockSize_S, final int blockSize_E, final int blockSize_B) + throws DecodingException + { + + if (blockSize_S < 1 || blockSize_E < 1 || blockSize_B < 1) { + throw new DecodingException("Invalid value for blockSize_S: " + blockSize_S + + ", blockSize_E: " + blockSize_E + " or blockSize_B: " + blockSize_B); + } - /** - * Decodes a FullRevision operation. - * - * @param blockSize_L length of a L block - * @return DiffPart, FullRevision - * @throws UnsupportedEncodingException if the character encoding is unsupported - * @throws DecodingException if the decoding failed - */ - private DiffPart decodeFullRevision(final int blockSize_L) - throws UnsupportedEncodingException, DecodingException { + int s = r.read(blockSize_S); + int e = r.read(blockSize_E); + int b = r.read(blockSize_B); - if (blockSize_L < 1) { - throw new DecodingException("Invalid value for blockSize_L: " - + blockSize_L); - } + DiffPart part = new DiffPart(DiffAction.CUT); + part.setStart(s); + part.setLength(e); + part.setText(Integer.toString(b)); - int l = r.read(blockSize_L); + r.skip(); - ByteArrayOutputStream output = new ByteArrayOutputStream(); - for (int i = 0; i < l; i++) { - output.write(r.readByte()); + return part; } - DiffPart part = new DiffPart(DiffAction.FULL_REVISION_UNCOMPRESSED); - part.setText(output.toString(WIKIPEDIA_ENCODING)); - - return part; - } - - /** - * Decodes a Paste operation. - * - * @param blockSize_S length of a S block - * @param blockSize_B length of a B block - * @return DiffPart, Paste operation - * @throws DecodingException if the decoding failed - */ - private DiffPart decodePaste(final int blockSize_S, final int blockSize_B, - final BitReader r) - throws DecodingException { - - if (blockSize_S < 1 || blockSize_B < 1) { - throw new DecodingException("Invalid value for blockSize_S: " - + blockSize_S + " or blockSize_B: " + blockSize_B); + + /** + * Decodes a Delete operation. 
+ * + * @param blockSize_S + * length of a S block + * @param blockSize_E + * length of a E block + * @return DiffPart, Delete operation + * @throws DecodingException + * if the decoding failed + */ + private DiffPart decodeDelete(final int blockSize_S, final int blockSize_E) + throws DecodingException + { + + if (blockSize_S < 1 || blockSize_E < 1) { + throw new DecodingException("Invalid value for blockSize_S: " + blockSize_S + + " or blockSize_E: " + blockSize_E); + } + + int s = r.read(blockSize_S); + int e = r.read(blockSize_E); + + DiffPart part = new DiffPart(DiffAction.DELETE); + part.setStart(s); + part.setLength(e); + + r.skip(); + + return part; } - int s = r.read(blockSize_S); - int b = r.read(blockSize_B); - - DiffPart part = new DiffPart(DiffAction.PASTE); - part.setStart(s); - part.setText(Integer.toString(b)); - - r.skip(); - - return part; - } - - /** - * Decodes a Replace operation. - * - * @param blockSize_S length of a S block - * @param blockSize_E length of a E block - * @param blockSize_L length of a L block - * @return DiffPart, Replace operation - * @throws UnsupportedEncodingException if the character encoding is unsupported - * @throws DecodingException if the decoding failed - */ - private DiffPart decodeReplace(final int blockSize_S, - final int blockSize_E, final int blockSize_L) - throws UnsupportedEncodingException, DecodingException { - - if (blockSize_S < 1 || blockSize_E < 1 || blockSize_L < 1) { - throw new DecodingException("Invalid value for blockSize_S: " - + blockSize_S + ", blockSize_E: " + blockSize_E - + " or blockSize_L: " + blockSize_L); + /** + * Decodes a FullRevision operation. + * + * @param blockSize_L + * length of a L block + * @return DiffPart, FullRevision + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + * @throws DecodingException + * if the decoding failed + */ + private DiffPart decodeFullRevision(final int blockSize_L) + throws UnsupportedEncodingException, DecodingException + { + + if (blockSize_L < 1) { + throw new DecodingException("Invalid value for blockSize_L: " + blockSize_L); + } + + int l = r.read(blockSize_L); + + ByteArrayOutputStream output = new ByteArrayOutputStream(); + for (int i = 0; i < l; i++) { + output.write(r.readByte()); + } + DiffPart part = new DiffPart(DiffAction.FULL_REVISION_UNCOMPRESSED); + part.setText(output.toString(WIKIPEDIA_ENCODING)); + + return part; } - int s = r.read(blockSize_S); - int e = r.read(blockSize_E); - int l = r.read(blockSize_L); + /** + * Decodes a Paste operation. 
+ * + * @param blockSize_S + * length of a S block + * @param blockSize_B + * length of a B block + * @return DiffPart, Paste operation + * @throws DecodingException + * if the decoding failed + */ + private DiffPart decodePaste(final int blockSize_S, final int blockSize_B, final BitReader r) + throws DecodingException + { + + if (blockSize_S < 1 || blockSize_B < 1) { + throw new DecodingException("Invalid value for blockSize_S: " + blockSize_S + + " or blockSize_B: " + blockSize_B); + } + + int s = r.read(blockSize_S); + int b = r.read(blockSize_B); + + DiffPart part = new DiffPart(DiffAction.PASTE); + part.setStart(s); + part.setText(Integer.toString(b)); + + r.skip(); - ByteArrayOutputStream output = new ByteArrayOutputStream(); - for (int i = 0; i < l; i++) { - output.write(r.readByte()); + return part; } - DiffPart part = new DiffPart(DiffAction.REPLACE); - part.setStart(s); - part.setLength(e); - part.setText(output.toString(WIKIPEDIA_ENCODING)); - - return part; - } - - /** - * Inflates the zipped input. - * - * @param zipinput zipped input - * @param start start position - * @return inflated input - */ - private byte[] inflateInput(final byte[] zipinput, final int start) { - ByteArrayOutputStream stream; - try { - byte[] compressedInput = zipinput; - Inflater decompresser = new Inflater(); - decompresser.setInput(compressedInput, start, - compressedInput.length - start); - - byte[] output = new byte[1000]; - stream = new ByteArrayOutputStream(); - - int cLength; - do { - cLength = decompresser.inflate(output); - stream.write(output, 0, cLength); - } - while (cLength == 1000); - - } catch (DataFormatException e) { - throw new RuntimeException(e); + /** + * Decodes a Replace operation. + * + * @param blockSize_S + * length of a S block + * @param blockSize_E + * length of a E block + * @param blockSize_L + * length of a L block + * @return DiffPart, Replace operation + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + * @throws DecodingException + * if the decoding failed + */ + private DiffPart decodeReplace(final int blockSize_S, final int blockSize_E, + final int blockSize_L) + throws UnsupportedEncodingException, DecodingException + { + + if (blockSize_S < 1 || blockSize_E < 1 || blockSize_L < 1) { + throw new DecodingException("Invalid value for blockSize_S: " + blockSize_S + + ", blockSize_E: " + blockSize_E + " or blockSize_L: " + blockSize_L); + } + + int s = r.read(blockSize_S); + int e = r.read(blockSize_E); + int l = r.read(blockSize_L); + + ByteArrayOutputStream output = new ByteArrayOutputStream(); + for (int i = 0; i < l; i++) { + output.write(r.readByte()); + } + + DiffPart part = new DiffPart(DiffAction.REPLACE); + part.setStart(s); + part.setLength(e); + part.setText(output.toString(WIKIPEDIA_ENCODING)); + + return part; } - return stream.toByteArray(); - } + /** + * Inflates the zipped input. 
+ * + * @param zipinput + * zipped input + * @param start + * start position + * @return inflated input + */ + private byte[] inflateInput(final byte[] zipinput, final int start) + { + ByteArrayOutputStream stream; + try { + byte[] compressedInput = zipinput; + Inflater decompresser = new Inflater(); + decompresser.setInput(compressedInput, start, compressedInput.length - start); + + byte[] output = new byte[1000]; + stream = new ByteArrayOutputStream(); + + int cLength; + do { + cLength = decompresser.inflate(output); + stream.write(output, 0, cLength); + } + while (cLength == 1000); + + } + catch (DataFormatException e) { + throw new RuntimeException(e); + } - /** - * Assigns the binary input. - * - * @param input binary encoded diff - */ - public void setInput(final byte[] input) { + return stream.toByteArray(); + } - if (input[0] == -128) { - r = new BitReader(inflateInput(input, 1)); - } else { - r = new BitReader(input); + /** + * Assigns the binary input. + * + * @param input + * binary encoded diff + */ + public void setInput(final byte[] input) + { + + if (input[0] == -128) { + r = new BitReader(inflateInput(input, 1)); + } + else { + r = new BitReader(input); + } } - } - - /** - * Assigns an input stream. - * - * @param input Reference to an input stream - * @param binary flag, whether the data is binary or not - * @throws IOException if an error occurs while reading the stream - */ - public void setInput(final InputStream input, final boolean binary) - throws IOException { - - if (!binary) { - - int v = input.read(); - StringBuilder buffer = new StringBuilder(); - - // Check for the no-zip flag - boolean zipFlag = (char) v == '_'; - if (zipFlag) { - v = input.read(); - } - - while (v != -1) { - buffer.append((char) v); - v = input.read(); - } - - Base64.Decoder decoder = Base64.getDecoder(); - - if (zipFlag) { - r = new BitReader(inflateInput( - decoder.decode(buffer.toString()), 0)); - } else { - r = new BitReader(decoder.decode(buffer.toString())); - } - } else { - - ByteArrayOutputStream stream = new ByteArrayOutputStream(); - - byte[] bData; - int l = input.available(); - while (l != 0) { - - bData = new byte[l]; - - if (input.read(bData) != l) { - throw new RuntimeException("ILLEGAL NUMBER OF BYTES READ"); + + /** + * Assigns an input stream. 
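As the byte-array constructor and setInput(byte[]) above show, a binary diff whose first byte is -128 (0x80) is zlib-deflated and gets inflated from offset 1 before bit-decoding; the base64 form uses a leading '_' for the same purpose (see setInput(String) below). A tiny hypothetical helper, only to spell out the flag check:

    public final class DiffFlagSketch {
        /** Hypothetical helper mirroring the check in RevisionDecoder's setInput(byte[]). */
        static boolean isDeflated(byte[] encodedDiff) {
            return encodedDiff.length > 0 && encodedDiff[0] == -128;  // 0x80 marks deflated data
        }
    }
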
+ * + * @param input + * Reference to an input stream + * @param binary + * flag, whether the data is binary or not + * @throws IOException + * if an error occurs while reading the stream + */ + public void setInput(final InputStream input, final boolean binary) throws IOException + { + + if (!binary) { + + int v = input.read(); + StringBuilder buffer = new StringBuilder(); + + // Check for the no-zip flag + boolean zipFlag = (char) v == '_'; + if (zipFlag) { + v = input.read(); + } + + while (v != -1) { + buffer.append((char) v); + v = input.read(); + } + + Base64.Decoder decoder = Base64.getDecoder(); + + if (zipFlag) { + r = new BitReader(inflateInput(decoder.decode(buffer.toString()), 0)); + } + else { + r = new BitReader(decoder.decode(buffer.toString())); + } } - stream.write(bData); + else { + + ByteArrayOutputStream stream = new ByteArrayOutputStream(); - l = input.available(); - } + byte[] bData; + int l = input.available(); + while (l != 0) { - if (input.read() != -1) { - throw new RuntimeException("END OF STREAM NOT REACHED"); - } + bData = new byte[l]; - bData = stream.toByteArray(); + if (input.read(bData) != l) { + throw new RuntimeException("ILLEGAL NUMBER OF BYTES READ"); + } + stream.write(bData); - boolean zipFlag = bData[0] == -128; + l = input.available(); + } - if (zipFlag) { - r = new BitReader(inflateInput(bData, 1)); - } else { - r = new BitReader(bData); - } + if (input.read() != -1) { + throw new RuntimeException("END OF STREAM NOT REACHED"); + } + + bData = stream.toByteArray(); + + boolean zipFlag = bData[0] == -128; + + if (zipFlag) { + r = new BitReader(inflateInput(bData, 1)); + } + else { + r = new BitReader(bData); + } + } } - } - - /** - * Assigns base 64 encoded input. - * - * @param input base 64 encoded diff - * @throws DecodingException if the decoding fails - */ - public void setInput(final String input) throws DecodingException { - - boolean zipFlag = input.charAt(0) == '_'; - Base64.Decoder decoder = Base64.getDecoder(); - if (zipFlag) { - r = new BitReader(inflateInput( - decoder.decode(input.substring(1)), 0)); - } else { - byte[] data = decoder.decode(input); - if (data == null) { - - for (int i = 0; i < input.length(); i++) { - System.err.println(i + ": " + (int) input.charAt(i) - + " <> " + input.charAt(i)); + + /** + * Assigns base 64 encoded input. 
+ * + * @param input + * base 64 encoded diff + * @throws DecodingException + * if the decoding fails + */ + public void setInput(final String input) throws DecodingException + { + + boolean zipFlag = input.charAt(0) == '_'; + Base64.Decoder decoder = Base64.getDecoder(); + if (zipFlag) { + r = new BitReader(inflateInput(decoder.decode(input.substring(1)), 0)); } + else { + byte[] data = decoder.decode(input); + if (data == null) { + + for (int i = 0; i < input.length(); i++) { + System.err.println(i + ": " + (int) input.charAt(i) + " <> " + input.charAt(i)); + } - throw new DecodingException("BASE 64 DECODING FAILED: " + input); - } - r = new BitReader(data); + throw new DecodingException("BASE 64 DECODING FAILED: " + input); + } + r = new BitReader(data); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoder.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoder.java index 73840cca..c9191eca 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoder.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoder.java @@ -33,340 +33,377 @@ /** * The RevisionEncoder class contains methods to encode the diff information. */ -public class RevisionEncoder implements RevisionEncoderInterface { - - /** - * Reference to the codec - */ - private RevisionCodecData codecData; - - /** - * Reference to the BitWriter - */ - private BitWriter data; - - /** - * Configuration Parameter - Zip Compression - */ - private final boolean MODE_ZIP_COMPRESSION; - - /** - * Configuration Parameter - Wikipedia Encoding - */ - private final String WIKIPEDIA_ENCODING; - - /** - * (Constructor) Creates a new RevisionEnocder object. - * - * @throws ConfigurationException if an error occurs while accessing the configuration - * parameters - */ - public RevisionEncoder() throws ConfigurationException { - - ConfigurationManager config = ConfigurationManager.getInstance(); - - WIKIPEDIA_ENCODING = (String) config.getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); - MODE_ZIP_COMPRESSION = (Boolean) config.getConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED); - } - - @Override - public byte[] binaryDiff(final RevisionCodecData codecData, final Diff diff) - throws UnsupportedEncodingException, EncodingException { - - byte[] bData = encode(codecData, diff); - if (MODE_ZIP_COMPRESSION) { - - Deflater compresser = new Deflater(); - compresser.setInput(bData); - compresser.finish(); - - byte[] output = new byte[1000]; - ByteArrayOutputStream stream = new ByteArrayOutputStream(); - - int cLength; - do { - cLength = compresser.deflate(output); - stream.write(output, 0, cLength); - } - while (cLength == 1000); - - output = stream.toByteArray(); - if (bData.length + 1 < output.length) { +public class RevisionEncoder + implements RevisionEncoderInterface +{ + + /** + * Reference to the codec + */ + private RevisionCodecData codecData; + + /** + * Reference to the BitWriter + */ + private BitWriter data; + + /** + * Configuration Parameter - Zip Compression + */ + private final boolean MODE_ZIP_COMPRESSION; + + /** + * Configuration Parameter - Wikipedia Encoding + */ + private final String WIKIPEDIA_ENCODING; + + /** + * (Constructor) Creates a new RevisionEnocder object. 
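With setInput(String) in place, decoding a stored revision diff needs only a character encoding and the base64 payload. A minimal usage sketch, assuming UTF-8 as the Wikipedia encoding, a payload passed on the command line, and Diff living in the same package as DiffPart:

    import org.dkpro.jwpl.revisionmachine.difftool.data.codec.RevisionDecoder;
    import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.Diff;

    public class DecoderUsageSketch {
        public static void main(String[] args) throws Exception {
            String base64Diff = args[0];                            // hypothetical encoded diff
            RevisionDecoder decoder = new RevisionDecoder("UTF-8"); // assumed encoding
            decoder.setInput(base64Diff);
            Diff diff = decoder.decode();                           // sequence of DiffPart operations
            System.out.println(diff);
        }
    }
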
+ * + * @throws ConfigurationException + * if an error occurs while accessing the configuration parameters + */ + public RevisionEncoder() throws ConfigurationException + { + + ConfigurationManager config = ConfigurationManager.getInstance(); + + WIKIPEDIA_ENCODING = (String) config + .getConfigParameter(ConfigurationKeys.WIKIPEDIA_ENCODING); + MODE_ZIP_COMPRESSION = (Boolean) config + .getConfigParameter(ConfigurationKeys.MODE_ZIP_COMPRESSION_ENABLED); + } + + @Override + public byte[] binaryDiff(final RevisionCodecData codecData, final Diff diff) + throws UnsupportedEncodingException, EncodingException + { + + byte[] bData = encode(codecData, diff); + if (MODE_ZIP_COMPRESSION) { + + Deflater compresser = new Deflater(); + compresser.setInput(bData); + compresser.finish(); + + byte[] output = new byte[1000]; + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + + int cLength; + do { + cLength = compresser.deflate(output); + stream.write(output, 0, cLength); + } + while (cLength == 1000); + + output = stream.toByteArray(); + if (bData.length + 1 < output.length) { + return bData; + } + else { + + stream = new ByteArrayOutputStream(); + stream.write(new byte[] { -128 }, 0, 1); + stream.write(output, 0, output.length); + + return stream.toByteArray(); + } + } + return bData; - } else { + } + + /** + * Creates the binary encoding of the diff while using the codec information. + * + * @param codecData + * codec + * @param diff + * diff + * @return binary data + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + * @throws EncodingException + * if the encoding failed + */ + private byte[] encode(final RevisionCodecData codecData, final Diff diff) + throws UnsupportedEncodingException, EncodingException + { + + this.data = new BitWriter(codecData.totalSizeInBits()); + encodeCodecData(codecData); + + DiffPart part; + + Iterator partIt = diff.iterator(); + while (partIt.hasNext()) { + part = partIt.next(); + + switch (part.getAction()) { + case FULL_REVISION_UNCOMPRESSED: + encodeFullRevisionUncompressed(part); + break; + case INSERT: + encodeInsert(part); + break; + case DELETE: + encodeDelete(part); + break; + case REPLACE: + encodeReplace(part); + break; + case CUT: + encodeCut(part); + break; + case PASTE: + encodePaste(part); + break; + /* + * case FULL_REVISION_COMPRESSED: encodeFullRevisionCompressed(part); break; + */ + default: + throw new RuntimeException(); + } + } + + return data.toByteArray(); + } + + /** + * Encodes the codecData. + * + * @param codecData + * Reference to the codec + * @throws EncodingException + * if the encoding failed + */ + private void encodeCodecData(final RevisionCodecData codecData) throws EncodingException + { + + this.codecData = codecData; + + // C + data.writeBit(0); + data.writeBit(0); + data.writeBit(0); + + // BLOCK SIZES - S E B L + this.data.writeValue(5, codecData.getBlocksizeS()); + this.data.writeValue(5, codecData.getBlocksizeE()); + this.data.writeValue(5, codecData.getBlocksizeB()); + this.data.writeValue(5, codecData.getBlocksizeL()); + + // 1 Bit + data.writeFillBits(); + } + + /** + * Encodes a Cut operation. 
+ * + * @param part + * Reference to the Cut operation + * @throws EncodingException + * if the encoding failed + */ + private void encodeCut(final DiffPart part) throws EncodingException + { + + // C + data.writeBit(1); + data.writeBit(0); + data.writeBit(1); + + // S + data.writeValue(codecData.getBlocksizeS(), part.getStart()); + + // E + data.writeValue(codecData.getBlocksizeE(), part.getLength()); + + // B + data.writeValue(codecData.getBlocksizeB(), Integer.parseInt(part.getText())); + + data.writeFillBits(); - stream = new ByteArrayOutputStream(); - stream.write(new byte[]{-128}, 0, 1); - stream.write(output, 0, output.length); + } + + /** + * Encodes a Delete operation. + * + * @param part + * Reference to the Delete operation + * @throws EncodingException + * if the encoding failed + */ + private void encodeDelete(final DiffPart part) throws EncodingException + { + + // C + data.writeBit(0); + data.writeBit(1); + data.writeBit(1); + + // S + data.writeValue(codecData.getBlocksizeS(), part.getStart()); + + // E + data.writeValue(codecData.getBlocksizeE(), part.getLength()); + + data.writeFillBits(); + } - return stream.toByteArray(); - } + @Override + public String encodeDiff(final RevisionCodecData codecData, final Diff diff) + throws UnsupportedEncodingException, EncodingException + { + + String sEncoding; + byte[] bData = encode(codecData, diff); + Base64.Encoder encoder = Base64.getEncoder(); + if (MODE_ZIP_COMPRESSION) { + + Deflater compresser = new Deflater(); + compresser.setInput(bData); + compresser.finish(); + + byte[] output = new byte[1000]; + ByteArrayOutputStream stream = new ByteArrayOutputStream(); + + int cLength; + do { + cLength = compresser.deflate(output); + stream.write(output, 0, cLength); + } + while (cLength == 1000); + + output = stream.toByteArray(); + + if (bData.length + 1 < output.length) { + sEncoding = encoder.encodeToString(bData); + } + else { + sEncoding = "_" + encoder.encodeToString(output); + } + } + else { + sEncoding = encoder.encodeToString(bData); + } + + return sEncoding; } - return bData; - } - - /** - * Creates the binary encoding of the diff while using the codec - * information. - * - * @param codecData codec - * @param diff diff - * @return binary data - * @throws UnsupportedEncodingException if the character encoding is unsupported - * @throws EncodingException if the encoding failed - */ - private byte[] encode(final RevisionCodecData codecData, final Diff diff) - throws UnsupportedEncodingException, EncodingException { - - this.data = new BitWriter(codecData.totalSizeInBits()); - encodeCodecData(codecData); - - DiffPart part; - - Iterator partIt = diff.iterator(); - while (partIt.hasNext()) { - part = partIt.next(); - - switch (part.getAction()) { - case FULL_REVISION_UNCOMPRESSED: - encodeFullRevisionUncompressed(part); - break; - case INSERT: - encodeInsert(part); - break; - case DELETE: - encodeDelete(part); - break; - case REPLACE: - encodeReplace(part); - break; - case CUT: - encodeCut(part); - break; - case PASTE: - encodePaste(part); - break; - /* - * case FULL_REVISION_COMPRESSED: - * encodeFullRevisionCompressed(part); break; - */ - default: - throw new RuntimeException(); - } + /** + * Encodes a FullRevision operation. 
+ * + * @param part + * Reference to the FullRevision operation + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + * @throws EncodingException + * if the encoding failed + */ + private void encodeFullRevisionUncompressed(final DiffPart part) + throws UnsupportedEncodingException, EncodingException + { + + // C + data.writeBit(0); + data.writeBit(0); + data.writeBit(1); + + // L T + String text = part.getText(); + byte[] bText = text.getBytes(WIKIPEDIA_ENCODING); + + data.writeValue(codecData.getBlocksizeL(), bText.length); + data.write(bText); + } - return data.toByteArray(); - } - - /** - * Encodes the codecData. - * - * @param codecData Reference to the codec - * @throws EncodingException if the encoding failed - */ - private void encodeCodecData(final RevisionCodecData codecData) throws EncodingException { - - this.codecData = codecData; - - // C - data.writeBit(0); - data.writeBit(0); - data.writeBit(0); - - // BLOCK SIZES - S E B L - this.data.writeValue(5, codecData.getBlocksizeS()); - this.data.writeValue(5, codecData.getBlocksizeE()); - this.data.writeValue(5, codecData.getBlocksizeB()); - this.data.writeValue(5, codecData.getBlocksizeL()); - - // 1 Bit - data.writeFillBits(); - } - - /** - * Encodes a Cut operation. - * - * @param part Reference to the Cut operation - * @throws EncodingException if the encoding failed - */ - private void encodeCut(final DiffPart part) throws EncodingException { - - // C - data.writeBit(1); - data.writeBit(0); - data.writeBit(1); - - // S - data.writeValue(codecData.getBlocksizeS(), part.getStart()); - - // E - data.writeValue(codecData.getBlocksizeE(), part.getLength()); - - // B - data.writeValue(codecData.getBlocksizeB(), - Integer.parseInt(part.getText())); - - data.writeFillBits(); - - } - - /** - * Encodes a Delete operation. - * - * @param part Reference to the Delete operation - * @throws EncodingException if the encoding failed - */ - private void encodeDelete(final DiffPart part) throws EncodingException { - - // C - data.writeBit(0); - data.writeBit(1); - data.writeBit(1); - - // S - data.writeValue(codecData.getBlocksizeS(), part.getStart()); - - // E - data.writeValue(codecData.getBlocksizeE(), part.getLength()); - - data.writeFillBits(); - } - - @Override - public String encodeDiff(final RevisionCodecData codecData, final Diff diff) - throws UnsupportedEncodingException, EncodingException { - - String sEncoding; - byte[] bData = encode(codecData, diff); - Base64.Encoder encoder = Base64.getEncoder(); - if (MODE_ZIP_COMPRESSION) { - - Deflater compresser = new Deflater(); - compresser.setInput(bData); - compresser.finish(); - - byte[] output = new byte[1000]; - ByteArrayOutputStream stream = new ByteArrayOutputStream(); - - int cLength; - do { - cLength = compresser.deflate(output); - stream.write(output, 0, cLength); - } - while (cLength == 1000); - - output = stream.toByteArray(); - - if (bData.length + 1 < output.length) { - sEncoding = encoder.encodeToString(bData); - } else { - sEncoding = "_" + encoder.encodeToString(output); - } - } else { - sEncoding = encoder.encodeToString(bData); + /** + * Encodes an Insert operation. 
+ * + * @param part + * Reference to the Insert operation + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + * @throws EncodingException + * if the encoding failed + */ + private void encodeInsert(final DiffPart part) + throws UnsupportedEncodingException, EncodingException + { + + // C + data.writeBit(0); + data.writeBit(1); + data.writeBit(0); + + // S + data.writeValue(codecData.getBlocksizeS(), part.getStart()); + + // L T + String text = part.getText(); + byte[] bText = text.getBytes(WIKIPEDIA_ENCODING); + + data.writeValue(codecData.getBlocksizeL(), bText.length); + data.write(bText); } - return sEncoding; - } - - /** - * Encodes a FullRevision operation. - * - * @param part Reference to the FullRevision operation - * @throws UnsupportedEncodingException if the character encoding is unsupported - * @throws EncodingException if the encoding failed - */ - private void encodeFullRevisionUncompressed(final DiffPart part) - throws UnsupportedEncodingException, EncodingException { - - // C - data.writeBit(0); - data.writeBit(0); - data.writeBit(1); - - // L T - String text = part.getText(); - byte[] bText = text.getBytes(WIKIPEDIA_ENCODING); - - data.writeValue(codecData.getBlocksizeL(), bText.length); - data.write(bText); - - } - - /** - * Encodes an Insert operation. - * - * @param part Reference to the Insert operation - * @throws UnsupportedEncodingException if the character encoding is unsupported - * @throws EncodingException if the encoding failed - */ - private void encodeInsert(final DiffPart part) throws UnsupportedEncodingException, EncodingException { - - // C - data.writeBit(0); - data.writeBit(1); - data.writeBit(0); - - // S - data.writeValue(codecData.getBlocksizeS(), part.getStart()); - - // L T - String text = part.getText(); - byte[] bText = text.getBytes(WIKIPEDIA_ENCODING); - - data.writeValue(codecData.getBlocksizeL(), bText.length); - data.write(bText); - } - - /** - * Encodes a Paste operation. - * - * @param part Reference to the Paste operation - * @throws EncodingException if the encoding failed - */ - private void encodePaste(final DiffPart part) throws EncodingException { - - // C - data.writeBit(1); - data.writeBit(1); - data.writeBit(0); - - // S - data.writeValue(codecData.getBlocksizeS(), part.getStart()); - - // B - data.writeValue(codecData.getBlocksizeB(), - Integer.parseInt(part.getText())); - - data.writeFillBits(); - } - - /** - * Encodes a Replace operation. - * - * @param part Reference to the replace operation - * @throws UnsupportedEncodingException if the character encoding is unsupported - * @throws EncodingException if the encoding failed - */ - private void encodeReplace(final DiffPart part) throws UnsupportedEncodingException, EncodingException { - - // C - data.writeBit(1); - data.writeBit(0); - data.writeBit(0); - - // S - data.writeValue(codecData.getBlocksizeS(), part.getStart()); - - // E - data.writeValue(codecData.getBlocksizeE(), part.getLength()); - - // L T - String text = part.getText(); - byte[] bText = text.getBytes(WIKIPEDIA_ENCODING); - - data.writeValue(codecData.getBlocksizeL(), bText.length); - data.write(bText); - } + /** + * Encodes a Paste operation. 
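// Each encode*(..) method above packs its operands with data.writeValue(blocksize, value),
// i.e. into a fixed number of bits per diff. A plausible way to derive such a width from
// the largest operand value is sketched below; this is an illustrative assumption, not
// necessarily how RevisionCodecData actually computes its S/E/B/L block sizes.
final class BlockSizeSketch
{
    static int bitsNeeded(final int maxValue)
    {
        // zero still needs one bit; otherwise take the position of the highest set bit
        return Math.max(1, 32 - Integer.numberOfLeadingZeros(Math.max(1, maxValue)));
    }

    public static void main(String[] args)
    {
        // e.g. a maximum start offset of 37 fits into a 6-bit "S" block
        System.out.println(bitsNeeded(37)); // prints 6
    }
}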
+ * + * @param part + * Reference to the Paste operation + * @throws EncodingException + * if the encoding failed + */ + private void encodePaste(final DiffPart part) throws EncodingException + { + + // C + data.writeBit(1); + data.writeBit(1); + data.writeBit(0); + + // S + data.writeValue(codecData.getBlocksizeS(), part.getStart()); + + // B + data.writeValue(codecData.getBlocksizeB(), Integer.parseInt(part.getText())); + + data.writeFillBits(); + } + + /** + * Encodes a Replace operation. + * + * @param part + * Reference to the replace operation + * @throws UnsupportedEncodingException + * if the character encoding is unsupported + * @throws EncodingException + * if the encoding failed + */ + private void encodeReplace(final DiffPart part) + throws UnsupportedEncodingException, EncodingException + { + + // C + data.writeBit(1); + data.writeBit(0); + data.writeBit(0); + + // S + data.writeValue(codecData.getBlocksizeS(), part.getStart()); + + // E + data.writeValue(codecData.getBlocksizeE(), part.getLength()); + + // L T + String text = part.getText(); + byte[] bText = text.getBytes(WIKIPEDIA_ENCODING); + + data.writeValue(codecData.getBlocksizeL(), bText.length); + data.write(bText); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoderInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoderInterface.java index 12cc593e..cdd2bfcd 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoderInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/codec/RevisionEncoderInterface.java @@ -25,32 +25,39 @@ /** * The RevisionEncoderInterface describes the link to the diff encoding unit. */ -public interface RevisionEncoderInterface { +public interface RevisionEncoderInterface +{ - /** - * Returns the textual encoding of the given Diff. - * - * @param codecData CodecData used to encode the diff-data - * @param diff diff-data - * @return base 64 encoded diff - * @throws UnsupportedEncodingException if the CharacterSet defined in the configuration is not - * supported by JAVA. - * @throws EncodingException if the encoding process fails - */ - String encodeDiff(final RevisionCodecData codecData, final Diff diff) - throws UnsupportedEncodingException, EncodingException; + /** + * Returns the textual encoding of the given Diff. + * + * @param codecData + * CodecData used to encode the diff-data + * @param diff + * diff-data + * @return base 64 encoded diff + * @throws UnsupportedEncodingException + * if the CharacterSet defined in the configuration is not supported by JAVA. + * @throws EncodingException + * if the encoding process fails + */ + String encodeDiff(final RevisionCodecData codecData, final Diff diff) + throws UnsupportedEncodingException, EncodingException; - /** - * Returns the binary encoding of the given Diff. - * - * @param codecData CodecData used to encode the diff-data - * @param diff diff-data - * @return binary encoded diff - * @throws UnsupportedEncodingException if the CharacterSet defined in the configuration is not - * supported by JAVA. - * @throws EncodingException if the encoding process fails - */ - byte[] binaryDiff(final RevisionCodecData codecData, final Diff diff) - throws UnsupportedEncodingException, EncodingException; + /** + * Returns the binary encoding of the given Diff. 
+ * + * @param codecData + * CodecData used to encode the diff-data + * @param diff + * diff-data + * @return binary encoded diff + * @throws UnsupportedEncodingException + * if the CharacterSet defined in the configuration is not supported by JAVA. + * @throws EncodingException + * if the encoding process fails + */ + byte[] binaryDiff(final RevisionCodecData codecData, final Diff diff) + throws UnsupportedEncodingException, EncodingException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/ISizeable.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/ISizeable.java index ee87c504..9abdb6fc 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/ISizeable.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/ISizeable.java @@ -20,12 +20,13 @@ /** * This interface defines a method for size estimations. */ -public interface ISizeable { +public interface ISizeable +{ - /** - * This method should return a size estimation of the data. - * - * @return size estimation in byte - */ - long byteSize(); + /** + * This method should return a size estimation of the data. + * + * @return size estimation in byte + */ + long byteSize(); } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/Task.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/Task.java index 8019bf4b..0ecf37b5 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/Task.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/Task.java @@ -25,231 +25,252 @@ /** * The task class contains the information of a task. * - * @param Class of data the task contains + * @param + * Class of data the task contains */ -public class Task { - - /* - * +STATICS++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Creates a dummy task without data. - * - * @return dummy task - */ - @SuppressWarnings("rawtypes") - public static Task createDummy() { - return new Task(TaskTypes.DUMMY); - } - - /** - * Creates an end task. - * - * @return end task - */ - @SuppressWarnings("rawtypes") - public static Task createEndTask() { - return new Task(TaskTypes.ENDTASK); - } - - /** - * Creates a banned task. - * - * @return banned task - */ - @SuppressWarnings("rawtypes") - public static Task createBannedTask() { - return new Task(TaskTypes.BANNED_TASK); - } - - /* - * +ATTRIBUTES+AND+CONSTRUCTORS++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Type of the task - */ - private TaskTypes taskType; - - /** - * Additional information concerning the article - */ - private ArticleInformation header; - - /** - * Data of the task - */ - private final ArrayList container; - - /** - * Counter of the task parts (1-based) - */ - private final int partCounter; - - /** - * Size of this task - */ - private int byteSize; - - /** - * Constructor - A new task object of the specified type will be created. - * - * @param taskType Type of task - */ - protected Task(final TaskTypes taskType) { - this.taskType = taskType; - this.container = null; - - this.byteSize = 0; - this.partCounter = 0; - } - - /** - * Constructor - A new task object of the type TASK_FULL will be created. 
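// ISizeable (above) only asks for a rough size estimate so that Task.add(..) further down
// can keep a running byte count per task. A minimal illustrative implementation; TextChunk
// is a made-up class, not part of the patched sources.
import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.ISizeable;

final class TextChunk
    implements ISizeable
{
    private final String text;

    TextChunk(final String text)
    {
        this.text = text;
    }

    @Override
    public long byteSize()
    {
        // rough estimate: one byte per character, similar in spirit to DiffPart.byteSize()
        return text == null ? 0 : text.length();
    }
}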
- * - * @param header reference to the article information - * @param taskPartCounter task part counter - */ - public Task(final ArticleInformation header, final int taskPartCounter) { - this.header = header; - - this.byteSize = 0; - this.partCounter = taskPartCounter; - this.taskType = TaskTypes.TASK_FULL; - - this.container = new ArrayList<>(); - } - - /* - * +METHODS++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Returns the reference to the article header. - */ - public ArticleInformation getHeader() { - return this.header; - } - - /** - * Returns the type of this task. - * - * @return TaskType - */ - public TaskTypes getTaskType() { - return this.taskType; - } - - /** - * Adds data to this task. - * - * @param data Reference to the data object. - */ - public void add(final D data) { - this.container.add(data); - - // if the size of data is known add the value to the task size - if (data instanceof ISizeable) { - this.byteSize += ((ISizeable) data).byteSize(); - } - } - - /** - * Returns the data of this task. - * - * @return data - */ - public ArrayList getContainer() { - return this.container; - } - - /** - * Returns the data at the specified index. - *

- * The index will not be check whether it is out of range or not. If you do - * not know the appropriate index call the size() method before calling this - * method. - * - * @param index index - * @return data - */ - public D get(final int index) { - return this.container.get(index); - } - - /** - * Returns the number of data parts the task contains. - * - * @return number of data parts. - */ - public int size() { - return this.container.size(); - } - - /** - * Returns an iterator over the data. - * - * @return Iterator - */ - public Iterator iterator() { - return this.container.iterator(); - } - - /** - * Returns the size estimation of this task in bytes. - *

- * The size can only be estimated if the data contains the ISizeable - * interface. - * - * @return size estimation - */ - public int byteSize() { - return this.byteSize; - } - - /** - * Returns the type of the task. - * - * @param taskType TaskType - */ - public void setTaskType(final TaskTypes taskType) { - this.taskType = taskType; - } - - /** - * Returns the part counter. - * - * @return Part counter - */ - public int getPartCounter() { - return this.partCounter; - } - - /** - * Returns an unique task identifier consisting of article id and part - * counter. - * - * @return unique task identifier - */ - public String uniqueIdentifier() { - return this.header.getArticleId() + "-" + this.partCounter; - } - - /* - * +DELEGATERS+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ - */ - - /** - * Returns a string representation of the task. - * - * @return string representation - */ - @Override - public String toString() { - return "[" + this.taskType.toString() + " <" + this.partCounter + ">" - + "\t" + this.byteSize + "\t| " + this.header.getArticleId() - + "\tR" + this.container.size() + "\t" - + this.header.getArticleName() + "]"; - } +public class Task +{ + + /* + * +STATICS++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Creates a dummy task without data. + * + * @return dummy task + */ + @SuppressWarnings("rawtypes") + public static Task createDummy() + { + return new Task(TaskTypes.DUMMY); + } + + /** + * Creates an end task. + * + * @return end task + */ + @SuppressWarnings("rawtypes") + public static Task createEndTask() + { + return new Task(TaskTypes.ENDTASK); + } + + /** + * Creates a banned task. + * + * @return banned task + */ + @SuppressWarnings("rawtypes") + public static Task createBannedTask() + { + return new Task(TaskTypes.BANNED_TASK); + } + + /* + * +ATTRIBUTES+AND+CONSTRUCTORS++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Type of the task + */ + private TaskTypes taskType; + + /** + * Additional information concerning the article + */ + private ArticleInformation header; + + /** + * Data of the task + */ + private final ArrayList container; + + /** + * Counter of the task parts (1-based) + */ + private final int partCounter; + + /** + * Size of this task + */ + private int byteSize; + + /** + * Constructor - A new task object of the specified type will be created. + * + * @param taskType + * Type of task + */ + protected Task(final TaskTypes taskType) + { + this.taskType = taskType; + this.container = null; + + this.byteSize = 0; + this.partCounter = 0; + } + + /** + * Constructor - A new task object of the type TASK_FULL will be created. + * + * @param header + * reference to the article information + * @param taskPartCounter + * task part counter + */ + public Task(final ArticleInformation header, final int taskPartCounter) + { + this.header = header; + + this.byteSize = 0; + this.partCounter = taskPartCounter; + this.taskType = TaskTypes.TASK_FULL; + + this.container = new ArrayList<>(); + } + + /* + * +METHODS++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Returns the reference to the article header. + */ + public ArticleInformation getHeader() + { + return this.header; + } + + /** + * Returns the type of this task. + * + * @return TaskType + */ + public TaskTypes getTaskType() + { + return this.taskType; + } + + /** + * Adds data to this task. + * + * @param data + * Reference to the data object. 
+ */ + public void add(final D data) + { + this.container.add(data); + + // if the size of data is known add the value to the task size + if (data instanceof ISizeable) { + this.byteSize += ((ISizeable) data).byteSize(); + } + } + + /** + * Returns the data of this task. + * + * @return data + */ + public ArrayList getContainer() + { + return this.container; + } + + /** + * Returns the data at the specified index. + *
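// A minimal usage sketch for the Task container shown above: the header and part counter
// identify the task, and add(..) both collects the data and, because Diff implements
// ISizeable, keeps the byte estimate up to date. Illustrative only; the part counter and
// the printed values are made up.
import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.Task;
import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.Diff;
import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.info.ArticleInformation;

final class TaskUsageSketch
{
    static Task<Diff> newTask(final ArticleInformation header, final Diff firstDiff)
    {
        Task<Diff> task = new Task<>(header, 1);
        task.add(firstDiff);

        System.out.println(task.uniqueIdentifier()); // "<articleId>-<partCounter>"
        System.out.println(task.byteSize());         // running ISizeable-based estimate
        return task;
    }
}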

+     * The index is not checked for being out of range. If you do not know the
+     * appropriate index, call the size() method before calling this method.
+     *
+     * @param index
+     *            index
+     * @return data
+     */
+    public D get(final int index)
+    {
+        return this.container.get(index);
+    }
+
+    /**
+     * Returns the number of data parts the task contains.
+     *
+     * @return number of data parts.
+     */
+    public int size()
+    {
+        return this.container.size();
+    }
+
+    /**
+     * Returns an iterator over the data.
+     *
+     * @return Iterator
+     */
+    public Iterator iterator()
+    {
+        return this.container.iterator();
+    }
+
+    /**
+     * Returns the size estimation of this task in bytes.
+     *

+ * The size can only be estimated if the data contains the ISizeable interface. + * + * @return size estimation + */ + public int byteSize() + { + return this.byteSize; + } + + /** + * Returns the type of the task. + * + * @param taskType + * TaskType + */ + public void setTaskType(final TaskTypes taskType) + { + this.taskType = taskType; + } + + /** + * Returns the part counter. + * + * @return Part counter + */ + public int getPartCounter() + { + return this.partCounter; + } + + /** + * Returns an unique task identifier consisting of article id and part counter. + * + * @return unique task identifier + */ + public String uniqueIdentifier() + { + return this.header.getArticleId() + "-" + this.partCounter; + } + + /* + * +DELEGATERS+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + */ + + /** + * Returns a string representation of the task. + * + * @return string representation + */ + @Override + public String toString() + { + return "[" + this.taskType.toString() + " <" + this.partCounter + ">" + "\t" + this.byteSize + + "\t| " + this.header.getArticleId() + "\tR" + this.container.size() + "\t" + + this.header.getArticleName() + "]"; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/TaskTypes.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/TaskTypes.java index 0c9a89bf..e0825d1c 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/TaskTypes.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/TaskTypes.java @@ -20,40 +20,41 @@ /** * This Enumerator lists the different types of tasks. */ -public enum TaskTypes { - - /** - * dummy task - */ - DUMMY, - - /** - * if this task is received from a consumer, it will shutdown afterwards - */ - ENDTASK, - - /** - * if the article id is black listed - */ - BANNED_TASK, - - /** - * full task containing all revisions of one article - */ - TASK_FULL, - - /** - * task containing the first part of revisions of one article - */ - TASK_PARTIAL_FIRST, - - /** - * task containing some revisions of one article - */ - TASK_PARTIAL, - - /** - * task containing the last part of revisions from one article - */ - TASK_PARTIAL_LAST +public enum TaskTypes +{ + + /** + * dummy task + */ + DUMMY, + + /** + * if this task is received from a consumer, it will shutdown afterwards + */ + ENDTASK, + + /** + * if the article id is black listed + */ + BANNED_TASK, + + /** + * full task containing all revisions of one article + */ + TASK_FULL, + + /** + * task containing the first part of revisions of one article + */ + TASK_PARTIAL_FIRST, + + /** + * task containing some revisions of one article + */ + TASK_PARTIAL, + + /** + * task containing the last part of revisions from one article + */ + TASK_PARTIAL_LAST } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/Diff.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/Diff.java index ee58e403..49233157 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/Diff.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/Diff.java @@ -30,331 +30,361 @@ * This class contains the diff information used to create single revision. 
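// The TaskTypes above drive the consumer lifecycle: ENDTASK shuts a consumer down,
// BANNED_TASK and DUMMY carry no data, and the TASK_* variants are processed. A schematic
// consumer loop; fetchNext and process are hypothetical placeholders, not JWPL API.
import java.util.function.Consumer;
import java.util.function.Supplier;

import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.Task;
import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.Diff;

final class ConsumerLoopSketch
{
    static void run(final Supplier<Task<Diff>> fetchNext, final Consumer<Task<Diff>> process)
    {
        while (true) {
            Task<Diff> task = fetchNext.get();
            switch (task.getTaskType()) {
            case ENDTASK:
                return; // shut down this consumer
            case BANNED_TASK:
            case DUMMY:
                continue; // nothing to process
            default:
                process.accept(task); // TASK_FULL / TASK_PARTIAL*
            }
        }
    }
}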
*/ public class Diff - implements ISizeable { - - /** - * Reference to the codec - */ - private RevisionCodecData codecData; - - /** - * List of DiffParts - */ - private final List parts; - - /** - * Revision counter - */ - private int revisionCounter; - - /** - * Revision ID - */ - private int revisionID; - - /** - * Timestamp - */ - private Timestamp timeStamp; - - /** - * Username/IP of the contributor who created this revision - */ - private String contributorName; - - /** - * ID of the contributor who created this revision - */ - private Integer contributorId; - - /** - * Determine whether the contributor was registered. - * True: contributorName= username - * False: contributorName= IP - */ - private boolean contributorIsRegistered; - - /** - * The user comment for this revision - */ - private String comment; - - /** - * Determine whether revision is a minor revision - */ - private boolean isMinor = false; - - /** - * (Constructor) Creates a new empty Diff. - */ - public Diff() { - this.parts = new ArrayList<>(); - } - - /** - * Adds a DiffPart. - * - * @param diff DiffPart - */ - public void add(final DiffPart diff) { - this.parts.add(diff); - } - - /** - * Builds the current revision. - * - * @param previousRevision content of the previous revision - * @return current revision - */ - public String buildRevision(final char[] previousRevision) { - String prevRev = null; - if (previousRevision != null) { - prevRev = String.valueOf(previousRevision); + implements ISizeable +{ + + /** + * Reference to the codec + */ + private RevisionCodecData codecData; + + /** + * List of DiffParts + */ + private final List parts; + + /** + * Revision counter + */ + private int revisionCounter; + + /** + * Revision ID + */ + private int revisionID; + + /** + * Timestamp + */ + private Timestamp timeStamp; + + /** + * Username/IP of the contributor who created this revision + */ + private String contributorName; + + /** + * ID of the contributor who created this revision + */ + private Integer contributorId; + + /** + * Determine whether the contributor was registered. True: contributorName= username False: + * contributorName= IP + */ + private boolean contributorIsRegistered; + + /** + * The user comment for this revision + */ + private String comment; + + /** + * Determine whether revision is a minor revision + */ + private boolean isMinor = false; + + /** + * (Constructor) Creates a new empty Diff. + */ + public Diff() + { + this.parts = new ArrayList<>(); } - return buildRevision(prevRev); - } + /** + * Adds a DiffPart. + * + * @param diff + * DiffPart + */ + public void add(final DiffPart diff) + { + this.parts.add(diff); + } + + /** + * Builds the current revision. + * + * @param previousRevision + * content of the previous revision + * @return current revision + */ + public String buildRevision(final char[] previousRevision) + { + String prevRev = null; + if (previousRevision != null) { + prevRev = String.valueOf(previousRevision); + } + + return buildRevision(prevRev); + } + + /** + * Builds the current revision. 
+ * + * @param previousRevision + * content of the previous revision + * @return current revision + */ + public String buildRevision(final String previousRevision) + { + + HashMap bufferMap = new HashMap<>(); + + StringBuilder output = new StringBuilder(); + if (previousRevision != null) { + output.append(previousRevision); + } + + int size = parts.size(); + DiffPart part; + + for (int i = 0; i < size; i++) { + + part = parts.get(i); + + switch (part.getAction()) { + case FULL_REVISION_UNCOMPRESSED: + output = new StringBuilder(); + output.insert(0, part.getText()); + break; + case INSERT: + output.insert(part.getStart(), part.getText()); + break; + case DELETE: + output.delete(part.getStart(), part.getEnd()); + break; + case REPLACE: + output.replace(part.getStart(), part.getEnd(), part.getText()); + break; + case CUT: + bufferMap.put(part.getText(), output.substring(part.getStart(), part.getEnd())); + output.delete(part.getStart(), part.getEnd()); + break; + case PASTE: + output.insert(part.getStart(), bufferMap.remove(part.getText())); + break; + default: + throw new RuntimeException("UNKNOWN PART ACTION"); + } + } + + return output.toString(); + } + + /** + * Returns an estimation of the size used to stored the data. + * + * @return estimated size + */ + public long byteSize() + { - /** - * Builds the current revision. - * - * @param previousRevision content of the previous revision - * @return current revision - */ - public String buildRevision(final String previousRevision) { + long byteSize = 3; - HashMap bufferMap = new HashMap<>(); + int size = parts.size(); - StringBuilder output = new StringBuilder(); - if (previousRevision != null) { - output.append(previousRevision); + for (int i = 0; i < size; i++) { + byteSize += this.parts.get(i).byteSize(); + } + + return byteSize; } - int size = parts.size(); - DiffPart part; - - for (int i = 0; i < size; i++) { - - part = parts.get(i); - - switch (part.getAction()) { - case FULL_REVISION_UNCOMPRESSED: - output = new StringBuilder(); - output.insert(0, part.getText()); - break; - case INSERT: - output.insert(part.getStart(), part.getText()); - break; - case DELETE: - output.delete(part.getStart(), part.getEnd()); - break; - case REPLACE: - output.replace(part.getStart(), part.getEnd(), part.getText()); - break; - case CUT: - bufferMap.put(part.getText(), - output.substring(part.getStart(), part.getEnd())); - output.delete(part.getStart(), part.getEnd()); - break; - case PASTE: - output.insert(part.getStart(), bufferMap.remove(part.getText())); - break; - default: - throw new RuntimeException("UNKNOWN PART ACTION"); - } + /** + * Returns the referenced diff part. + * + * @param index + * index of the diff part + * @return diff part + */ + public DiffPart get(final int index) + { + return this.parts.get(index); } - return output.toString(); - } + /** + * Returns the codec data. + * + * @return codec + */ + public RevisionCodecData getCodecData() + { + return codecData; + } - /** - * Returns an estimation of the size used to stored the data. - * - * @return estimated size - */ - public long byteSize() { + /** + * Returns the revision counter. 
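// buildRevision(..) above replays the DiffParts against the text of the previous revision.
// A minimal illustrative round trip using only constructors and setters shown in this
// patch; the texts and offsets are made up.
import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.Diff;
import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.DiffAction;
import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.DiffPart;

final class BuildRevisionSketch
{
    static String example()
    {
        DiffPart insert = new DiffPart(DiffAction.INSERT);
        insert.setStart(5);
        insert.setText(", dear");
        insert.setLength(", dear".length());

        Diff diff = new Diff();
        diff.add(insert);

        return diff.buildRevision("Hello world"); // yields "Hello, dear world"
    }
}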
+ * + * @return revision counter + */ + public int getRevisionCounter() + { + return this.revisionCounter; + } - long byteSize = 3; + /* + * (non-Javadoc) + * + * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#getRevisionID() + */ + public int getRevisionID() + { + return revisionID; + } - int size = parts.size(); + /* + * (non-Javadoc) + * + * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#getTimeStamp() + */ + public Timestamp getTimeStamp() + { + return timeStamp; + } - for (int i = 0; i < size; i++) { - byteSize += this.parts.get(i).byteSize(); + /** + * Returns whether the revision described by this diff is a full revision or not. + * + * @return TRUE | FALSE + */ + public boolean isFullRevision() + { + if (this.parts.size() == 1) { + DiffPart p = this.parts.get(0); + if (p.getAction() == DiffAction.FULL_REVISION_UNCOMPRESSED) { + return true; + } + } + + return false; } - return byteSize; - } - - /** - * Returns the referenced diff part. - * - * @param index index of the diff part - * @return diff part - */ - public DiffPart get(final int index) { - return this.parts.get(index); - } - - /** - * Returns the codec data. - * - * @return codec - */ - public RevisionCodecData getCodecData() { - return codecData; - } - - /** - * Returns the revision counter. - * - * @return revision counter - */ - public int getRevisionCounter() { - return this.revisionCounter; - } - - /* - * (non-Javadoc) - * - * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#getRevisionID() - */ - public int getRevisionID() { - return revisionID; - } - - /* - * (non-Javadoc) - * - * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#getTimeStamp() - */ - public Timestamp getTimeStamp() { - return timeStamp; - } - - /** - * Returns whether the revision described by this diff is a full revision or - * not. - * - * @return TRUE | FALSE - */ - public boolean isFullRevision() { - if (this.parts.size() == 1) { - DiffPart p = this.parts.get(0); - if (p.getAction() == DiffAction.FULL_REVISION_UNCOMPRESSED) { - return true; - } + /* + * (non-Javadoc) + * + * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#iterator() + */ + public Iterator iterator() + { + return this.parts.iterator(); } - return false; - } - - /* - * (non-Javadoc) - * - * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#iterator() - */ - public Iterator iterator() { - return this.parts.iterator(); - } - - /** - * Sets the codec data. - * - * @param codecData coded data - */ - public void setCodecData(final RevisionCodecData codecData) { - this.codecData = codecData; - } - - /** - * Sets the revision counter. - * - * @param revisionCounter revision counter - */ - public void setRevisionCoutner(final int revisionCounter) { - this.revisionCounter = revisionCounter; - } - - /* - * (non-Javadoc) - * - * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#setRevisionID(int) - */ - public void setRevisionID(final int revisionID) { - this.revisionID = revisionID; - } - - /* - * (non-Javadoc) - * - * @see - * de.tud.ukp.kulessa.delta.data.IRevisionChange#setTimeStamp(java.lang. - * String) - */ - public void setTimeStamp(final Timestamp timeStamp) { - this.timeStamp = timeStamp; - } - - /** - * Returns the number of stored diff parts. - * - * @return number of diff parts - */ - public int size() { - return this.parts.size(); - } - - /** - * Returns the string representation of the diff content. 
- * - * @return string representation of the diff parts - */ - @Override - public String toString() { - StringBuilder builder = new StringBuilder(); - for (int i = 0; i < parts.size(); i++) { - builder.append(parts.get(i).toString() + "\n"); + /** + * Sets the codec data. + * + * @param codecData + * coded data + */ + public void setCodecData(final RevisionCodecData codecData) + { + this.codecData = codecData; } - return builder.toString(); - } - public void setComment(String comment) { - this.comment = comment; - } + /** + * Sets the revision counter. + * + * @param revisionCounter + * revision counter + */ + public void setRevisionCoutner(final int revisionCounter) + { + this.revisionCounter = revisionCounter; + } - public String getComment() { - return comment; - } + /* + * (non-Javadoc) + * + * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#setRevisionID(int) + */ + public void setRevisionID(final int revisionID) + { + this.revisionID = revisionID; + } - public void setMinor(boolean isMinor) { - this.isMinor = isMinor; - } + /* + * (non-Javadoc) + * + * @see de.tud.ukp.kulessa.delta.data.IRevisionChange#setTimeStamp(java.lang. String) + */ + public void setTimeStamp(final Timestamp timeStamp) + { + this.timeStamp = timeStamp; + } - public boolean isMinor() { - return isMinor; - } + /** + * Returns the number of stored diff parts. + * + * @return number of diff parts + */ + public int size() + { + return this.parts.size(); + } - public void setContributorName(String contributorName) { - this.contributorName = contributorName; - } + /** + * Returns the string representation of the diff content. + * + * @return string representation of the diff parts + */ + @Override + public String toString() + { + StringBuilder builder = new StringBuilder(); + for (int i = 0; i < parts.size(); i++) { + builder.append(parts.get(i).toString() + "\n"); + } + return builder.toString(); + } - public String getContributorName() { - return contributorName; - } + public void setComment(String comment) + { + this.comment = comment; + } + + public String getComment() + { + return comment; + } + + public void setMinor(boolean isMinor) + { + this.isMinor = isMinor; + } - public void setContributorIsRegistered(boolean contributorIsRegistered) { - this.contributorIsRegistered = contributorIsRegistered; - } + public boolean isMinor() + { + return isMinor; + } - public boolean getContributorIsRegistered() { - return contributorIsRegistered; - } + public void setContributorName(String contributorName) + { + this.contributorName = contributorName; + } - public void setContributorId(Integer contributorId) { - this.contributorId = contributorId; - } + public String getContributorName() + { + return contributorName; + } - public Integer getContributorId() { - return contributorId; - } + public void setContributorIsRegistered(boolean contributorIsRegistered) + { + this.contributorIsRegistered = contributorIsRegistered; + } + + public boolean getContributorIsRegistered() + { + return contributorIsRegistered; + } + + public void setContributorId(Integer contributorId) + { + this.contributorId = contributorId; + } + + public Integer getContributorId() + { + return contributorId; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffAction.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffAction.java index cdc1f931..ea32287a 100644 --- 
a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffAction.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffAction.java @@ -26,101 +26,106 @@ /** * This class contains the constants for the DiffActions. */ -public enum DiffAction implements Serializable { +public enum DiffAction + implements Serializable +{ - /** - * Codec - */ - DECODER_DATA((byte) 0), + /** + * Codec + */ + DECODER_DATA((byte) 0), - /** - * Full Revision - */ - FULL_REVISION_UNCOMPRESSED((byte) 1), + /** + * Full Revision + */ + FULL_REVISION_UNCOMPRESSED((byte) 1), - /** - * Insert operation - */ - INSERT((byte) 2), + /** + * Insert operation + */ + INSERT((byte) 2), - /** - * Delete operation - */ - DELETE((byte) 3), + /** + * Delete operation + */ + DELETE((byte) 3), - /** - * Replace operation - */ - REPLACE((byte) 4), + /** + * Replace operation + */ + REPLACE((byte) 4), - /** - * Cut operation - */ - CUT((byte) 5), + /** + * Cut operation + */ + CUT((byte) 5), - /** - * Paste operation - */ - PASTE((byte) 6)/* - * , - * - * FULL_REVISION_COMPRESSED((byte)7) - */; + /** + * Paste operation + */ + PASTE((byte) 6)/* + * , + * + * FULL_REVISION_COMPRESSED((byte)7) + */; - /** - * byte constant - */ - private final byte code; + /** + * byte constant + */ + private final byte code; - /** - * Creates a DiffAction. - * - * @param code byte constant - */ - DiffAction(final byte code) { - this.code = code; - } + /** + * Creates a DiffAction. + * + * @param code + * byte constant + */ + DiffAction(final byte code) + { + this.code = code; + } - /** - * Returns the byte constant - * - * @return value of the constant - */ - public byte getValue() { - return code; - } + /** + * Returns the byte constant + * + * @return value of the constant + */ + public byte getValue() + { + return code; + } - /** - * Returns the appropriate DiffAction value. - * - * @param val byte value - * @return DiffAction - * @throws DecodingException if the value does not match one of the predefined byte - * constants - */ - public static DiffAction parse(final int val) - throws DecodingException { + /** + * Returns the appropriate DiffAction value. 
+ * + * @param val + * byte value + * @return DiffAction + * @throws DecodingException + * if the value does not match one of the predefined byte constants + */ + public static DiffAction parse(final int val) throws DecodingException + { - switch (val) { - case 0: - return DECODER_DATA; - case 1: - return FULL_REVISION_UNCOMPRESSED; - case 2: - return INSERT; - case 3: - return DELETE; - case 4: - return REPLACE; - case 5: - return CUT; - case 6: - return PASTE; - // case 7: return FULL_REVISION_COMPRESSED; - default: - throw ErrorFactory.createDecodingException( - ErrorKeys.DIFFTOOL_ENCODING_INVALID_VALUE, - "Invalid value: " + val); + switch (val) { + case 0: + return DECODER_DATA; + case 1: + return FULL_REVISION_UNCOMPRESSED; + case 2: + return INSERT; + case 3: + return DELETE; + case 4: + return REPLACE; + case 5: + return CUT; + case 6: + return PASTE; + // case 7: return FULL_REVISION_COMPRESSED; + default: + throw ErrorFactory.createDecodingException(ErrorKeys.DIFFTOOL_ENCODING_INVALID_VALUE, + "Invalid value: " + val); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffPart.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffPart.java index 85d0db84..22c2ae64 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffPart.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/content/DiffPart.java @@ -20,159 +20,179 @@ import java.io.Serializable; /** - * The DiffPart class represents the operation used to create a new revision - * from an older revision. + * The DiffPart class represents the operation used to create a new revision from an older revision. */ -public class DiffPart implements Serializable { - - private static final long serialVersionUID = 6208903899064982679L; - - /** - * Start position of the text block - */ - private int start; - - /** - * Lengthof the text block - */ - private int length; - - /** - * DiffAction value - */ - private final DiffAction action; - - /** - * Textual information - */ - private String text; - - /** - * (Constructor) Creates a new DiffPart object. - * - * @param action DiffAction - */ - public DiffPart(final DiffAction action) { - - this.action = action; - } - - /** - * Returns the length of the text block. - * - * @return length of the text block - */ - public int getLength() { - return length; - } - - /** - * Sets the length of the text block. - * - * @param length length of the text block - */ - public void setLength(final int length) { - this.length = length; - } - - /** - * Returns the start position of the text block. - * - * @return start position - */ - public int getStart() { - return start; - } - - /** - * Returns the end position of the text block. - * - * @return end position - */ - public int getEnd() { - return start + length; - } - - /** - * Sets the start position of the text block. - * - * @param start start position - */ - public void setStart(final int start) { - this.start = start; - } - - /** - * Sets the textual information. - * - * @param text content - */ - public void setText(final String text) { - this.text = text; - } - - /** - * Returns the DiffAction value. - * - * @return DiffAction - */ - public DiffAction getAction() { - return this.action; - } - - /** - * Returns the textual information. 
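// DiffAction (above) pins every operation to a stable byte code, and parse(..) is the
// inverse of getValue(). A tiny illustrative round trip; the concrete exception type is
// elided via 'throws Exception' to keep the sketch self-contained.
import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.DiffAction;

final class DiffActionSketch
{
    static void roundTrip() throws Exception
    {
        byte code = DiffAction.INSERT.getValue();   // 2
        DiffAction action = DiffAction.parse(code); // INSERT again
        assert action == DiffAction.INSERT;

        // values outside 0..6 are rejected with a DecodingException:
        // DiffAction.parse(42);
    }
}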
- * - * @return content - */ - public String getText() { - return this.text; - } - - /** - * Returns a representation of the DiffAction content. - * - * @return [ DiffAction, start position, length, content ] - */ - @Override - public String toString() { - return "[" + action + " " + start + " " + length + " " + text + "]\n"; - } - - /** - * Returns the estimated number of bytes used to encode the contained - * information. - * - * @return estimated size in bytes - */ - public int byteSize() { - if (text == null) { - return 9; +public class DiffPart + implements Serializable +{ + + private static final long serialVersionUID = 6208903899064982679L; + + /** + * Start position of the text block + */ + private int start; + + /** + * Lengthof the text block + */ + private int length; + + /** + * DiffAction value + */ + private final DiffAction action; + + /** + * Textual information + */ + private String text; + + /** + * (Constructor) Creates a new DiffPart object. + * + * @param action + * DiffAction + */ + public DiffPart(final DiffAction action) + { + + this.action = action; } - return 9 + text.length(); - } - - /* (non-Javadoc) - * @see java.lang.Object#equals(java.lang.Object) - * - * DiffParts are equal if their text, actions and spans are equal - */ - @Override - public boolean equals(Object anObject) { - - if (!(anObject instanceof DiffPart)) { - return false; - } else { - DiffPart otherRev = (DiffPart) anObject; - if (this.getText().equals(otherRev.getText()) - && this.getAction() == otherRev.getAction() - && this.getStart() == otherRev.getStart() - && this.getEnd() == otherRev.getEnd()) { - return true; - } else { - return false; - } + + /** + * Returns the length of the text block. + * + * @return length of the text block + */ + public int getLength() + { + return length; + } + + /** + * Sets the length of the text block. + * + * @param length + * length of the text block + */ + public void setLength(final int length) + { + this.length = length; + } + + /** + * Returns the start position of the text block. + * + * @return start position + */ + public int getStart() + { + return start; + } + + /** + * Returns the end position of the text block. + * + * @return end position + */ + public int getEnd() + { + return start + length; + } + + /** + * Sets the start position of the text block. + * + * @param start + * start position + */ + public void setStart(final int start) + { + this.start = start; + } + + /** + * Sets the textual information. + * + * @param text + * content + */ + public void setText(final String text) + { + this.text = text; + } + + /** + * Returns the DiffAction value. + * + * @return DiffAction + */ + public DiffAction getAction() + { + return this.action; + } + + /** + * Returns the textual information. + * + * @return content + */ + public String getText() + { + return this.text; + } + + /** + * Returns a representation of the DiffAction content. + * + * @return [ DiffAction, start position, length, content ] + */ + @Override + public String toString() + { + return "[" + action + " " + start + " " + length + " " + text + "]\n"; + } + + /** + * Returns the estimated number of bytes used to encode the contained information. 
+ * + * @return estimated size in bytes + */ + public int byteSize() + { + if (text == null) { + return 9; + } + return 9 + text.length(); + } + + /* + * (non-Javadoc) + * + * @see java.lang.Object#equals(java.lang.Object) + * + * DiffParts are equal if their text, actions and spans are equal + */ + @Override + public boolean equals(Object anObject) + { + + if (!(anObject instanceof DiffPart)) { + return false; + } + else { + DiffPart otherRev = (DiffPart) anObject; + if (this.getText().equals(otherRev.getText()) + && this.getAction() == otherRev.getAction() + && this.getStart() == otherRev.getStart() + && this.getEnd() == otherRev.getEnd()) { + return true; + } + else { + return false; + } + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/info/ArticleInformation.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/info/ArticleInformation.java index 6e883ed7..3ab76cd6 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/info/ArticleInformation.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/difftool/data/tasks/info/ArticleInformation.java @@ -23,428 +23,474 @@ /** * This class contains all statistical information related to one article. */ -public class ArticleInformation { - - /** - * Article ID - */ - private int articleId; - - /** - * Name of the article - */ - private String articleName; - - /** - * Diffed size of the article - */ - private long diffedSize; - - /** - * Number of diff parts used - */ - private int diffPartCounter; - - /** - * Encoded size of the article - */ - private long encodedSize; - - /** - * UNCOMPRESSED encoded size of the article - */ - private long encodedSQLSize; - - /** - * Time the task entered the system - */ - private long enteringTime; - - /** - * Time the task exited the system - */ - private long exitingTime; - - /** - * Number of ignored revisions - */ - private int ignoredRevisionsCounter; - - /** - * Original size of the article - */ - private long originalSize; - - /** - * Time used to diff the task - */ - private long processingTimeDiff; - - /** - * Time used to read the task - */ - private long processingTimeRead; - - /** - * Time used to encode the task - */ - private long processingTimeSQL; - - /** - * Number of parsed revisions related to this article - */ - private int readRevisionCounter; - - /** - * Value of the revision counter after finishing the diff processing - */ - private int revisionCounter; - - /** - * (Constructor) Creates a new ArticleInformation object. - */ - public ArticleInformation() { - this.articleId = -1; - // this.timeStamp = null; - this.articleName = null; - - this.revisionCounter = 0; - this.ignoredRevisionsCounter = 0; - this.diffPartCounter = 0; - - this.originalSize = 0; - this.diffedSize = 0; - this.encodedSize = 0; - this.encodedSQLSize = 0; - - this.enteringTime = 0; - this.exitingTime = 0; - } - - /** - * Returns the ID of the article. - * - * @return Article ID - */ - public int getArticleId() { - return articleId; - } - - /** - * Returns the name of the article. - * - * @return Article name - */ - public String getArticleName() { - return articleName; - } - - /** - * Returns the diffed size of the article. - * - * @return diffed size - */ - public long getDiffedSize() { - return diffedSize; - } - - /** - * Returns the number of diff parts. 
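// The size estimate used throughout the task pipeline is additive: Diff.byteSize() adds a
// small constant to the DiffPart estimates, and each DiffPart (see byteSize() above)
// contributes 9 bytes of overhead plus one byte per character of text. Illustrative
// arithmetic only; the example text is made up.
import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.Diff;
import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.DiffAction;
import org.dkpro.jwpl.revisionmachine.difftool.data.tasks.content.DiffPart;

final class SizeEstimateSketch
{
    static long estimate()
    {
        DiffPart part = new DiffPart(DiffAction.INSERT);
        part.setText("abc");    // part.byteSize() == 9 + 3 == 12

        Diff diff = new Diff();
        diff.add(part);

        return diff.byteSize(); // 3 + 12 == 15
    }
}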
- * - * @return number of diff parts - */ - public int getDiffPartCounter() { - return diffPartCounter; - } - - /** - * Returns the encoded size of the article. - * - * @return encoded size - */ - public long getEncodedSize() { - return encodedSize; - } - - /** - * Returns the size of the article after the sql encoding. - * - * @return size after encoding - */ - public long getEncodedSQLSize() { - return encodedSQLSize; - } - - /** - * Returns the entering time. - * - * @return entering time - */ - public long getEnteringTime() { - return enteringTime; - } - - /** - * Returns the exiting time. - * - * @return exiting time - */ - public long getExitingTime() { - return exitingTime; - } - - /** - * Returns the number of ignored revisions. - * - * @return number of ignored revisions - */ - public int getIgnoredRevisionsCounter() { - return ignoredRevisionsCounter; - } - - /** - * Returns the original size of the article. - * - * @return original size - */ - public long getOriginalSize() { - return originalSize; - } - - /** - * Returns the time used for the diff encoding. - * - * @return processing time diff - */ - public long getProcessingTimeDiff() { - return processingTimeDiff; - } - - /** - * Returns the time used for reading the task. - * - * @return processing time reading - */ - public long getProcessingTimeRead() { - return processingTimeRead; - } - - /** - * Returns the time used for the sql encoding. - * - * @return processing time encoding - */ - public long getProcessingTimeSQL() { - return processingTimeSQL; - } - - /** - * Returns the number of parsed revisions. - * - * @return number of parsed revisions - */ - public int getReadRevisionCounter() { - return readRevisionCounter; - } - - /** - * Returns the revision counter. - * - * @return revision counter - */ - public int getRevisionCounter() { - return revisionCounter; - } - - /** - * Sets the ID of the article. - * - * @param articleId Article ID - */ - public void setArticleId(final int articleId) { - this.articleId = articleId; - } - - /** - * Sets the name of the article. - * - * @param articleName Article name - */ - public void setArticleName(final String articleName) { - this.articleName = articleName; - } - - /** - * Sets the diffed size of the article. - * - * @param diffedSize diffed size - */ - public void setDiffedSize(final long diffedSize) { - this.diffedSize = diffedSize; - } - - /** - * Sets the number of diff parts. - * - * @param diffPartCounter number of diff parts - */ - public void setDiffPartCounter(final int diffPartCounter) { - this.diffPartCounter = diffPartCounter; - } - - /** - * Sets the encoded size of the article. - * - * @param encodedSize encoded size - */ - public void setEncodedSize(final long encodedSize) { - this.encodedSize = encodedSize; - } - - /** - * Sets the size of the article after the sql encoding. - * - * @param encodedSQLSize size after encoding - */ - public void setEncodedSQLSize(final long encodedSQLSize) { - this.encodedSQLSize = encodedSQLSize; - } - - /** - * Sets the entering time of the first task for this article. - * - * @param enteringTime entering time - */ - public void setEnteringTime(final long enteringTime) { - this.enteringTime = enteringTime; - } - - /** - * Sets the exiting time of the last task for this article. - * - * @param exitingTime exiting time - */ - public void setExitingTime(final long exitingTime) { - this.exitingTime = exitingTime; - } - - /** - * Sets the number of ignored revisions. 
- * - * @param ignoredRevisionsCounter number of ignored revisions - */ - public void setIgnoredRevisionsCounter(final int ignoredRevisionsCounter) { - this.ignoredRevisionsCounter = ignoredRevisionsCounter; - } - - /** - * Sets the original size of the article. - * - * @param originalSize original size - */ - public void setOriginalSize(final long originalSize) { - this.originalSize = originalSize; - } - - /** - * Sets the time used for the diff encoding. - * - * @param processingTimeDiff processing time diff - */ - public void setProcessingTimeDiff(final long processingTimeDiff) { - this.processingTimeDiff = processingTimeDiff; - } - - /** - * Sets the time used for reading the task. - * - * @param processingTimeRead processing time reading - */ - public void setProcessingTimeRead(final long processingTimeRead) { - this.processingTimeRead = processingTimeRead; - } - - /** - * Sets the time used for the sql encoding. - * - * @param processingTimeSQL processing time encoding - */ - public void setProcessingTimeSQL(final long processingTimeSQL) { - this.processingTimeSQL = processingTimeSQL; - } - - /** - * Sets the number of parsed revisions. - * - * @param readRevisionCounter number of parsed revisions - */ - public void setReadRevisionCounter(final int readRevisionCounter) { - this.readRevisionCounter = readRevisionCounter; - } - - /** - * Sets the revision counter. - * - * @param nrRevisions revision counter - */ - public void setRevisionCounter(final int nrRevisions) { - this.revisionCounter = nrRevisions; - } - - /** - * Returns the string representation of this object. Used for logging the - * statistical data. - * - * @return content representation - */ - public String toString() { - - long sysTime = this.exitingTime - this.enteringTime; - - StringBuilder b = new StringBuilder(); - b.append("\n[\tARTICLEID: \t"); - b.append(articleId); - b.append("\r\n\tARTICLENAME: \t"); - b.append(articleName); - b.append("\r\n\r\n\tNUMBER REVISIONS:\t["); - b.append(this.revisionCounter); - b.append(" + "); - b.append(this.ignoredRevisionsCounter); - b.append(" = "); - b.append(this.readRevisionCounter); - b.append("]\r\n\tNUMBER DIFFPARTS:\t"); - b.append(this.diffPartCounter); - b.append("\r\n\r\n\tSYSTEM TIME: \t[ 100% ]\t"); - b.append(Time.toClock(sysTime)); - b.append("\r\n\tREADING TIME: \t["); - b.append(MathUtilities.percentFrom(this.processingTimeRead, sysTime)); - b.append("]\t"); - b.append(Time.toClock(this.processingTimeRead)); - b.append("\r\n\tDIFFING TIME: \t["); - b.append(MathUtilities.percentFrom(this.processingTimeDiff, sysTime)); - b.append("]\t"); - b.append(Time.toClock(this.processingTimeDiff)); - b.append("\r\n\tENCODING TIME: \t["); - b.append(MathUtilities.percentFrom(this.processingTimeSQL, sysTime)); - b.append("]\t"); - b.append(Time.toClock(this.processingTimeSQL)); - b.append("\r\n\r\n\tORIGINAL SIZE: \t[ 100% ]\t"); - b.append(this.originalSize); - b.append("\r\n\tDIFFED SIZE: \t["); - b.append(MathUtilities.percentFrom(this.diffedSize, this.originalSize)); - b.append("]\t"); - b.append(this.diffedSize); - b.append("\r\n\tENCODED SIZE: \t["); - b.append(MathUtilities.percentFrom(this.encodedSize, this.originalSize)); - b.append("]\t"); - b.append(this.encodedSize); - b.append("\r\n\tENCODED UNCOMPRESSED SIZE: \t["); - b.append(MathUtilities.percentFrom(this.encodedSQLSize, - this.originalSize)); - b.append("]\t"); - b.append(this.encodedSQLSize); - b.append("\r\n]\r\n"); - - return b.toString(); - } +public class ArticleInformation +{ + + /** + * Article ID + */ 
+ private int articleId; + + /** + * Name of the article + */ + private String articleName; + + /** + * Diffed size of the article + */ + private long diffedSize; + + /** + * Number of diff parts used + */ + private int diffPartCounter; + + /** + * Encoded size of the article + */ + private long encodedSize; + + /** + * UNCOMPRESSED encoded size of the article + */ + private long encodedSQLSize; + + /** + * Time the task entered the system + */ + private long enteringTime; + + /** + * Time the task exited the system + */ + private long exitingTime; + + /** + * Number of ignored revisions + */ + private int ignoredRevisionsCounter; + + /** + * Original size of the article + */ + private long originalSize; + + /** + * Time used to diff the task + */ + private long processingTimeDiff; + + /** + * Time used to read the task + */ + private long processingTimeRead; + + /** + * Time used to encode the task + */ + private long processingTimeSQL; + + /** + * Number of parsed revisions related to this article + */ + private int readRevisionCounter; + + /** + * Value of the revision counter after finishing the diff processing + */ + private int revisionCounter; + + /** + * (Constructor) Creates a new ArticleInformation object. + */ + public ArticleInformation() + { + this.articleId = -1; + // this.timeStamp = null; + this.articleName = null; + + this.revisionCounter = 0; + this.ignoredRevisionsCounter = 0; + this.diffPartCounter = 0; + + this.originalSize = 0; + this.diffedSize = 0; + this.encodedSize = 0; + this.encodedSQLSize = 0; + + this.enteringTime = 0; + this.exitingTime = 0; + } + + /** + * Returns the ID of the article. + * + * @return Article ID + */ + public int getArticleId() + { + return articleId; + } + + /** + * Returns the name of the article. + * + * @return Article name + */ + public String getArticleName() + { + return articleName; + } + + /** + * Returns the diffed size of the article. + * + * @return diffed size + */ + public long getDiffedSize() + { + return diffedSize; + } + + /** + * Returns the number of diff parts. + * + * @return number of diff parts + */ + public int getDiffPartCounter() + { + return diffPartCounter; + } + + /** + * Returns the encoded size of the article. + * + * @return encoded size + */ + public long getEncodedSize() + { + return encodedSize; + } + + /** + * Returns the size of the article after the sql encoding. + * + * @return size after encoding + */ + public long getEncodedSQLSize() + { + return encodedSQLSize; + } + + /** + * Returns the entering time. + * + * @return entering time + */ + public long getEnteringTime() + { + return enteringTime; + } + + /** + * Returns the exiting time. + * + * @return exiting time + */ + public long getExitingTime() + { + return exitingTime; + } + + /** + * Returns the number of ignored revisions. + * + * @return number of ignored revisions + */ + public int getIgnoredRevisionsCounter() + { + return ignoredRevisionsCounter; + } + + /** + * Returns the original size of the article. + * + * @return original size + */ + public long getOriginalSize() + { + return originalSize; + } + + /** + * Returns the time used for the diff encoding. + * + * @return processing time diff + */ + public long getProcessingTimeDiff() + { + return processingTimeDiff; + } + + /** + * Returns the time used for reading the task. + * + * @return processing time reading + */ + public long getProcessingTimeRead() + { + return processingTimeRead; + } + + /** + * Returns the time used for the sql encoding. 
+ * + * @return processing time encoding + */ + public long getProcessingTimeSQL() + { + return processingTimeSQL; + } + + /** + * Returns the number of parsed revisions. + * + * @return number of parsed revisions + */ + public int getReadRevisionCounter() + { + return readRevisionCounter; + } + + /** + * Returns the revision counter. + * + * @return revision counter + */ + public int getRevisionCounter() + { + return revisionCounter; + } + + /** + * Sets the ID of the article. + * + * @param articleId + * Article ID + */ + public void setArticleId(final int articleId) + { + this.articleId = articleId; + } + + /** + * Sets the name of the article. + * + * @param articleName + * Article name + */ + public void setArticleName(final String articleName) + { + this.articleName = articleName; + } + + /** + * Sets the diffed size of the article. + * + * @param diffedSize + * diffed size + */ + public void setDiffedSize(final long diffedSize) + { + this.diffedSize = diffedSize; + } + + /** + * Sets the number of diff parts. + * + * @param diffPartCounter + * number of diff parts + */ + public void setDiffPartCounter(final int diffPartCounter) + { + this.diffPartCounter = diffPartCounter; + } + + /** + * Sets the encoded size of the article. + * + * @param encodedSize + * encoded size + */ + public void setEncodedSize(final long encodedSize) + { + this.encodedSize = encodedSize; + } + + /** + * Sets the size of the article after the sql encoding. + * + * @param encodedSQLSize + * size after encoding + */ + public void setEncodedSQLSize(final long encodedSQLSize) + { + this.encodedSQLSize = encodedSQLSize; + } + + /** + * Sets the entering time of the first task for this article. + * + * @param enteringTime + * entering time + */ + public void setEnteringTime(final long enteringTime) + { + this.enteringTime = enteringTime; + } + + /** + * Sets the exiting time of the last task for this article. + * + * @param exitingTime + * exiting time + */ + public void setExitingTime(final long exitingTime) + { + this.exitingTime = exitingTime; + } + + /** + * Sets the number of ignored revisions. + * + * @param ignoredRevisionsCounter + * number of ignored revisions + */ + public void setIgnoredRevisionsCounter(final int ignoredRevisionsCounter) + { + this.ignoredRevisionsCounter = ignoredRevisionsCounter; + } + + /** + * Sets the original size of the article. + * + * @param originalSize + * original size + */ + public void setOriginalSize(final long originalSize) + { + this.originalSize = originalSize; + } + + /** + * Sets the time used for the diff encoding. + * + * @param processingTimeDiff + * processing time diff + */ + public void setProcessingTimeDiff(final long processingTimeDiff) + { + this.processingTimeDiff = processingTimeDiff; + } + + /** + * Sets the time used for reading the task. + * + * @param processingTimeRead + * processing time reading + */ + public void setProcessingTimeRead(final long processingTimeRead) + { + this.processingTimeRead = processingTimeRead; + } + + /** + * Sets the time used for the sql encoding. + * + * @param processingTimeSQL + * processing time encoding + */ + public void setProcessingTimeSQL(final long processingTimeSQL) + { + this.processingTimeSQL = processingTimeSQL; + } + + /** + * Sets the number of parsed revisions. + * + * @param readRevisionCounter + * number of parsed revisions + */ + public void setReadRevisionCounter(final int readRevisionCounter) + { + this.readRevisionCounter = readRevisionCounter; + } + + /** + * Sets the revision counter. 
+ * + * @param nrRevisions + * revision counter + */ + public void setRevisionCounter(final int nrRevisions) + { + this.revisionCounter = nrRevisions; + } + + /** + * Returns the string representation of this object. Used for logging the statistical data. + * + * @return content representation + */ + public String toString() + { + + long sysTime = this.exitingTime - this.enteringTime; + + StringBuilder b = new StringBuilder(); + b.append("\n[\tARTICLEID: \t"); + b.append(articleId); + b.append("\r\n\tARTICLENAME: \t"); + b.append(articleName); + b.append("\r\n\r\n\tNUMBER REVISIONS:\t["); + b.append(this.revisionCounter); + b.append(" + "); + b.append(this.ignoredRevisionsCounter); + b.append(" = "); + b.append(this.readRevisionCounter); + b.append("]\r\n\tNUMBER DIFFPARTS:\t"); + b.append(this.diffPartCounter); + b.append("\r\n\r\n\tSYSTEM TIME: \t[ 100% ]\t"); + b.append(Time.toClock(sysTime)); + b.append("\r\n\tREADING TIME: \t["); + b.append(MathUtilities.percentFrom(this.processingTimeRead, sysTime)); + b.append("]\t"); + b.append(Time.toClock(this.processingTimeRead)); + b.append("\r\n\tDIFFING TIME: \t["); + b.append(MathUtilities.percentFrom(this.processingTimeDiff, sysTime)); + b.append("]\t"); + b.append(Time.toClock(this.processingTimeDiff)); + b.append("\r\n\tENCODING TIME: \t["); + b.append(MathUtilities.percentFrom(this.processingTimeSQL, sysTime)); + b.append("]\t"); + b.append(Time.toClock(this.processingTimeSQL)); + b.append("\r\n\r\n\tORIGINAL SIZE: \t[ 100% ]\t"); + b.append(this.originalSize); + b.append("\r\n\tDIFFED SIZE: \t["); + b.append(MathUtilities.percentFrom(this.diffedSize, this.originalSize)); + b.append("]\t"); + b.append(this.diffedSize); + b.append("\r\n\tENCODED SIZE: \t["); + b.append(MathUtilities.percentFrom(this.encodedSize, this.originalSize)); + b.append("]\t"); + b.append(this.encodedSize); + b.append("\r\n\tENCODED UNCOMPRESSED SIZE: \t["); + b.append(MathUtilities.percentFrom(this.encodedSQLSize, this.originalSize)); + b.append("]\t"); + b.append(this.encodedSQLSize); + b.append("\r\n]\r\n"); + + return b.toString(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexGenerator.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexGenerator.java index 222bfd28..232b8bda 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexGenerator.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexGenerator.java @@ -34,181 +34,201 @@ /** * Generates the indices for the database. */ -public class IndexGenerator { - - /** - * Reference to the configuration - */ - private final RevisionAPIConfiguration config; - - /** - * Creates a new IndexGenerator object. - * - * @param config Reference to the configuration - */ - public IndexGenerator(final RevisionAPIConfiguration config) { - this.config = config; - } - - /** - * Starts the generation of the indices. 
- * - * @throws WikiApiException if an error occurs - */ - public void generate() throws WikiApiException { - Indexer data = null; - try { - data = new Indexer(config); - - System.out.println("GENERATING INDEX STARTED"); - - long bufferSize = config.getBufferSize(); - Revision rev; - long count = 0; - long last = 0, now, start = System.currentTimeMillis(); - - Iterator it = new IndexIterator(config); - while (it.hasNext()) { - - if (++count % bufferSize == 0) { - now = System.currentTimeMillis() - start; - System.out.println(Time.toClock(now) + "\t" + (now - last) - + "\tINDEXING " + count); - last = now; - } +public class IndexGenerator +{ + + /** + * Reference to the configuration + */ + private final RevisionAPIConfiguration config; + + /** + * Creates a new IndexGenerator object. + * + * @param config + * Reference to the configuration + */ + public IndexGenerator(final RevisionAPIConfiguration config) + { + this.config = config; + } + + /** + * Starts the generation of the indices. + * + * @throws WikiApiException + * if an error occurs + */ + public void generate() throws WikiApiException + { + Indexer data = null; + try { + data = new Indexer(config); + + System.out.println("GENERATING INDEX STARTED"); - rev = it.next(); - data.index(rev); - } + long bufferSize = config.getBufferSize(); + Revision rev; + long count = 0; + long last = 0, now, start = System.currentTimeMillis(); - System.out.println("GENERATING INDEX ENDED + (" - + Time.toClock(System.currentTimeMillis() - start) + ")"); + Iterator it = new IndexIterator(config); + while (it.hasNext()) { - } catch (Exception e) { + if (++count % bufferSize == 0) { + now = System.currentTimeMillis() - start; + System.out.println( + Time.toClock(now) + "\t" + (now - last) + "\tINDEXING " + count); + last = now; + } + + rev = it.next(); + data.index(rev); + } + + System.out.println("GENERATING INDEX ENDED + (" + + Time.toClock(System.currentTimeMillis() - start) + ")"); + + } + catch (Exception e) { - throw new WikiApiException(e); + throw new WikiApiException(e); - } finally { - if (data != null) { - data.close(); - } + } + finally { + if (data != null) { + data.close(); + } + } } - } - - /** - * Starts index generation using the database credentials in the - * properties file specified in args[0].
- * The properties file should have the following structure:
- * <ul>
- * <li>host=dbhost</li>
- * <li>db=revisiondb</li>
- * <li>user=username</li>
- * <li>password=pwd</li>
- * <li>output=outputFile</li>
- * <li>writeDirectlyToDB=true|false (optional)</li>
- * <li>charset=UTF8 (or others) (optional)</li>
- * <li>buffer=15000 (optional)</li>
- * <li>maxAllowedPackets=16760832 (optional)</li>
- * </ul>
- * - * @param args allows only one entry that contains the path to the config file - */ - public static void main(String[] args) { - - if (args == null || args.length != 1) { - System.out.println(("You need to specify the database configuration file. \n" + - "It should contain the access credentials to you revision database in the following format: \n" + - " host=dbhost \n" + - " db=revisiondb \n" + - " user=username \n" + - " password=pwd \n" + - " output=outputFile \n" + - " outputDatabase=true|false (optional)\n" + - " outputDatafile=true|false (optional)\n" + - " charset=UTF8 (optional)\n" + - " buffer=15000 (optional)\n" + - " maxAllowedPackets=16760832 (optional)\n\n" + - " The default output mode is SQL Dump")); - throw new IllegalArgumentException(); - } else { - Properties props = load(args[0]); - - RevisionAPIConfiguration config = new RevisionAPIConfiguration(); - - config.setHost(props.getProperty("host")); - config.setDatabase(props.getProperty("db")); - config.setUser(props.getProperty("user")); - config.setPassword(props.getProperty("password")); - - String charset = props.getProperty("charset"); - String buffer = props.getProperty("buffer"); - String maxAllowedPackets = props.getProperty("maxAllowedPackets"); - - config.setCharacterSet(Objects.requireNonNullElse(charset, "UTF-8")); - - if (buffer != null) { - config.setBufferSize(Integer.parseInt(buffer)); - } else { - config.setBufferSize(15000); - } - - if (maxAllowedPackets != null) { - config.setMaxAllowedPacket(Long.parseLong(maxAllowedPackets)); - } else { - config.setMaxAllowedPacket(16 * 1024 * 1023); - } - - if (props.getProperty("outputDatabase") != null && Boolean.parseBoolean(props.getProperty("outputDatabase"))) { - config.setOutputType(OutputTypes.DATABASE); - } else if (props.getProperty("outputDatafile") != null && Boolean.parseBoolean(props.getProperty("outputDatafile"))) { - config.setOutputType(OutputTypes.DATAFILE); - } else { - config.setOutputType(OutputTypes.SQL); - } - - String output = props.getProperty("output"); - File outfile = new File(output); - if (outfile.isDirectory()) { - config.setOutputPath(output); - } else { - config.setOutputPath(outfile.getParentFile().getPath()); - } - - try { - new IndexGenerator(config).generate(); - } catch (Exception e) { - e.printStackTrace(); - } - - System.out.println("TERMINATED"); + + /** + * Starts index generation using the database credentials in the properties file specified in + * args[0].
+ * The properties file should have the following structure:
+ * <ul>
+ * <li>host=dbhost</li>
+ * <li>db=revisiondb</li>
+ * <li>user=username</li>
+ * <li>password=pwd</li>
+ * <li>output=outputFile</li>
+ * <li>writeDirectlyToDB=true|false (optional)</li>
+ * <li>charset=UTF8 (or others) (optional)</li>
+ * <li>buffer=15000 (optional)</li>
+ * <li>maxAllowedPackets=16760832 (optional)</li>
+ * </ul>
+ *
+ * + * @param args + * allows only one entry that contains the path to the config file + */ + public static void main(String[] args) + { + + if (args == null || args.length != 1) { + System.out.println(("You need to specify the database configuration file. \n" + + "It should contain the access credentials to you revision database in the following format: \n" + + " host=dbhost \n" + " db=revisiondb \n" + " user=username \n" + + " password=pwd \n" + " output=outputFile \n" + + " outputDatabase=true|false (optional)\n" + + " outputDatafile=true|false (optional)\n" + " charset=UTF8 (optional)\n" + + " buffer=15000 (optional)\n" + " maxAllowedPackets=16760832 (optional)\n\n" + + " The default output mode is SQL Dump")); + throw new IllegalArgumentException(); + } + else { + Properties props = load(args[0]); + + RevisionAPIConfiguration config = new RevisionAPIConfiguration(); + + config.setHost(props.getProperty("host")); + config.setDatabase(props.getProperty("db")); + config.setUser(props.getProperty("user")); + config.setPassword(props.getProperty("password")); + + String charset = props.getProperty("charset"); + String buffer = props.getProperty("buffer"); + String maxAllowedPackets = props.getProperty("maxAllowedPackets"); + + config.setCharacterSet(Objects.requireNonNullElse(charset, "UTF-8")); + + if (buffer != null) { + config.setBufferSize(Integer.parseInt(buffer)); + } + else { + config.setBufferSize(15000); + } + + if (maxAllowedPackets != null) { + config.setMaxAllowedPacket(Long.parseLong(maxAllowedPackets)); + } + else { + config.setMaxAllowedPacket(16 * 1024 * 1023); + } + + if (props.getProperty("outputDatabase") != null + && Boolean.parseBoolean(props.getProperty("outputDatabase"))) { + config.setOutputType(OutputTypes.DATABASE); + } + else if (props.getProperty("outputDatafile") != null + && Boolean.parseBoolean(props.getProperty("outputDatafile"))) { + config.setOutputType(OutputTypes.DATAFILE); + } + else { + config.setOutputType(OutputTypes.SQL); + } + + String output = props.getProperty("output"); + File outfile = new File(output); + if (outfile.isDirectory()) { + config.setOutputPath(output); + } + else { + config.setOutputPath(outfile.getParentFile().getPath()); + } + + try { + new IndexGenerator(config).generate(); + } + catch (Exception e) { + e.printStackTrace(); + } + + System.out.println("TERMINATED"); + } } - } - - /** - * Load a properties file from the classpath - * - * @param configFilePath path to the configuration file - * @return Properties the properties object containing the configuration - * data - */ - private static Properties load(String configFilePath) { - Properties props = new Properties(); - BufferedInputStream fis = null; - try { - File configFile = new File(configFilePath); - fis = new BufferedInputStream(new FileInputStream(configFile)); - props.load(fis); - } catch (IOException e) { - System.err.println("Could not load configuration file " + configFilePath); - } finally { - if (fis != null) { + + /** + * Load a properties file from the classpath + * + * @param configFilePath + * path to the configuration file + * @return Properties the properties object containing the configuration data + */ + private static Properties load(String configFilePath) + { + Properties props = new Properties(); + BufferedInputStream fis = null; try { - fis.close(); - } catch (IOException e) { - System.err.println("Error closing file stream of configuration file " + configFilePath); + File configFile = new File(configFilePath); + fis = new BufferedInputStream(new 
FileInputStream(configFile)); + props.load(fis); + } + catch (IOException e) { + System.err.println("Could not load configuration file " + configFilePath); + } + finally { + if (fis != null) { + try { + fis.close(); + } + catch (IOException e) { + System.err.println( + "Error closing file stream of configuration file " + configFilePath); + } + } } - } + return props; } - return props; - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexIterator.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexIterator.java index 4969a33d..73f9c1cb 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexIterator.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/IndexIterator.java @@ -30,148 +30,159 @@ import org.dkpro.jwpl.revisionmachine.api.RevisionAPIConfiguration; /** - * Iterates over the database to retrieve the necessary information for the - * index generation. + * Iterates over the database to retrieve the necessary information for the index generation. */ -public class IndexIterator implements Iterator { - - /** - * Reference to the database connection - */ - private final Connection connection; - - /** - * Reference to the ResultSet - */ - private ResultSet result; - - /** - * Reference to the statement - */ - private Statement statement; - - /** - * Currently used primary kes - */ - private int primaryKey; - - /** - * Configuration parameter - maximum size of a result set - */ - private final int MAX_NUMBER_RESULTS; - - /** - * Creates the IndexIterator object. - * - * @param config Reference to the configuration - * @throws WikiApiException if an error occurs - */ - public IndexIterator(final RevisionAPIConfiguration config) throws WikiApiException { - - try { - this.primaryKey = -1; - - this.statement = null; - this.result = null; - - String driverDB = "com.mysql.jdbc.Driver"; - Class.forName(driverDB); - - MAX_NUMBER_RESULTS = config.getBufferSize(); - - this.connection = DriverManager.getConnection("jdbc:mysql://" - + config.getHost() + "/" + config.getDatabase(), - config.getUser(), config.getPassword()); - - } catch (SQLException | ClassNotFoundException e) { - throw new WikiApiException(e); +public class IndexIterator + implements Iterator +{ + + /** + * Reference to the database connection + */ + private final Connection connection; + + /** + * Reference to the ResultSet + */ + private ResultSet result; + + /** + * Reference to the statement + */ + private Statement statement; + + /** + * Currently used primary kes + */ + private int primaryKey; + + /** + * Configuration parameter - maximum size of a result set + */ + private final int MAX_NUMBER_RESULTS; + + /** + * Creates the IndexIterator object. 
+ * + * @param config + * Reference to the configuration + * @throws WikiApiException + * if an error occurs + */ + public IndexIterator(final RevisionAPIConfiguration config) throws WikiApiException + { + + try { + this.primaryKey = -1; + + this.statement = null; + this.result = null; + + String driverDB = "com.mysql.jdbc.Driver"; + Class.forName(driverDB); + + MAX_NUMBER_RESULTS = config.getBufferSize(); + + this.connection = DriverManager.getConnection( + "jdbc:mysql://" + config.getHost() + "/" + config.getDatabase(), + config.getUser(), config.getPassword()); + + } + catch (SQLException | ClassNotFoundException e) { + throw new WikiApiException(e); + } } - } - - /** - * Queries the database for more revision information. - * - * @return TRUE if the result set contains elements FALSE otherwise - * @throws SQLException if an error occurs while accessing the database - */ - private boolean query() throws SQLException { - statement = this.connection.createStatement(); - - String query = "SELECT PrimaryKey, RevisionCounter," - + " RevisionID, ArticleID, Timestamp, FullRevisionID FROM revisions"; - - if (primaryKey > 0) { - query += " WHERE PrimaryKey > " + primaryKey; + + /** + * Queries the database for more revision information. + * + * @return TRUE if the result set contains elements FALSE otherwise + * @throws SQLException + * if an error occurs while accessing the database + */ + private boolean query() throws SQLException + { + statement = this.connection.createStatement(); + + String query = "SELECT PrimaryKey, RevisionCounter," + + " RevisionID, ArticleID, Timestamp, FullRevisionID FROM revisions"; + + if (primaryKey > 0) { + query += " WHERE PrimaryKey > " + primaryKey; + } + + if (MAX_NUMBER_RESULTS > 0) { + query += " LIMIT " + MAX_NUMBER_RESULTS; + } + + result = statement.executeQuery(query); + return result.next(); } - if (MAX_NUMBER_RESULTS > 0) { - query += " LIMIT " + MAX_NUMBER_RESULTS; + /** + * Returns the next revision information. (Does not contain the encoded diff) + * + * @return Revision + */ + @Override + public Revision next() + { + try { + Revision revision = new Revision(result.getInt(2)); + + this.primaryKey = result.getInt(1); + revision.setPrimaryKey(this.primaryKey); + + revision.setRevisionID(result.getInt(3)); + revision.setArticleID(result.getInt(4)); + revision.setTimeStamp(new Timestamp(result.getLong(5))); + revision.setFullRevisionID(result.getInt(6)); + + return revision; + + } + catch (Exception e) { + throw new RuntimeException(e); + } } - result = statement.executeQuery(query); - return result.next(); - } - - /** - * Returns the next revision information. (Does not contain the encoded - * diff) - * - * @return Revision - */ - @Override - public Revision next() { - try { - Revision revision = new Revision(result.getInt(2)); - - this.primaryKey = result.getInt(1); - revision.setPrimaryKey(this.primaryKey); - - revision.setRevisionID(result.getInt(3)); - revision.setArticleID(result.getInt(4)); - revision.setTimeStamp(new Timestamp(result.getLong(5))); - revision.setFullRevisionID(result.getInt(6)); - - return revision; - - } catch (Exception e) { - throw new RuntimeException(e); + /** + * Returns TRUE if another revision information is available. 
+ * + * @return TRUE | FALSE + */ + @Override + public boolean hasNext() + { + try { + if (result != null && result.next()) { + return true; + } + + if (this.statement != null) { + this.statement.close(); + } + if (this.result != null) { + this.result.close(); + } + + return query(); + + } + catch (SQLException e) { + throw new RuntimeException(e); + } } - } - - /** - * Returns TRUE if another revision information is available. - * - * @return TRUE | FALSE - */ - @Override - public boolean hasNext() { - try { - if (result != null && result.next()) { - return true; - } - - if (this.statement != null) { - this.statement.close(); - } - if (this.result != null) { - this.result.close(); - } - - return query(); - - } catch (SQLException e) { - throw new RuntimeException(e); + + /** + * unsupported method + * + * @throws UnsupportedOperationException + * @deprecated Don't cal this method as it will throw an exception at runtime. + */ + @Deprecated(since = "1.1") + public void remove() + { + throw new UnsupportedOperationException(); } - } - - /** - * unsupported method - * - * @throws UnsupportedOperationException - * @deprecated Don't cal this method as it will throw an exception at runtime. - */ - @Deprecated(since = "1.1") - public void remove() { - throw new UnsupportedOperationException(); - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/Indexer.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/Indexer.java index 521e556d..a56185ae 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/Indexer.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/Indexer.java @@ -36,240 +36,252 @@ import org.dkpro.jwpl.revisionmachine.index.writer.SQLFileWriter; /** - * Forwards the necessary information to the AbstractIndex classes and controls - * the writing to the output if one of the index has reached the maximum size. + * Forwards the necessary information to the AbstractIndex classes and controls the writing to the + * output if one of the index has reached the maximum size. */ -public class Indexer { - - /** - * Currently used article - */ - private int currentArticleID; - - /** - * First appearance of the current article - */ - private long startTime; - - /** - * Last appearance of the current article - */ - private long endTime; - - /** - * Currently used full revision - */ - private int currentFullRevisionID; - - /** - * Previous revision - */ - private Revision lastRev; - - /** - * Reference to the revision index - */ - private RevisionIndex revisionIndex = null; - - /** - * Reference to the currently used article index information - */ - private ArticleIndexData info; - - /** - * List of article index information related to the currently used article - */ - private final List infoList; - - /** - * Reference to the article index - */ - private ArticleIndex articleIndex = null; - - /** - * Reference to the chronological order index - */ - private ChronoIndex chronoIndex = null; - - /** - * Reference to the output writer - */ - private IndexWriterInterface indexWriter; - - /** - * Reference to the database connection - */ - private final Connection connection = null; - - /** - * (Constructor) Creates a Index object. 
- * - * @param config Reference to the configuration - * @throws ClassNotFoundException if the jdbc classes could not be located - * @throws SQLException if an error occurred while accessing the database - * @throws IOException if an error occurred while writing the output - */ - public Indexer(final RevisionAPIConfiguration config) - throws ClassNotFoundException, SQLException, IOException { - - this.currentArticleID = -1; - - switch (config.getOutputType()) { - case DATABASE: - case SQL: - //Indices with SQL statements - this.revisionIndex = new RevisionIndex(config.getMaxAllowedPacket()); - this.articleIndex = new ArticleIndex(config.getMaxAllowedPacket()); - this.chronoIndex = new ChronoIndex(config.getMaxAllowedPacket()); - break; - case DATAFILE: - //Indices without SQL statements - this.revisionIndex = new RevisionIndex(); - this.articleIndex = new ArticleIndex(); - this.chronoIndex = new ChronoIndex(); - break; - } - - this.infoList = new ArrayList<>(); - - switch (config.getOutputType()) { - case DATABASE: - this.indexWriter = new DatabaseWriter(config); - break; - case SQL: - this.indexWriter = new SQLFileWriter(config); - break; - case DATAFILE: - this.indexWriter = new DataFileWriter(config); - break; - } - } - - /** - * Checks whether the AbstractIndex classes have output available and - * forward them to the output writer. - * - * @throws IOException if an error occurred while writing the output - * @throws SQLException if an error occurred while accessing the database - */ - private void send() - throws IOException, SQLException { - - this.indexWriter.write(articleIndex); - this.indexWriter.write(revisionIndex); - this.indexWriter.write(chronoIndex); - } - - /** - * Processes the given revision. - * - * @param rev Reference to a revision - * @throws WikiApiException if an error occurs - */ - public void index(final Revision rev) throws WikiApiException { - - int articleID = rev.getArticleID(); - int fullRevisionID = rev.getFullRevisionID(); - int revisionCounter = rev.getRevisionCounter(); - - if (articleID != currentArticleID) { - - if (lastRev != null) { - info.setEndRevisionCount(lastRev.getRevisionCounter()); - this.infoList.add(info); - - try { - this.articleIndex.add(currentArticleID, startTime, endTime, - infoList); - send(); - } catch (SQLException | IOException sql) { - sql.printStackTrace(); - throw new WikiApiException(sql); +public class Indexer +{ + + /** + * Currently used article + */ + private int currentArticleID; + + /** + * First appearance of the current article + */ + private long startTime; + + /** + * Last appearance of the current article + */ + private long endTime; + + /** + * Currently used full revision + */ + private int currentFullRevisionID; + + /** + * Previous revision + */ + private Revision lastRev; + + /** + * Reference to the revision index + */ + private RevisionIndex revisionIndex = null; + + /** + * Reference to the currently used article index information + */ + private ArticleIndexData info; + + /** + * List of article index information related to the currently used article + */ + private final List infoList; + + /** + * Reference to the article index + */ + private ArticleIndex articleIndex = null; + + /** + * Reference to the chronological order index + */ + private ChronoIndex chronoIndex = null; + + /** + * Reference to the output writer + */ + private IndexWriterInterface indexWriter; + + /** + * Reference to the database connection + */ + private final Connection connection = null; + + /** + * (Constructor) Creates a Index 
object. + * + * @param config + * Reference to the configuration + * @throws ClassNotFoundException + * if the jdbc classes could not be located + * @throws SQLException + * if an error occurred while accessing the database + * @throws IOException + * if an error occurred while writing the output + */ + public Indexer(final RevisionAPIConfiguration config) + throws ClassNotFoundException, SQLException, IOException + { + + this.currentArticleID = -1; + + switch (config.getOutputType()) { + case DATABASE: + case SQL: + // Indices with SQL statements + this.revisionIndex = new RevisionIndex(config.getMaxAllowedPacket()); + this.articleIndex = new ArticleIndex(config.getMaxAllowedPacket()); + this.chronoIndex = new ChronoIndex(config.getMaxAllowedPacket()); + break; + case DATAFILE: + // Indices without SQL statements + this.revisionIndex = new RevisionIndex(); + this.articleIndex = new ArticleIndex(); + this.chronoIndex = new ChronoIndex(); + break; } - } - if (revisionCounter != 1) { - System.err.println("WARNING : ArticleID (" + articleID - + ") RevisionCounter 1 expected - " + revisionCounter - + " read"); - } + this.infoList = new ArrayList<>(); + + switch (config.getOutputType()) { + case DATABASE: + this.indexWriter = new DatabaseWriter(config); + break; + case SQL: + this.indexWriter = new SQLFileWriter(config); + break; + case DATAFILE: + this.indexWriter = new DataFileWriter(config); + break; + } + } - startTime = Long.MAX_VALUE; - endTime = Long.MIN_VALUE; + /** + * Checks whether the AbstractIndex classes have output available and forward them to the output + * writer. + * + * @throws IOException + * if an error occurred while writing the output + * @throws SQLException + * if an error occurred while accessing the database + */ + private void send() throws IOException, SQLException + { + + this.indexWriter.write(articleIndex); + this.indexWriter.write(revisionIndex); + this.indexWriter.write(chronoIndex); + } - currentArticleID = articleID; - currentFullRevisionID = fullRevisionID; + /** + * Processes the given revision. 
+ * + * @param rev + * Reference to a revision + * @throws WikiApiException + * if an error occurs + */ + public void index(final Revision rev) throws WikiApiException + { + + int articleID = rev.getArticleID(); + int fullRevisionID = rev.getFullRevisionID(); + int revisionCounter = rev.getRevisionCounter(); + + if (articleID != currentArticleID) { + + if (lastRev != null) { + info.setEndRevisionCount(lastRev.getRevisionCounter()); + this.infoList.add(info); + + try { + this.articleIndex.add(currentArticleID, startTime, endTime, infoList); + send(); + } + catch (SQLException | IOException sql) { + sql.printStackTrace(); + throw new WikiApiException(sql); + } + } + + if (revisionCounter != 1) { + System.err.println("WARNING : ArticleID (" + articleID + + ") RevisionCounter 1 expected - " + revisionCounter + " read"); + } + + startTime = Long.MAX_VALUE; + endTime = Long.MIN_VALUE; + + currentArticleID = articleID; + currentFullRevisionID = fullRevisionID; + + info = new ArticleIndexData(); + + info.setFullRevisionPrimaryKey(rev.getPrimaryKey()); + info.setStartRevisionCount(rev.getRevisionCounter()); - info = new ArticleIndexData(); + } + else if (fullRevisionID != currentFullRevisionID) { - info.setFullRevisionPrimaryKey(rev.getPrimaryKey()); - info.setStartRevisionCount(rev.getRevisionCounter()); + if (lastRev.getRevisionCounter() + 1 != revisionCounter) { + System.err.println("WARNING : ArticleID (" + articleID + ")" + " RevisionCounter " + + (lastRev.getRevisionCounter() + 1) + " expected - " + revisionCounter + + " read"); + } - } else if (fullRevisionID != currentFullRevisionID) { + info.setEndRevisionCount(lastRev.getRevisionCounter()); + this.infoList.add(info); - if (lastRev.getRevisionCounter() + 1 != revisionCounter) { - System.err.println("WARNING : ArticleID (" + articleID + ")" - + " RevisionCounter " - + (lastRev.getRevisionCounter() + 1) + " expected - " - + revisionCounter + " read"); - } + currentFullRevisionID = fullRevisionID; + info = new ArticleIndexData(); - info.setEndRevisionCount(lastRev.getRevisionCounter()); - this.infoList.add(info); + info.setFullRevisionPrimaryKey(rev.getPrimaryKey()); + info.setStartRevisionCount(rev.getRevisionCounter()); - currentFullRevisionID = fullRevisionID; - info = new ArticleIndexData(); + } + else if (lastRev.getRevisionCounter() + 1 != revisionCounter) { - info.setFullRevisionPrimaryKey(rev.getPrimaryKey()); - info.setStartRevisionCount(rev.getRevisionCounter()); + System.err.println("WARNING : ArticleID (" + articleID + ")" + " RevisionCounter " + + (lastRev.getRevisionCounter() + 1) + " expected - " + revisionCounter + + " read"); + } - } else if (lastRev.getRevisionCounter() + 1 != revisionCounter) { + this.startTime = Math.min(rev.getTimeStamp().getTime(), startTime); + this.endTime = Math.max(rev.getTimeStamp().getTime(), endTime); - System.err.println("WARNING : ArticleID (" + articleID + ")" - + " RevisionCounter " + (lastRev.getRevisionCounter() + 1) - + " expected - " + revisionCounter + " read"); + revisionIndex.add(rev.getRevisionID(), rev.getPrimaryKey(), + info.getFullRevisionPrimaryKey()); + chronoIndex.add(articleID, rev.getRevisionCounter(), rev.getTimeStamp().getTime()); + lastRev = rev; } - this.startTime = Math.min(rev.getTimeStamp().getTime(), startTime); - this.endTime = Math.max(rev.getTimeStamp().getTime(), endTime); - - revisionIndex.add(rev.getRevisionID(), rev.getPrimaryKey(), - info.getFullRevisionPrimaryKey()); - chronoIndex.add(articleID, rev.getRevisionCounter(), rev.getTimeStamp() - .getTime()); - 
lastRev = rev; - } + /** + * Finalizes the indices and sends the rest of the data to the output. Afterwards the database + * connection will be closed. + * + * @throws WikiApiException + * if an error occurs + */ + public void close() throws WikiApiException + { - /** - * Finalizes the indices and sends the rest of the data to the output. - * Afterwards the database connection will be closed. - * - * @throws WikiApiException if an error occurs - */ - public void close() throws WikiApiException { - - try { - this.revisionIndex.finalizeIndex(); - this.chronoIndex.finalizeIndex(); + try { + this.revisionIndex.finalizeIndex(); + this.chronoIndex.finalizeIndex(); - info.setEndRevisionCount(lastRev.getRevisionCounter()); - this.infoList.add(info); + info.setEndRevisionCount(lastRev.getRevisionCounter()); + this.infoList.add(info); - this.articleIndex.add(currentArticleID, startTime, endTime, - infoList); - this.articleIndex.finalizeIndex(); + this.articleIndex.add(currentArticleID, startTime, endTime, infoList); + this.articleIndex.finalizeIndex(); - send(); + send(); - this.indexWriter.finish(); + this.indexWriter.finish(); - if (connection != null) { - this.connection.close(); - } + if (connection != null) { + this.connection.close(); + } - } catch (SQLException | IOException sql) { - sql.printStackTrace(); - throw new WikiApiException(sql); + } + catch (SQLException | IOException sql) { + sql.printStackTrace(); + throw new WikiApiException(sql); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/AbstractIndex.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/AbstractIndex.java index a65cd4fd..1aca1299 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/AbstractIndex.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/AbstractIndex.java @@ -23,113 +23,122 @@ /** * This class represents an abstract index. */ -public abstract class AbstractIndex { - - /** - * Current query buffer - */ - protected StringBuilder buffer; - - /** - * List of contained queries. - */ - private final List bufferList; - - /** - * Insert Statement to use - */ - protected final String insertStatement; - - /** - * MAX_ALLOWED_PACKET - */ - protected long MAX_ALLOWED_PACKET; - - /** - * (Constructor) Creates an index object. - */ - public AbstractIndex() { - - this.bufferList = new ArrayList<>(); - this.buffer = null; - - //does not really matter here- should be big to speed up data file creation - this.MAX_ALLOWED_PACKET = 16760832; - - this.insertStatement = ""; - - storeBuffer(); - } - - /** - * (Constructor) Creates an index object. - * - * @param insertStatement Insert Statement - * @param MAX_ALLOWED_PACKET MAX_ALLOWED_PACKET - */ - public AbstractIndex(final String insertStatement, - final long MAX_ALLOWED_PACKET) { - - this.bufferList = new ArrayList<>(); - this.buffer = null; - - this.MAX_ALLOWED_PACKET = MAX_ALLOWED_PACKET; - - this.insertStatement = insertStatement; - - storeBuffer(); - } - - /** - * Returns the size of the currently used buffer. - * - * @return size of current query - */ - public int byteSize() { - return this.buffer.length(); - } - - /** - * Finalizes the query in the currently used buffer and creates a new one. - * The finalized query will be added to the list of queries. - */ - public void finalizeIndex() { - storeBuffer(); - } - - /** - * Removes a query from the list of queries. 
- * - * @return Buffer containing a finalized query - */ - public StringBuilder remove() { - return this.bufferList.remove(0); - } - - /** - * Returns the current number of buffered queries. - * - * @return size of the list of queries - */ - public int size() { - return bufferList.size(); - } - - /** - * Finalizes the query in the currently used buffer and creates a new one. - * The finalized query will be added to the list of queries. - */ - protected void storeBuffer() { - - if (buffer != null && buffer.length() > insertStatement.length()) { - if (!insertStatement.isEmpty()) { - //only do this in SQL/DATABASE MODE - this.buffer.append(";"); - } - bufferList.add(buffer); +public abstract class AbstractIndex +{ + + /** + * Current query buffer + */ + protected StringBuilder buffer; + + /** + * List of contained queries. + */ + private final List bufferList; + + /** + * Insert Statement to use + */ + protected final String insertStatement; + + /** + * MAX_ALLOWED_PACKET + */ + protected long MAX_ALLOWED_PACKET; + + /** + * (Constructor) Creates an index object. + */ + public AbstractIndex() + { + + this.bufferList = new ArrayList<>(); + this.buffer = null; + + // does not really matter here- should be big to speed up data file creation + this.MAX_ALLOWED_PACKET = 16760832; + + this.insertStatement = ""; + + storeBuffer(); + } + + /** + * (Constructor) Creates an index object. + * + * @param insertStatement + * Insert Statement + * @param MAX_ALLOWED_PACKET + * MAX_ALLOWED_PACKET + */ + public AbstractIndex(final String insertStatement, final long MAX_ALLOWED_PACKET) + { + + this.bufferList = new ArrayList<>(); + this.buffer = null; + + this.MAX_ALLOWED_PACKET = MAX_ALLOWED_PACKET; + + this.insertStatement = insertStatement; + + storeBuffer(); + } + + /** + * Returns the size of the currently used buffer. + * + * @return size of current query + */ + public int byteSize() + { + return this.buffer.length(); + } + + /** + * Finalizes the query in the currently used buffer and creates a new one. The finalized query + * will be added to the list of queries. + */ + public void finalizeIndex() + { + storeBuffer(); + } + + /** + * Removes a query from the list of queries. + * + * @return Buffer containing a finalized query + */ + public StringBuilder remove() + { + return this.bufferList.remove(0); } - this.buffer = new StringBuilder(); - this.buffer.append(insertStatement); - } + /** + * Returns the current number of buffered queries. + * + * @return size of the list of queries + */ + public int size() + { + return bufferList.size(); + } + + /** + * Finalizes the query in the currently used buffer and creates a new one. The finalized query + * will be added to the list of queries. 
+ */ + protected void storeBuffer() + { + + if (buffer != null && buffer.length() > insertStatement.length()) { + if (!insertStatement.isEmpty()) { + // only do this in SQL/DATABASE MODE + this.buffer.append(";"); + } + bufferList.add(buffer); + } + + this.buffer = new StringBuilder(); + this.buffer.append(insertStatement); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndex.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndex.java index d45a946c..c55b1506 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndex.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndex.java @@ -22,93 +22,103 @@ /** * Index for article information. */ -public class ArticleIndex extends AbstractIndex { - /** - * Creates a new ArticleIndex object. - */ - public ArticleIndex() { - - super(); - } - - /** - * Creates a new ArticleIndex object. - * - * @param MAX_ALLOWED_PACKET MAX_ALLOWED_PACKET - */ - public ArticleIndex(final long MAX_ALLOWED_PACKET) { - - super("INSERT INTO index_articleID_rc_ts VALUES ", MAX_ALLOWED_PACKET); - } - - /** - * Adds the information for an new entry in the article index. - * - * @param currentArticleID ID of the currently used article - * @param startTime First date of appearance - * @param endTime Last date of appearance - * @param infoList List of revision blocks - */ - public void add(final int currentArticleID, final long startTime, - final long endTime, final List infoList) { - - // index_articleID_rc_ts - if (!infoList.isEmpty()) { - - StringBuilder fullRevBuffer = new StringBuilder(); - StringBuilder revCountBuffer = new StringBuilder(); - - boolean first = true; - ArticleIndexData info; - while (!infoList.isEmpty()) { - - info = infoList.remove(0); - - if (!first) { - fullRevBuffer.append(" "); - revCountBuffer.append(" "); - } - - fullRevBuffer.append(info.getFullRevisionPrimaryKey()); - - revCountBuffer.append(info.getStartRevisionCount()); - revCountBuffer.append(" "); - revCountBuffer.append(info.getEndRevisionCount()); - - first = false; - } +public class ArticleIndex + extends AbstractIndex +{ + /** + * Creates a new ArticleIndex object. + */ + public ArticleIndex() + { + + super(); + } - boolean sql = !insertStatement.isEmpty(); - if (buffer.length() + fullRevBuffer.length() - + revCountBuffer.length() + 20 >= MAX_ALLOWED_PACKET) { - storeBuffer(); - } + /** + * Creates a new ArticleIndex object. + * + * @param MAX_ALLOWED_PACKET + * MAX_ALLOWED_PACKET + */ + public ArticleIndex(final long MAX_ALLOWED_PACKET) + { + super("INSERT INTO index_articleID_rc_ts VALUES ", MAX_ALLOWED_PACKET); + } - if (sql) { - if (buffer.length() > insertStatement.length()) { - buffer.append(","); + /** + * Adds the information for an new entry in the article index. 
+ * + * @param currentArticleID + * ID of the currently used article + * @param startTime + * First date of appearance + * @param endTime + * Last date of appearance + * @param infoList + * List of revision blocks + */ + public void add(final int currentArticleID, final long startTime, final long endTime, + final List infoList) + { + + // index_articleID_rc_ts + if (!infoList.isEmpty()) { + + StringBuilder fullRevBuffer = new StringBuilder(); + StringBuilder revCountBuffer = new StringBuilder(); + + boolean first = true; + ArticleIndexData info; + while (!infoList.isEmpty()) { + + info = infoList.remove(0); + + if (!first) { + fullRevBuffer.append(" "); + revCountBuffer.append(" "); + } + + fullRevBuffer.append(info.getFullRevisionPrimaryKey()); + + revCountBuffer.append(info.getStartRevisionCount()); + revCountBuffer.append(" "); + revCountBuffer.append(info.getEndRevisionCount()); + + first = false; + } + + boolean sql = !insertStatement.isEmpty(); + if (buffer.length() + fullRevBuffer.length() + revCountBuffer.length() + + 20 >= MAX_ALLOWED_PACKET) { + storeBuffer(); + } + + if (sql) { + if (buffer.length() > insertStatement.length()) { + buffer.append(","); + } + buffer.append("("); + } + buffer.append(currentArticleID); + buffer.append(","); + buffer.append(sql ? "\'" : "\""); + buffer.append(fullRevBuffer); + buffer.append(sql ? "\'" : "\""); + buffer.append(","); + buffer.append(sql ? "\'" : "\""); + buffer.append(revCountBuffer); + buffer.append(sql ? "\'" : "\""); + buffer.append(","); + buffer.append(startTime); + buffer.append(","); + buffer.append(endTime); + if (sql) { + buffer.append(")"); + } + else { + buffer.append("\n"); + } } - buffer.append("("); - } - buffer.append(currentArticleID); - buffer.append(","); - buffer.append(sql ? "\'" : "\""); - buffer.append(fullRevBuffer); - buffer.append(sql ? "\'" : "\""); - buffer.append(","); - buffer.append(sql ? "\'" : "\""); - buffer.append(revCountBuffer); - buffer.append(sql ? "\'" : "\""); - buffer.append(","); - buffer.append(startTime); - buffer.append(","); - buffer.append(endTime); - if (sql) { - buffer.append(")"); - } else { - buffer.append("\n"); - } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndexData.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndexData.java index 7aaa61c3..878cd406 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndexData.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ArticleIndexData.java @@ -18,100 +18,113 @@ package org.dkpro.jwpl.revisionmachine.index.indices; /** - * This class represents the data used by the ArticleIndex. One objects - * represents one revision block. + * This class represents the data used by the ArticleIndex. One objects represents one revision + * block. 
*/ -public class ArticleIndexData { +public class ArticleIndexData +{ - /** - * Last number of a block of revisions - */ - private long endRevisionCount; + /** + * Last number of a block of revisions + */ + private long endRevisionCount; - /** - * ID of the full revision - */ - private long fullRevisionID; + /** + * ID of the full revision + */ + private long fullRevisionID; - /** - * PK of the full revision - */ - private long fullRevisionPrimaryKey; + /** + * PK of the full revision + */ + private long fullRevisionPrimaryKey; - /** - * First number of a block of revisions - */ - private long startRevisionCount; + /** + * First number of a block of revisions + */ + private long startRevisionCount; - /** - * Returns the last revision counter of this block. - * - * @return revision counter - */ - public long getEndRevisionCount() { - return endRevisionCount; - } + /** + * Returns the last revision counter of this block. + * + * @return revision counter + */ + public long getEndRevisionCount() + { + return endRevisionCount; + } - /** - * Returns the ID of the full revision. - * - * @return ID of the full revision - */ - public long getFullRevisionID() { - return fullRevisionID; - } + /** + * Returns the ID of the full revision. + * + * @return ID of the full revision + */ + public long getFullRevisionID() + { + return fullRevisionID; + } - /** - * Returns the PK of the full revision. - * - * @return PK of the full revision - */ - public long getFullRevisionPrimaryKey() { - return fullRevisionPrimaryKey; - } + /** + * Returns the PK of the full revision. + * + * @return PK of the full revision + */ + public long getFullRevisionPrimaryKey() + { + return fullRevisionPrimaryKey; + } - /** - * Returns the first revision counter of this block. - * - * @return revision counter - */ - public long getStartRevisionCount() { - return startRevisionCount; - } + /** + * Returns the first revision counter of this block. + * + * @return revision counter + */ + public long getStartRevisionCount() + { + return startRevisionCount; + } - /** - * Sets the last revision counter of this block. - * - * @param endRevisionCount revision counter - */ - public void setEndRevisionCount(final long endRevisionCount) { - this.endRevisionCount = endRevisionCount; - } + /** + * Sets the last revision counter of this block. + * + * @param endRevisionCount + * revision counter + */ + public void setEndRevisionCount(final long endRevisionCount) + { + this.endRevisionCount = endRevisionCount; + } - /** - * Sets the ID of the full revision. - * - * @param fullRevisionID ID of the full revision - */ - public void setFullRevisionID(final long fullRevisionID) { - this.fullRevisionID = fullRevisionID; - } + /** + * Sets the ID of the full revision. + * + * @param fullRevisionID + * ID of the full revision + */ + public void setFullRevisionID(final long fullRevisionID) + { + this.fullRevisionID = fullRevisionID; + } - /** - * Sets the PK of the full revision. - * - * @param fullRevisionPrimaryKey PK of the full revision - */ - public void setFullRevisionPrimaryKey(final long fullRevisionPrimaryKey) { - this.fullRevisionPrimaryKey = fullRevisionPrimaryKey; - } + /** + * Sets the PK of the full revision. + * + * @param fullRevisionPrimaryKey + * PK of the full revision + */ + public void setFullRevisionPrimaryKey(final long fullRevisionPrimaryKey) + { + this.fullRevisionPrimaryKey = fullRevisionPrimaryKey; + } - /** - * Sets the first revision counter of this block. 
- * - * @param startRevisionCount revision counter - */ - public void setStartRevisionCount(final long startRevisionCount) { - this.startRevisionCount = startRevisionCount; - } + /** + * Sets the first revision counter of this block. + * + * @param startRevisionCount + * revision counter + */ + public void setStartRevisionCount(final long startRevisionCount) + { + this.startRevisionCount = startRevisionCount; + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndex.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndex.java index b79131c2..c351ef55 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndex.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndex.java @@ -24,149 +24,158 @@ /** * Index for the correct chronological order of revisions. */ -public class ChronoIndex extends AbstractIndex { +public class ChronoIndex + extends AbstractIndex +{ - /** - * ID of the last procesed article - */ - private int articleID; + /** + * ID of the last procesed article + */ + private int articleID; - /** - * List of ChonoInfo's - */ - private List list; + /** + * List of ChonoInfo's + */ + private List list; + /** + * Creates a new ChronoIndex object. + */ + public ChronoIndex() + { - /** - * Creates a new ChronoIndex object. - */ - public ChronoIndex() { + super(); - super(); - - this.list = null; - } - - /** - * Creates a new ChronoIndex object. - * - * @param MAX_ALLOWED_PACKET MAX_ALLOWED_PACKET - */ - public ChronoIndex(final long MAX_ALLOWED_PACKET) { - - super("INSERT INTO index_chronological VALUES ", MAX_ALLOWED_PACKET); + this.list = null; + } - this.list = null; - } + /** + * Creates a new ChronoIndex object. + * + * @param MAX_ALLOWED_PACKET + * MAX_ALLOWED_PACKET + */ + public ChronoIndex(final long MAX_ALLOWED_PACKET) + { - /** - * Adds the information for an new entry in the chrono index. - * - * @param articleID ID of the article - * @param revisionCounter Revision counter - * @param timestamp Timestamp - */ - public void add(final int articleID, final int revisionCounter, - final long timestamp) { + super("INSERT INTO index_chronological VALUES ", MAX_ALLOWED_PACKET); - if (this.articleID != articleID) { + this.list = null; + } - if (list != null) { - addToBuffer(); - } + /** + * Adds the information for an new entry in the chrono index. + * + * @param articleID + * ID of the article + * @param revisionCounter + * Revision counter + * @param timestamp + * Timestamp + */ + public void add(final int articleID, final int revisionCounter, final long timestamp) + { + + if (this.articleID != articleID) { + + if (list != null) { + addToBuffer(); + } + + this.articleID = articleID; + this.list = new ArrayList<>(); + } - this.articleID = articleID; - this.list = new ArrayList<>(); + this.list.add(new ChronoIndexData(timestamp, revisionCounter)); } - this.list.add(new ChronoIndexData(timestamp, revisionCounter)); - } + /** + * Creates the mapping and the reverse mapping. The generated information will be added to the + * query buffer. This list will be cleared afterwards. + */ + private void addToBuffer() + { - /** - * Creates the mapping and the reverse mapping. The generated information - * will be added to the query buffer. This list will be cleared afterwards. 
- */ - private void addToBuffer() { + if (list != null && !list.isEmpty()) { - if (list != null && !list.isEmpty()) { + ChronoIndexData info; - ChronoIndexData info; + // Real index in revision history mapped to RevisionCounter + // Sorted by real index (time) in ascending order + Collections.sort(list); - // Real index in revision history mapped to RevisionCounter - // Sorted by real index (time) in ascending order - Collections.sort(list); + StringBuilder reverseMapping = new StringBuilder(); - StringBuilder reverseMapping = new StringBuilder(); + int size = list.size(); + for (int i = 1; i <= size; i++) { - int size = list.size(); - for (int i = 1; i <= size; i++) { + info = list.get(i - 1); + if (info.getRevisionCounter() != i) { - info = list.get(i - 1); - if (info.getRevisionCounter() != i) { + if (reverseMapping.length() > 0) { + reverseMapping.append(" "); + } - if (reverseMapping.length() > 0) { - reverseMapping.append(" "); - } + reverseMapping.append(i); + reverseMapping.append(" "); + reverseMapping.append(info.getRevisionCounter()); + } - reverseMapping.append(i); - reverseMapping.append(" "); - reverseMapping.append(info.getRevisionCounter()); - } + info.setIndex(i); + info.setSortFlag(false); + } - info.setIndex(i); - info.setSortFlag(false); - } + // RevisionCounter mapped to real index in revision history + // Sorted by revisionCounters in ascending order + Collections.sort(list); + StringBuilder mapping = new StringBuilder(); - // RevisionCounter mapped to real index in revision history - // Sorted by revisionCounters in ascending order - Collections.sort(list); - StringBuilder mapping = new StringBuilder(); + while (!list.isEmpty()) { - while (!list.isEmpty()) { + info = list.remove(0); + if (info.getRevisionCounter() != info.getIndex()) { - info = list.remove(0); - if (info.getRevisionCounter() != info.getIndex()) { + if (mapping.length() > 0) { + mapping.append(" "); + } - if (mapping.length() > 0) { - mapping.append(" "); - } + mapping.append(info.getRevisionCounter()); + mapping.append(" "); + mapping.append(info.getIndex()); + } + } - mapping.append(info.getRevisionCounter()); - mapping.append(" "); - mapping.append(info.getIndex()); - } - } - - if (mapping.length() > 0) { + if (mapping.length() > 0) { - boolean sql = !insertStatement.isEmpty(); - String val = (sql ? "(" : "") + articleID + (sql ? ",'" : ",\"") + mapping - + (sql ? "','" : "\",\"") + reverseMapping + (sql ? "')" : "\""); + boolean sql = !insertStatement.isEmpty(); + String val = (sql ? "(" : "") + articleID + (sql ? ",'" : ",\"") + mapping + + (sql ? "','" : "\",\"") + reverseMapping + (sql ? "')" : "\""); - if (buffer.length() + val.length() >= MAX_ALLOWED_PACKET) { - storeBuffer(); - } + if (buffer.length() + val.length() >= MAX_ALLOWED_PACKET) { + storeBuffer(); + } - if (sql && buffer.length() > insertStatement.length()) { - buffer.append(","); - } + if (sql && buffer.length() > insertStatement.length()) { + buffer.append(","); + } - buffer.append(val); + buffer.append(val); - if (!sql) { - buffer.append("\n"); + if (!sql) { + buffer.append("\n"); + } + } } - } } - } - - /** - * Finalizes the query in the currently used buffer and creates a new one. - * The finalized query will be added to the list of queries. - */ - @Override - public void finalizeIndex() { - addToBuffer(); - storeBuffer(); - } + + /** + * Finalizes the query in the currently used buffer and creates a new one. The finalized query + * will be added to the list of queries. 
+ */ + @Override + public void finalizeIndex() + { + addToBuffer(); + storeBuffer(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndexData.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndexData.java index 132c4917..e9bcaa35 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndexData.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/ChronoIndexData.java @@ -20,124 +20,139 @@ /** * This class represents the data used by the ChronoIndex. */ -public class ChronoIndexData implements Comparable { - - /** - * Flag - whether the data should be sorted chronological or in order of the - * revision counter - */ - private boolean chronoSort; - - /** - * Index value (Chronological order position) - */ - private int index; - - /** - * Revision counter - */ - private final int revisionCounter; - - /** - * Timestamp value - */ - private final long time; - - /** - * Creates a new ChronoInfo object. - * - * @param time Timestamp value - * @param revisionCounter RevisionCounter - */ - public ChronoIndexData(final long time, final int revisionCounter) { - this.time = time; - this.revisionCounter = revisionCounter; - this.chronoSort = true; - } - - /** - * Compares this ChronoInfo to the given info. - * - * @return a negative integer, zero, or a positive integer as this object is - * less than, equal to, or greater than the specified object. - */ - @Override - public int compareTo(final ChronoIndexData info) { - - long value; - - if (chronoSort) { - value = this.time - info.time; - } else { - value = this.revisionCounter - info.revisionCounter; +public class ChronoIndexData + implements Comparable +{ + + /** + * Flag - whether the data should be sorted chronological or in order of the revision counter + */ + private boolean chronoSort; + + /** + * Index value (Chronological order position) + */ + private int index; + + /** + * Revision counter + */ + private final int revisionCounter; + + /** + * Timestamp value + */ + private final long time; + + /** + * Creates a new ChronoInfo object. + * + * @param time + * Timestamp value + * @param revisionCounter + * RevisionCounter + */ + public ChronoIndexData(final long time, final int revisionCounter) + { + this.time = time; + this.revisionCounter = revisionCounter; + this.chronoSort = true; } - if (value == 0) { - return 0; - } else if (value > 0) { - return 1; - } else { - return -1; + /** + * Compares this ChronoInfo to the given info. + * + * @return a negative integer, zero, or a positive integer as this object is less than, equal + * to, or greater than the specified object. + */ + @Override + public int compareTo(final ChronoIndexData info) + { + + long value; + + if (chronoSort) { + value = this.time - info.time; + } + else { + value = this.revisionCounter - info.revisionCounter; + } + + if (value == 0) { + return 0; + } + else if (value > 0) { + return 1; + } + else { + return -1; + } } - } - - @Override - public boolean equals(Object obj) { - if (this == obj) { - return true; + + @Override + public boolean equals(Object obj) + { + if (this == obj) { + return true; + } + if (obj == null) { + return false; + } + if (getClass() != obj.getClass()) { + return false; + } + return (this != (ChronoIndexData) obj) ? false : true; } - if (obj == null) { - return false; + + /** + * Returns the index value. 
+ * + * @return index value + */ + public int getIndex() + { + return this.index; + } + + /** + * Returns the revision counter. + * + * @return revision counter + */ + public int getRevisionCounter() + { + return revisionCounter; + } + + /** + * Returns the timestamp value. + * + * @return timestamp value + */ + public long getTime() + { + return time; + } + + /** + * Sets the index value. + * + * @param index + * index value + */ + public void setIndex(final int index) + { + this.index = index; } - if (getClass() != obj.getClass()) { - return false; + + /** + * Sets the sort flag. + * + * @param chronoSort + * TRUE for chronological sorting, FALSE for revision counter sorting + */ + public void setSortFlag(final boolean chronoSort) + { + this.chronoSort = chronoSort; } - return (this != (ChronoIndexData) obj) ? false : true; - } - - /** - * Returns the index value. - * - * @return index value - */ - public int getIndex() { - return this.index; - } - - /** - * Returns the revision counter. - * - * @return revision counter - */ - public int getRevisionCounter() { - return revisionCounter; - } - - /** - * Returns the timestamp value. - * - * @return timestamp value - */ - public long getTime() { - return time; - } - - /** - * Sets the index value. - * - * @param index index value - */ - public void setIndex(final int index) { - this.index = index; - } - - /** - * Sets the sort flag. - * - * @param chronoSort TRUE for chronological sorting, FALSE for revision counter - * sorting - */ - public void setSortFlag(final boolean chronoSort) { - this.chronoSort = chronoSort; - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/RevisionIndex.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/RevisionIndex.java index 2ff6708a..1748b754 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/RevisionIndex.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/indices/RevisionIndex.java @@ -20,49 +20,58 @@ /** * Index for revision information. */ -public class RevisionIndex extends AbstractIndex { +public class RevisionIndex + extends AbstractIndex +{ - /** - * Creates a new RevisionIndex object. - */ - public RevisionIndex() { - super(); - } + /** + * Creates a new RevisionIndex object. + */ + public RevisionIndex() + { + super(); + } - /** - * Creates a new RevisionIndex object. - * - * @param MAX_ALLOWED_PACKET MAX_ALLOWED_PACKET - */ - public RevisionIndex(final long MAX_ALLOWED_PACKET) { + /** + * Creates a new RevisionIndex object. + * + * @param MAX_ALLOWED_PACKET + * MAX_ALLOWED_PACKET + */ + public RevisionIndex(final long MAX_ALLOWED_PACKET) + { - super("INSERT INTO index_revisionID VALUES ", MAX_ALLOWED_PACKET); - } + super("INSERT INTO index_revisionID VALUES ", MAX_ALLOWED_PACKET); + } - /** - * Adds the information for an new entry in the revision index. - * - * @param revisionID ID of the revision - * @param revisionPrimaryKey PK of the revison - * @param fullRevisionPrimaryKey PK of the related full revison - */ - public void add(final int revisionID, final long revisionPrimaryKey, - final long fullRevisionPrimaryKey) { + /** + * Adds the information for an new entry in the revision index. 
+ * + * @param revisionID + * ID of the revision + * @param revisionPrimaryKey + * PK of the revison + * @param fullRevisionPrimaryKey + * PK of the related full revison + */ + public void add(final int revisionID, final long revisionPrimaryKey, + final long fullRevisionPrimaryKey) + { - boolean sql = !insertStatement.isEmpty(); - if (sql && buffer.length() != insertStatement.length()) { - this.buffer.append(","); - } + boolean sql = !insertStatement.isEmpty(); + if (sql && buffer.length() != insertStatement.length()) { + this.buffer.append(","); + } - this.buffer.append((sql ? "(" : "") + revisionID + "," + revisionPrimaryKey + "," - + fullRevisionPrimaryKey + (sql ? ")" : "")); + this.buffer.append((sql ? "(" : "") + revisionID + "," + revisionPrimaryKey + "," + + fullRevisionPrimaryKey + (sql ? ")" : "")); - if (!sql) { - buffer.append("\n"); - } + if (!sql) { + buffer.append("\n"); + } - if (buffer.length() + 100 >= MAX_ALLOWED_PACKET) { - storeBuffer(); + if (buffer.length() + 100 >= MAX_ALLOWED_PACKET) { + storeBuffer(); + } } - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DataFileWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DataFileWriter.java index e023a320..b491977d 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DataFileWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DataFileWriter.java @@ -32,84 +32,100 @@ /** * This class writes the output of the index generator to an SQL file. */ -public class DataFileWriter implements IndexWriterInterface { +public class DataFileWriter + implements IndexWriterInterface +{ - /** - * Reference to the Writer object - */ - private final Writer chronoIdxWriter; - private final Writer revisionIdxWriter; - private final Writer articleIdxWriter; + /** + * Reference to the Writer object + */ + private final Writer chronoIdxWriter; + private final Writer revisionIdxWriter; + private final Writer articleIdxWriter; - /** - * Creates a new SQLFileWriter. - * - * @param config Reference to the configuration parameters - * @throws IOException if an error occurred while writing the file - */ - public DataFileWriter(final RevisionAPIConfiguration config) throws IOException { + /** + * Creates a new SQLFileWriter. + * + * @param config + * Reference to the configuration parameters + * @throws IOException + * if an error occurred while writing the file + */ + public DataFileWriter(final RevisionAPIConfiguration config) throws IOException + { - File path = new File(config.getOutputPath()); - chronoIdxWriter = new BufferedWriter(new FileWriter(new File(path, "chronoIndex.csv"))); - revisionIdxWriter = new BufferedWriter(new FileWriter(new File(path, "revisionIndex.csv"))); - articleIdxWriter = new BufferedWriter(new FileWriter(new File(path, "articleIndex.csv"))); - } + File path = new File(config.getOutputPath()); + chronoIdxWriter = new BufferedWriter(new FileWriter(new File(path, "chronoIndex.csv"))); + revisionIdxWriter = new BufferedWriter(new FileWriter(new File(path, "revisionIndex.csv"))); + articleIdxWriter = new BufferedWriter(new FileWriter(new File(path, "articleIndex.csv"))); + } - /** - * Writes the buffered finalized queries to the output. 
- * - * @param index Reference to an index - * @throws IOException if an error occurred while writing the output - */ - @Override - public void write(final AbstractIndex index) throws IOException { + /** + * Writes the buffered finalized queries to the output. + * + * @param index + * Reference to an index + * @throws IOException + * if an error occurred while writing the output + */ + @Override + public void write(final AbstractIndex index) throws IOException + { - StringBuilder cmd; + StringBuilder cmd; - while (index.size() > 0) { - System.out.println("Transmit Index [" + index + "]"); - cmd = index.remove(); - if (index instanceof ArticleIndex) { - articleIdxWriter.write(cmd.toString()); - } else if (index instanceof ChronoIndex) { - chronoIdxWriter.write(cmd.toString()); - } else if (index instanceof RevisionIndex) { - revisionIdxWriter.write(cmd.toString()); - } + while (index.size() > 0) { + System.out.println("Transmit Index [" + index + "]"); + cmd = index.remove(); + if (index instanceof ArticleIndex) { + articleIdxWriter.write(cmd.toString()); + } + else if (index instanceof ChronoIndex) { + chronoIdxWriter.write(cmd.toString()); + } + else if (index instanceof RevisionIndex) { + revisionIdxWriter.write(cmd.toString()); + } - } + } - if (index instanceof ArticleIndex) { - articleIdxWriter.flush(); - } else if (index instanceof ChronoIndex) { - chronoIdxWriter.flush(); - } else if (index instanceof RevisionIndex) { - revisionIdxWriter.flush(); + if (index instanceof ArticleIndex) { + articleIdxWriter.flush(); + } + else if (index instanceof ChronoIndex) { + chronoIdxWriter.flush(); + } + else if (index instanceof RevisionIndex) { + revisionIdxWriter.flush(); + } } - } - /** - * Closes the file or the database connection. - * - * @throws IOException if an error occurred while closing the file - */ - @Override - public void close() throws IOException { - articleIdxWriter.close(); - chronoIdxWriter.close(); - revisionIdxWriter.close(); - } + /** + * Closes the file or the database connection. + * + * @throws IOException + * if an error occurred while closing the file + */ + @Override + public void close() throws IOException + { + articleIdxWriter.close(); + chronoIdxWriter.close(); + revisionIdxWriter.close(); + } - /** - * Wraps up the index generation process and writes all remaining statements - * e.g. concerning UNCOMPRESSED-Indexes on the created tables. - * - * @throws IOException if an error occurred while writing to the file - */ - @Override - public void finish() throws IOException { - articleIdxWriter.flush(); - chronoIdxWriter.flush(); - revisionIdxWriter.flush(); - } + /** + * Wraps up the index generation process and writes all remaining statements e.g. concerning + * UNCOMPRESSED-Indexes on the created tables. 
+ * + * @throws IOException + * if an error occurred while writing to the file + */ + @Override + public void finish() throws IOException + { + articleIdxWriter.flush(); + chronoIdxWriter.flush(); + revisionIdxWriter.flush(); + } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DatabaseWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DatabaseWriter.java index abf18ee5..ee190c80 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DatabaseWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/DatabaseWriter.java @@ -28,127 +28,131 @@ /** * This class writes the output of the index generator to a database. */ -public class DatabaseWriter implements IndexWriterInterface { - - /** - * Reference to the database connection - */ - private final Connection connection; - - /** - * Creates a new DatabaseWriter. - * - * @param config Reference to the configuration parameters - * @throws ClassNotFoundException if the JDBC Driver could not be located - * @throws SQLException if an error occurred while creating the index tables - */ - public DatabaseWriter(final RevisionAPIConfiguration config) - throws ClassNotFoundException, SQLException { - - String driverDB = "com.mysql.jdbc.Driver"; - Class.forName(driverDB); - - this.connection = DriverManager - .getConnection("jdbc:mysql://" + config.getHost() + "/" - + config.getDatabase(), config.getUser(), - config.getPassword()); - - Statement statement = connection.createStatement(); - statement.execute("CREATE TABLE index_articleID_rc_ts (" - + "ArticleID INTEGER UNSIGNED NOT NULL, " - + "FullRevisionPKs MEDIUMTEXT NOT NULL, " - + "RevisionCounter MEDIUMTEXT NOT NULL, " - + "FirstAppearance BIGINT NOT NULL, " - + "LastAppearance BIGINT NOT NULL, " - + "PRIMARY KEY(ArticleID));"); - statement.close(); - - statement = connection.createStatement(); - statement.execute("CREATE TABLE index_revisionID (" - + "RevisionID INTEGER UNSIGNED NOT NULL, " - + "RevisionPK INTEGER UNSIGNED NOT NULL, " - + "FullRevisionPK INTEGER UNSIGNED NOT NULL, " - + "PRIMARY KEY(RevisionID));"); - statement.close(); - - statement = connection.createStatement(); - statement.execute("CREATE TABLE index_chronological (" - + "ArticleID INTEGER UNSIGNED NOT NULL, " - + "Mapping MEDIUMTEXT NOT NULL, " - + "ReverseMapping MEDIUMTEXT NOT NULL, " - + "PRIMARY KEY(ArticleID));"); - statement.close(); - - //disable keys now - reenable after inserts - - statement = connection.createStatement(); - statement.execute("ALTER TABLE index_articleID_rc_ts DISABLE KEYS;"); - statement.close(); - statement = connection.createStatement(); - statement.execute("ALTER TABLE index_revisionID DISABLE KEYS;"); - statement.close(); - - statement = connection.createStatement(); - statement.execute("ALTER TABLE index_chronological DISABLE KEYS;"); - statement.close(); - } - - /** - * Writes the buffered finalized queries to the output. 
- * - * @param index Reference to an index - * @throws SQLException if an error occurred while transmitting the output - */ - @Override - public void write(final AbstractIndex index) - throws SQLException { - - Statement statement; - StringBuilder cmd; - - while (index.size() > 0) { - - System.out.println("Transmit Index [" + index + "]"); - - cmd = index.remove(); - // System.out.println(cmd.toString()); - - statement = connection.createStatement(); - statement.execute(cmd.toString()); - statement.close(); +public class DatabaseWriter + implements IndexWriterInterface +{ + + /** + * Reference to the database connection + */ + private final Connection connection; + + /** + * Creates a new DatabaseWriter. + * + * @param config + * Reference to the configuration parameters + * @throws ClassNotFoundException + * if the JDBC Driver could not be located + * @throws SQLException + * if an error occurred while creating the index tables + */ + public DatabaseWriter(final RevisionAPIConfiguration config) + throws ClassNotFoundException, SQLException + { + + String driverDB = "com.mysql.jdbc.Driver"; + Class.forName(driverDB); + + this.connection = DriverManager.getConnection( + "jdbc:mysql://" + config.getHost() + "/" + config.getDatabase(), config.getUser(), + config.getPassword()); + + Statement statement = connection.createStatement(); + statement.execute("CREATE TABLE index_articleID_rc_ts (" + + "ArticleID INTEGER UNSIGNED NOT NULL, " + "FullRevisionPKs MEDIUMTEXT NOT NULL, " + + "RevisionCounter MEDIUMTEXT NOT NULL, " + "FirstAppearance BIGINT NOT NULL, " + + "LastAppearance BIGINT NOT NULL, " + "PRIMARY KEY(ArticleID));"); + statement.close(); + + statement = connection.createStatement(); + statement.execute("CREATE TABLE index_revisionID (" + + "RevisionID INTEGER UNSIGNED NOT NULL, " + + "RevisionPK INTEGER UNSIGNED NOT NULL, " + + "FullRevisionPK INTEGER UNSIGNED NOT NULL, " + "PRIMARY KEY(RevisionID));"); + statement.close(); + + statement = connection.createStatement(); + statement.execute("CREATE TABLE index_chronological (" + + "ArticleID INTEGER UNSIGNED NOT NULL, " + "Mapping MEDIUMTEXT NOT NULL, " + + "ReverseMapping MEDIUMTEXT NOT NULL, " + "PRIMARY KEY(ArticleID));"); + statement.close(); + + // disable keys now - reenable after inserts + + statement = connection.createStatement(); + statement.execute("ALTER TABLE index_articleID_rc_ts DISABLE KEYS;"); + statement.close(); + statement = connection.createStatement(); + statement.execute("ALTER TABLE index_revisionID DISABLE KEYS;"); + statement.close(); + + statement = connection.createStatement(); + statement.execute("ALTER TABLE index_chronological DISABLE KEYS;"); + statement.close(); + } + + /** + * Writes the buffered finalized queries to the output. + * + * @param index + * Reference to an index + * @throws SQLException + * if an error occurred while transmitting the output + */ + @Override + public void write(final AbstractIndex index) throws SQLException + { + + Statement statement; + StringBuilder cmd; + + while (index.size() > 0) { + + System.out.println("Transmit Index [" + index + "]"); + + cmd = index.remove(); + // System.out.println(cmd.toString()); + + statement = connection.createStatement(); + statement.execute(cmd.toString()); + statement.close(); + } + } + + /** + * Wraps up the index generation process and writes all remaining statements e.g. concerning + * UNCOMPRESSED-Indexes on the created tables. 
+ * + * @throws SQLException + * if an error occurred while accessing the database + */ + @Override + public void finish() throws SQLException + { + Statement statement = connection.createStatement(); + statement.execute("CREATE INDEX articleIdx on revisions(ArticleID);"); + statement.close(); + statement = connection.createStatement(); + statement.execute("ALTER TABLE index_articleID_rc_ts ENABLE KEYS;"); + statement.close(); + statement = connection.createStatement(); + statement.execute("ALTER TABLE index_revisionID ENABLE KEYS;"); + statement.close(); + statement = connection.createStatement(); + statement.execute("ALTER TABLE index_chronological ENABLE KEYS;"); + statement.close(); + } + + /** + * Closes the file or the database connection. + * + * @throws SQLException + * if an error occurred while closing the database connection + */ + @Override + public void close() throws SQLException + { + this.connection.close(); } - } - - /** - * Wraps up the index generation process and writes all remaining statements - * e.g. concerning UNCOMPRESSED-Indexes on the created tables. - * - * @throws SQLException if an error occurred while accessing the database - */ - @Override - public void finish() throws SQLException { - Statement statement = connection.createStatement(); - statement.execute("CREATE INDEX articleIdx on revisions(ArticleID);"); - statement.close(); - statement = connection.createStatement(); - statement.execute("ALTER TABLE index_articleID_rc_ts ENABLE KEYS;"); - statement.close(); - statement = connection.createStatement(); - statement.execute("ALTER TABLE index_revisionID ENABLE KEYS;"); - statement.close(); - statement = connection.createStatement(); - statement.execute("ALTER TABLE index_chronological ENABLE KEYS;"); - statement.close(); - } - - /** - * Closes the file or the database connection. - * - * @throws SQLException if an error occurred while closing the database connection - */ - @Override - public void close() - throws SQLException { - this.connection.close(); - } } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/IndexWriterInterface.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/IndexWriterInterface.java index bcfadceb..4084bb66 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/IndexWriterInterface.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/IndexWriterInterface.java @@ -25,31 +25,39 @@ /** * Interface for the IndexWriter */ -public interface IndexWriterInterface { +public interface IndexWriterInterface +{ - /** - * Writes the buffered finalized queries to the output. - * - * @param index Reference to an index - * @throws IOException if an error occurred while writing the output - * @throws SQLException if an error occurred while transmitting the output - */ - void write(final AbstractIndex index) throws IOException, SQLException; + /** + * Writes the buffered finalized queries to the output. + * + * @param index + * Reference to an index + * @throws IOException + * if an error occurred while writing the output + * @throws SQLException + * if an error occurred while transmitting the output + */ + void write(final AbstractIndex index) throws IOException, SQLException; - /** - * Closes the file or the database connection. 
- * - * @throws IOException if an error occurred while closing the file - * @throws SQLException if an error occurred while closing the database connection - */ - void close() throws IOException, SQLException; + /** + * Closes the file or the database connection. + * + * @throws IOException + * if an error occurred while closing the file + * @throws SQLException + * if an error occurred while closing the database connection + */ + void close() throws IOException, SQLException; - /** - * Wraps up the index generation process and writes all remaining statements - * e.g. concerning UNCOMPRESSED-Indexes on the created tables. - * - * @throws SQLException if an error occurred while accessing the database - * @throws IOException if an error occurred while accessing the sql file - */ - void finish() throws IOException, SQLException; + /** + * Wraps up the index generation process and writes all remaining statements e.g. concerning + * UNCOMPRESSED-Indexes on the created tables. + * + * @throws SQLException + * if an error occurred while accessing the database + * @throws IOException + * if an error occurred while accessing the sql file + */ + void finish() throws IOException, SQLException; } diff --git a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/SQLFileWriter.java b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/SQLFileWriter.java index f0cc0410..1ea370c3 100644 --- a/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/SQLFileWriter.java +++ b/dkpro-jwpl-revisionmachine/src/main/java/org/dkpro/jwpl/revisionmachine/index/writer/SQLFileWriter.java @@ -29,97 +29,102 @@ /** * This class writes the output of the index generator to an sql file. */ -public class SQLFileWriter implements IndexWriterInterface { - - /** - * Reference to the Writer object - */ - private final Writer writer; - - /** - * Creates a new SQLFileWriter. - * - * @param config Reference to the configuration parameters - * @throws IOException if an error occurred while writing the file - */ - public SQLFileWriter(final RevisionAPIConfiguration config) - throws IOException { - - writer = new BufferedWriter(new FileWriter(new File(config.getOutputPath(), "revisionIndex.sql"))); - - writer.write("CREATE TABLE index_articleID_rc_ts (" - + "ArticleID INTEGER UNSIGNED NOT NULL, " - + "FullRevisionPKs MEDIUMTEXT NOT NULL, " - + "RevisionCounter MEDIUMTEXT NOT NULL, " - + "FirstAppearance BIGINT NOT NULL, " - + "LastAppearance BIGINT NOT NULL, " - + "PRIMARY KEY(ArticleID));"); - - writer.write("CREATE TABLE index_revisionID (" - + "RevisionID INTEGER UNSIGNED NOT NULL, " - + "RevisionPK INTEGER UNSIGNED NOT NULL, " - + "FullRevisionPK INTEGER UNSIGNED NOT NULL, " - + "PRIMARY KEY(RevisionID));"); - - writer.write("CREATE TABLE index_chronological (" - + "ArticleID INTEGER UNSIGNED NOT NULL, " - + "Mapping MEDIUMTEXT NOT NULL, " - + "ReverseMapping MEDIUMTEXT NOT NULL, " - + "PRIMARY KEY(ArticleID));"); - writer.write("\r\n"); - - //disable keys now - reenable at the end of the sql file - writer.write("ALTER TABLE index_articleID_rc_ts DISABLE KEYS;\r\n"); - writer.write("ALTER TABLE index_revisionID DISABLE KEYS;\r\n"); - writer.write("ALTER TABLE index_chronological DISABLE KEYS;\r\n"); - - writer.flush(); - } - - /** - * Writes the buffered finalized queries to the output. 
- * - * @param index Reference to an index - * @throws IOException if an error occurred while writing the output - */ - @Override - public void write(final AbstractIndex index) throws IOException { - - StringBuilder cmd; - while (index.size() > 0) { - System.out.println("Transmit Index [" + index + "]"); - cmd = index.remove(); - cmd.append("\r\n"); - writer.write(cmd.toString()); +public class SQLFileWriter + implements IndexWriterInterface +{ + + /** + * Reference to the Writer object + */ + private final Writer writer; + + /** + * Creates a new SQLFileWriter. + * + * @param config + * Reference to the configuration parameters + * @throws IOException + * if an error occurred while writing the file + */ + public SQLFileWriter(final RevisionAPIConfiguration config) throws IOException + { + + writer = new BufferedWriter( + new FileWriter(new File(config.getOutputPath(), "revisionIndex.sql"))); + + writer.write("CREATE TABLE index_articleID_rc_ts (" + + "ArticleID INTEGER UNSIGNED NOT NULL, " + "FullRevisionPKs MEDIUMTEXT NOT NULL, " + + "RevisionCounter MEDIUMTEXT NOT NULL, " + "FirstAppearance BIGINT NOT NULL, " + + "LastAppearance BIGINT NOT NULL, " + "PRIMARY KEY(ArticleID));"); + + writer.write("CREATE TABLE index_revisionID (" + "RevisionID INTEGER UNSIGNED NOT NULL, " + + "RevisionPK INTEGER UNSIGNED NOT NULL, " + + "FullRevisionPK INTEGER UNSIGNED NOT NULL, " + "PRIMARY KEY(RevisionID));"); + + writer.write("CREATE TABLE index_chronological (" + "ArticleID INTEGER UNSIGNED NOT NULL, " + + "Mapping MEDIUMTEXT NOT NULL, " + "ReverseMapping MEDIUMTEXT NOT NULL, " + + "PRIMARY KEY(ArticleID));"); + writer.write("\r\n"); + + // disable keys now - reenable at the end of the sql file + writer.write("ALTER TABLE index_articleID_rc_ts DISABLE KEYS;\r\n"); + writer.write("ALTER TABLE index_revisionID DISABLE KEYS;\r\n"); + writer.write("ALTER TABLE index_chronological DISABLE KEYS;\r\n"); + + writer.flush(); } - writer.flush(); - } - - /** - * Closes the file or the database connection. - * - * @throws IOException if an error occurred while closing the file - */ - @Override - public void close() throws IOException { - this.writer.close(); - } - - /** - * Wraps up the index generation process and writes all remaining statements - * e.g. concerning UNCOMPRESSED-Indexes on the created tables. - * - * @throws IOException if an error occurred while writing to the file - */ - @Override - public void finish() throws IOException { - - writer.write("CREATE INDEX articleIdx ON revisions(ArticleID);\r\n"); - writer.write("ALTER TABLE index_articleID_rc_ts ENABLE KEYS;\r\n"); - writer.write("ALTER TABLE index_revisionID ENABLE KEYS;\r\n"); - writer.write("ALTER TABLE index_chronological ENABLE KEYS;\r\n"); - writer.flush(); - - } + /** + * Writes the buffered finalized queries to the output. + * + * @param index + * Reference to an index + * @throws IOException + * if an error occurred while writing the output + */ + @Override + public void write(final AbstractIndex index) throws IOException + { + + StringBuilder cmd; + while (index.size() > 0) { + System.out.println("Transmit Index [" + index + "]"); + cmd = index.remove(); + cmd.append("\r\n"); + writer.write(cmd.toString()); + } + + writer.flush(); + } + + /** + * Closes the file or the database connection. 
+ * + * @throws IOException + * if an error occurred while closing the file + */ + @Override + public void close() throws IOException + { + this.writer.close(); + } + + /** + * Wraps up the index generation process and writes all remaining statements e.g. concerning + * UNCOMPRESSED-Indexes on the created tables. + * + * @throws IOException + * if an error occurred while writing to the file + */ + @Override + public void finish() throws IOException + { + + writer.write("CREATE INDEX articleIdx ON revisions(ArticleID);\r\n"); + writer.write("ALTER TABLE index_articleID_rc_ts ENABLE KEYS;\r\n"); + writer.write("ALTER TABLE index_revisionID ENABLE KEYS;\r\n"); + writer.write("ALTER TABLE index_chronological ENABLE KEYS;\r\n"); + writer.flush(); + + } } diff --git a/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/BaseJWPLTest.java b/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/BaseJWPLTest.java index 5c4f89da..e1be06b7 100644 --- a/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/BaseJWPLTest.java +++ b/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/BaseJWPLTest.java @@ -22,20 +22,21 @@ import org.dkpro.jwpl.api.Wikipedia; /** - * Simple test base class to inject the same hsqldb test context into every test - * class to avoid duplicated code and efforts. Also shuts down the - * hibernate/hsqldb context properly. + * Simple test base class to inject the same hsqldb test context into every test class to avoid + * duplicated code and efforts. Also shuts down the hibernate/hsqldb context properly. * * @author mawiesne */ -public abstract class BaseJWPLTest { +public abstract class BaseJWPLTest +{ - protected static Wikipedia wiki; + protected static Wikipedia wiki; - protected static final DatabaseConfiguration obtainHSDLDBConfiguration(String databaseName, Language language) { - return new DatabaseConfiguration("org.hsqldb.jdbcDriver", - "jdbc:hsqldb:file:./src/test/resources/db/"+databaseName, - "localhost", databaseName, "sa", - "", language); - } + protected static final DatabaseConfiguration obtainHSDLDBConfiguration(String databaseName, + Language language) + { + return new DatabaseConfiguration("org.hsqldb.jdbcDriver", + "jdbc:hsqldb:file:./src/test/resources/db/" + databaseName, "localhost", + databaseName, "sa", "", language); + } } diff --git a/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionApiTest.java b/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionApiTest.java index 436e6a5f..4e476883 100644 --- a/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionApiTest.java +++ b/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionApiTest.java @@ -40,208 +40,224 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; -public class RevisionApiTest extends BaseJWPLTest { - - private static Wikipedia wiki = null; - - private Timestamp convertToUTC(Timestamp ts) { - - final LocalDateTime dt = LocalDateTime.ofInstant(ts.toInstant(), ZoneOffset.UTC); - return Timestamp.valueOf(dt); - } - - // The object under test - private RevisionApi revisionApi; - - /** - * Made this static so that following tests don't run if assumption fails. 
- * (With AT_Before, tests also would not be executed but marked as passed) - * This could be changed back as soon as JUnit ignored tests after failed - * assumptions - */ - @BeforeAll - public static void setupWikipedia() { - DatabaseConfiguration db = obtainHSDLDBConfiguration( - "wikiapi_simple_20090119_stripped", Language.simple_english); - try { - wiki = new Wikipedia(db); - } catch (Exception e) { - fail("Wikipedia could not be initialized: " + e.getLocalizedMessage()); - } - assertNotNull(wiki); - } - - @BeforeEach - public void setupInstanceUnderTest() { - try { - revisionApi = new RevisionApi(wiki.getDatabaseConfiguration()); - assertNotNull(revisionApi); - } catch (WikiApiException e) { - fail("RevisionApi could not be initialized: " + e.getLocalizedMessage()); - } - } - - @AfterEach - public void cleanUpInstanceUnderTest() { - if(revisionApi!=null) { - try { - revisionApi.close(); - } catch (SQLException e) { - fail("RevisionApi could not be shut down correctly: " + e.getLocalizedMessage()); - } - } - } - - @Test - public void getRevisionByTimestampTest() { - Calendar calendar = Calendar.getInstance(); - calendar.set(2008, 10, 10, 10, 10, 10); - - try { - - int pageId = wiki.getPage("Car").getPageId(); - - Timestamp timestamp = new Timestamp(calendar.getTimeInMillis()); - Revision revision = revisionApi.getRevision(pageId, timestamp); - - assertEquals(1142935, revision.getRevisionID()); - assertEquals(0, revision.getFullRevisionID()); - assertEquals(349, revision.getRevisionCounter()); - assertEquals(pageId, revision.getArticleID()); - } - catch (WikiApiException e) { - fail(e.getMessage(), e); - } - } - - @Test - public void getRevisionByRevisionId() { - Calendar calendar = Calendar.getInstance(); - calendar.set(2008, 10, 10, 10, 10, 10); - - try { - int pageId = wiki.getPage("Car").getPageId(); - - Timestamp timestamp = new Timestamp(calendar.getTimeInMillis()); - - Revision revision1 = revisionApi.getRevision(1142935); - Revision revision2 = revisionApi.getRevision(pageId, timestamp); - - assertEquals(1142935, revision1.getRevisionID()); - assertEquals(0, revision1.getFullRevisionID()); - assertEquals(349, revision1.getRevisionCounter()); - - assertEquals(revision1.getRevisionID(), revision2.getRevisionID()); - assertEquals(revision1.getFullRevisionID(), revision2.getFullRevisionID()); - assertEquals(revision1.getRevisionCounter(), revision2.getRevisionCounter()); - assertEquals(revision1.getArticleID(), revision2.getArticleID()); - - } - catch (WikiApiException e) { - fail(e.getMessage(), e); - } - } - - @Test - public void getRevisionByRevisionCounter() { - Calendar calendar = Calendar.getInstance(); - calendar.set(2008, 10, 10, 10, 10, 10); - - try { - int pageId = wiki.getPage("Car").getPageId(); - - Timestamp timestamp = new Timestamp(calendar.getTimeInMillis()); - - Revision revision1 = revisionApi.getRevision(pageId, 349); - Revision revision2 = revisionApi.getRevision(pageId, timestamp); - - assertEquals(1142935, revision1.getRevisionID()); - assertEquals(0, revision1.getFullRevisionID()); - assertEquals(349, revision1.getRevisionCounter()); - - assertEquals(revision1.getRevisionID(), revision2.getRevisionID()); - assertEquals(revision1.getFullRevisionID(), revision2.getFullRevisionID()); - assertEquals(revision1.getRevisionCounter(), revision2.getRevisionCounter()); - assertEquals(revision1.getArticleID(), revision2.getArticleID()); - - } - catch (WikiApiException e) { - fail(e.getMessage(), e); - } - } - - @Test - public void articleIDTests() { - Calendar calendar = 
Calendar.getInstance(); - calendar.set(2008, 10, 10, 10, 10, 10); - - try { - int pageId = wiki.getPage("Car").getPageId(); - - Timestamp firstDayOfAppearance = convertToUTC(revisionApi.getFirstDateOfAppearance(pageId)); - Timestamp lastDayOfAppearance = convertToUTC(revisionApi.getLastDateOfAppearance(pageId)); - int nrOfRevisions = revisionApi.getNumberOfRevisions(pageId); - - assertEquals("2004-04-07 00:31:34.0", firstDayOfAppearance.toString()); - assertEquals("2009-01-19 03:58:09.0", lastDayOfAppearance.toString()); - assertEquals(382, nrOfRevisions); - } - catch (WikiApiException e) { - fail(e.getMessage(), e); - } - } - - @Test - @Disabled - public void lastRevisionTest() { - Calendar calendar = Calendar.getInstance(); - calendar.set(2008, 10, 10, 10, 10, 10); - - String pageName = "Car"; - try { - int pageId = wiki.getPage(pageName).getPageId(); - - Timestamp lastRevisionTimestamp = revisionApi.getLastDateOfAppearance(pageId); - Revision revision = revisionApi.getRevision(pageId, lastRevisionTimestamp); - // FIXME the comparison shall hold - Check: flattened in one line vs. multiple lines - assertEquals(wiki.getPage(pageId).getText(), revision.getRevisionText()); - } - catch (WikiApiException e) { - fail(e.getMessage(), e); - } - } - - - @Test - public void lazyLoadingTest() { - Calendar calendar = Calendar.getInstance(); - calendar.set(2008, 10, 10, 10, 10, 10); - - try { - int pageId = wiki.getPage("Car").getPageId(); - - Timestamp lastRevisionTimestamp = revisionApi.getLastDateOfAppearance(pageId); - Revision revision = revisionApi.getRevision(pageId, lastRevisionTimestamp); - - Field privateStringField = Revision.class.getDeclaredField("revisionText"); - privateStringField.setAccessible(true); - - String fieldValue = (String) privateStringField.get(revision); - if (fieldValue != null) { - fail("Not lazy loaded!"); - } - - // trigger the load of the data - revision.getRevisionText(); - - fieldValue = (String) privateStringField.get(revision); - if (fieldValue == null) { - fail("Not lazy loaded!"); - } - - } catch (WikiApiException | SecurityException | NoSuchFieldException | - IllegalArgumentException | IllegalAccessException e) { - fail(e.getMessage(), e); - } - } +public class RevisionApiTest + extends BaseJWPLTest +{ + + private static Wikipedia wiki = null; + + private Timestamp convertToUTC(Timestamp ts) + { + + final LocalDateTime dt = LocalDateTime.ofInstant(ts.toInstant(), ZoneOffset.UTC); + return Timestamp.valueOf(dt); + } + + // The object under test + private RevisionApi revisionApi; + + /** + * Made this static so that following tests don't run if assumption fails. 
(With AT_Before, + * tests also would not be executed but marked as passed) This could be changed back as soon as + * JUnit ignored tests after failed assumptions + */ + @BeforeAll + public static void setupWikipedia() + { + DatabaseConfiguration db = obtainHSDLDBConfiguration("wikiapi_simple_20090119_stripped", + Language.simple_english); + try { + wiki = new Wikipedia(db); + } + catch (Exception e) { + fail("Wikipedia could not be initialized: " + e.getLocalizedMessage()); + } + assertNotNull(wiki); + } + + @BeforeEach + public void setupInstanceUnderTest() + { + try { + revisionApi = new RevisionApi(wiki.getDatabaseConfiguration()); + assertNotNull(revisionApi); + } + catch (WikiApiException e) { + fail("RevisionApi could not be initialized: " + e.getLocalizedMessage()); + } + } + + @AfterEach + public void cleanUpInstanceUnderTest() + { + if (revisionApi != null) { + try { + revisionApi.close(); + } + catch (SQLException e) { + fail("RevisionApi could not be shut down correctly: " + e.getLocalizedMessage()); + } + } + } + + @Test + public void getRevisionByTimestampTest() + { + Calendar calendar = Calendar.getInstance(); + calendar.set(2008, 10, 10, 10, 10, 10); + + try { + + int pageId = wiki.getPage("Car").getPageId(); + + Timestamp timestamp = new Timestamp(calendar.getTimeInMillis()); + Revision revision = revisionApi.getRevision(pageId, timestamp); + + assertEquals(1142935, revision.getRevisionID()); + assertEquals(0, revision.getFullRevisionID()); + assertEquals(349, revision.getRevisionCounter()); + assertEquals(pageId, revision.getArticleID()); + } + catch (WikiApiException e) { + fail(e.getMessage(), e); + } + } + + @Test + public void getRevisionByRevisionId() + { + Calendar calendar = Calendar.getInstance(); + calendar.set(2008, 10, 10, 10, 10, 10); + + try { + int pageId = wiki.getPage("Car").getPageId(); + + Timestamp timestamp = new Timestamp(calendar.getTimeInMillis()); + + Revision revision1 = revisionApi.getRevision(1142935); + Revision revision2 = revisionApi.getRevision(pageId, timestamp); + + assertEquals(1142935, revision1.getRevisionID()); + assertEquals(0, revision1.getFullRevisionID()); + assertEquals(349, revision1.getRevisionCounter()); + + assertEquals(revision1.getRevisionID(), revision2.getRevisionID()); + assertEquals(revision1.getFullRevisionID(), revision2.getFullRevisionID()); + assertEquals(revision1.getRevisionCounter(), revision2.getRevisionCounter()); + assertEquals(revision1.getArticleID(), revision2.getArticleID()); + + } + catch (WikiApiException e) { + fail(e.getMessage(), e); + } + } + + @Test + public void getRevisionByRevisionCounter() + { + Calendar calendar = Calendar.getInstance(); + calendar.set(2008, 10, 10, 10, 10, 10); + + try { + int pageId = wiki.getPage("Car").getPageId(); + + Timestamp timestamp = new Timestamp(calendar.getTimeInMillis()); + + Revision revision1 = revisionApi.getRevision(pageId, 349); + Revision revision2 = revisionApi.getRevision(pageId, timestamp); + + assertEquals(1142935, revision1.getRevisionID()); + assertEquals(0, revision1.getFullRevisionID()); + assertEquals(349, revision1.getRevisionCounter()); + + assertEquals(revision1.getRevisionID(), revision2.getRevisionID()); + assertEquals(revision1.getFullRevisionID(), revision2.getFullRevisionID()); + assertEquals(revision1.getRevisionCounter(), revision2.getRevisionCounter()); + assertEquals(revision1.getArticleID(), revision2.getArticleID()); + + } + catch (WikiApiException e) { + fail(e.getMessage(), e); + } + } + + @Test + public void articleIDTests() + { + 
Calendar calendar = Calendar.getInstance(); + calendar.set(2008, 10, 10, 10, 10, 10); + + try { + int pageId = wiki.getPage("Car").getPageId(); + + Timestamp firstDayOfAppearance = convertToUTC( + revisionApi.getFirstDateOfAppearance(pageId)); + Timestamp lastDayOfAppearance = convertToUTC( + revisionApi.getLastDateOfAppearance(pageId)); + int nrOfRevisions = revisionApi.getNumberOfRevisions(pageId); + + assertEquals("2004-04-07 00:31:34.0", firstDayOfAppearance.toString()); + assertEquals("2009-01-19 03:58:09.0", lastDayOfAppearance.toString()); + assertEquals(382, nrOfRevisions); + } + catch (WikiApiException e) { + fail(e.getMessage(), e); + } + } + + @Test + @Disabled + public void lastRevisionTest() + { + Calendar calendar = Calendar.getInstance(); + calendar.set(2008, 10, 10, 10, 10, 10); + + String pageName = "Car"; + try { + int pageId = wiki.getPage(pageName).getPageId(); + + Timestamp lastRevisionTimestamp = revisionApi.getLastDateOfAppearance(pageId); + Revision revision = revisionApi.getRevision(pageId, lastRevisionTimestamp); + // FIXME the comparison shall hold - Check: flattened in one line vs. multiple lines + assertEquals(wiki.getPage(pageId).getText(), revision.getRevisionText()); + } + catch (WikiApiException e) { + fail(e.getMessage(), e); + } + } + + @Test + public void lazyLoadingTest() + { + Calendar calendar = Calendar.getInstance(); + calendar.set(2008, 10, 10, 10, 10, 10); + + try { + int pageId = wiki.getPage("Car").getPageId(); + + Timestamp lastRevisionTimestamp = revisionApi.getLastDateOfAppearance(pageId); + Revision revision = revisionApi.getRevision(pageId, lastRevisionTimestamp); + + Field privateStringField = Revision.class.getDeclaredField("revisionText"); + privateStringField.setAccessible(true); + + String fieldValue = (String) privateStringField.get(revision); + if (fieldValue != null) { + fail("Not lazy loaded!"); + } + + // trigger the load of the data + revision.getRevisionText(); + + fieldValue = (String) privateStringField.get(revision); + if (fieldValue == null) { + fail("Not lazy loaded!"); + } + + } + catch (WikiApiException | SecurityException | NoSuchFieldException + | IllegalArgumentException | IllegalAccessException e) { + fail(e.getMessage(), e); + } + } } diff --git a/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionIteratorTest.java b/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionIteratorTest.java index 19a2ec40..1830653c 100644 --- a/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionIteratorTest.java +++ b/dkpro-jwpl-revisionmachine/src/test/java/org/dkpro/jwpl/revisionmachine/RevisionIteratorTest.java @@ -37,124 +37,133 @@ import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; -public class RevisionIteratorTest extends BaseJWPLTest { +public class RevisionIteratorTest + extends BaseJWPLTest +{ // Note: In the stripped HSQLDB data set only 382 revisions exist for the Page 'Car' - private static final int GLOBAL_REVISION_COUNT = 382; + private static final int GLOBAL_REVISION_COUNT = 382; private static Wikipedia wiki = null; - private static RevisionAPIConfiguration config = null; - - // The object under test - private RevisionIterator revisionIterator = null; - - /** - * Made this static so that following tests don't run if assumption fails. 
- * (With AT_Before, tests also would not be executed but marked as passed) - * This could be changed back as soon as JUnit ignored tests after failed - * assumptions - */ - @BeforeAll - public static void setupWikipedia() { - DatabaseConfiguration db = obtainHSDLDBConfiguration( - "wikiapi_simple_20090119_stripped", Language.simple_english); - try { - wiki = new Wikipedia(db); - config = new RevisionAPIConfiguration(wiki.getDatabaseConfiguration()); - } catch (Exception e) { - fail("Wikipedia could not be initialized: " + e.getLocalizedMessage()); - } - assertNotNull(wiki); - assertNotNull(config); - - } - - @BeforeEach - public void setupInstanceUnderTest() - { - try { - revisionIterator = new RevisionIterator(config); - assertNotNull(revisionIterator); - } catch (WikiApiException e) { - fail("RevisionIterator could not be initialized: " + e.getLocalizedMessage()); - } - } - - @AfterEach - public void cleanUpInstanceUnderTest() - { - if(revisionIterator!=null) { - try { - revisionIterator.close(); - } catch (SQLException e) { - fail("RevisionIterator could not be shut down correctly: " + e.getLocalizedMessage()); - } - } - } - - @Test - public void iteratorTest() { - - int i = 0; - - while (revisionIterator.hasNext() && i < 500) { - Revision revision = revisionIterator.next(); - assertNotNull(revision); - assertTrue(revision.getArticleID() > 0); + private static RevisionAPIConfiguration config = null; + + // The object under test + private RevisionIterator revisionIterator = null; + + /** + * Made this static so that following tests don't run if assumption fails. (With AT_Before, + * tests also would not be executed but marked as passed) This could be changed back as soon as + * JUnit ignored tests after failed assumptions + */ + @BeforeAll + public static void setupWikipedia() + { + DatabaseConfiguration db = obtainHSDLDBConfiguration("wikiapi_simple_20090119_stripped", + Language.simple_english); + try { + wiki = new Wikipedia(db); + config = new RevisionAPIConfiguration(wiki.getDatabaseConfiguration()); + } + catch (Exception e) { + fail("Wikipedia could not be initialized: " + e.getLocalizedMessage()); + } + assertNotNull(wiki); + assertNotNull(config); + + } + + @BeforeEach + public void setupInstanceUnderTest() + { + try { + revisionIterator = new RevisionIterator(config); + assertNotNull(revisionIterator); + } + catch (WikiApiException e) { + fail("RevisionIterator could not be initialized: " + e.getLocalizedMessage()); + } + } + + @AfterEach + public void cleanUpInstanceUnderTest() + { + if (revisionIterator != null) { + try { + revisionIterator.close(); + } + catch (SQLException e) { + fail("RevisionIterator could not be shut down correctly: " + + e.getLocalizedMessage()); + } + } + } + + @Test + public void iteratorTest() + { + + int i = 0; + + while (revisionIterator.hasNext() && i < 500) { + Revision revision = revisionIterator.next(); + assertNotNull(revision); + assertTrue(revision.getArticleID() > 0); assertTrue(revision.getFullRevisionID() > 0); assertTrue(revision.getRevisionCounter() > 0); assertNotNull(revision.getRevisionText()); assertNotNull(revision.getTimeStamp()); - i++; - } - - assertEquals(GLOBAL_REVISION_COUNT, i); - } + i++; + } + assertEquals(GLOBAL_REVISION_COUNT, i); + } - @Test - public void lazyLoadingTest() { - ArrayList texts = new ArrayList<>(); - int i = 0; + @Test + public void lazyLoadingTest() + { + ArrayList texts = new ArrayList<>(); + int i = 0; - while (revisionIterator.hasNext() && i < 500) { - Revision revision = revisionIterator.next(); + 
while (revisionIterator.hasNext() && i < 500) { + Revision revision = revisionIterator.next(); assertNotNull(revision); - texts.add(revision.getRevisionText()); - i++; - } + texts.add(revision.getRevisionText()); + i++; + } assertEquals(GLOBAL_REVISION_COUNT, i); - ArrayList lazyLoadedTexts = new ArrayList<>(); - i = 0; - - //create new iterator with lazy loading - try{ - revisionIterator = new RevisionIterator(config, true); - }catch (WikiApiException e) { - fail("RevisionIterator could not be initialized with lazy loading = 'true': " + e.getLocalizedMessage()); - } - - while (revisionIterator.hasNext() && i < 1000) { - Revision revision = revisionIterator.next(); - lazyLoadedTexts.add(revision.getRevisionText()); - i++; - } + ArrayList lazyLoadedTexts = new ArrayList<>(); + i = 0; + + // create new iterator with lazy loading + try { + revisionIterator = new RevisionIterator(config, true); + } + catch (WikiApiException e) { + fail("RevisionIterator could not be initialized with lazy loading = 'true': " + + e.getLocalizedMessage()); + } + + while (revisionIterator.hasNext() && i < 1000) { + Revision revision = revisionIterator.next(); + lazyLoadedTexts.add(revision.getRevisionText()); + i++; + } assertEquals(GLOBAL_REVISION_COUNT, i); - for (int j = 0; j < texts.size(); j++) { - if(!texts.get(j).equals(lazyLoadedTexts.get(j))){ - fail(); - } - } - //close iterator - try { - revisionIterator.close(); - } - catch (SQLException e) { + for (int j = 0; j < texts.size(); j++) { + if (!texts.get(j).equals(lazyLoadedTexts.get(j))) { + fail(); + } + } + // close iterator + try { + revisionIterator.close(); + } + catch (SQLException e) { fail("RevisionIterator could not be shut down correctly: " + e.getLocalizedMessage()); - } + } - } + } } From 46cf450c202fb08f34f5174ff33edcd6f136c032 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 31 Oct 2023 14:27:21 +0100 Subject: [PATCH 10/14] #164 - Introduce checkstyle - Auto-format dkpro-jwpl-timemachine --- .../timemachine/domain/JWPLTimeMachine.java | 85 +- .../jwpl/timemachine/domain/Revision.java | 119 ++- .../jwpl/timemachine/domain/SettingsXML.java | 158 +-- .../timemachine/domain/TimeMachineFiles.java | 169 +-- .../domain/TimeMachineGenerator.java | 212 ++-- .../version/DumpVersionFastUtilIntKey.java | 579 ++++++----- .../dump/version/DumpVersionJDKGeneric.java | 626 ++++++------ .../version/DumpVersionJDKIntKeyFactory.java | 25 +- .../version/DumpVersionJDKLongKeyFactory.java | 25 +- .../DumpVersionJDKStringKeyFactory.java | 25 +- .../dump/version/OriginalDumpVersion.java | 967 +++++++++--------- .../jwpl/timemachine/dump/xml/PageReader.java | 61 +- .../jwpl/timemachine/dump/xml/PageWriter.java | 103 +- .../timemachine/dump/xml/RevisionReader.java | 61 +- .../timemachine/dump/xml/RevisionWriter.java | 78 +- .../jwpl/timemachine/dump/xml/TextReader.java | 59 +- .../jwpl/timemachine/dump/xml/TextWriter.java | 70 +- .../dump/xml/TimeMachineRevisionParser.java | 30 +- .../dump/xml/XMLDumpTableInputStream.java | 127 +-- .../xml/XMLDumpTableInputStreamThread.java | 122 ++- 20 files changed, 1973 insertions(+), 1728 deletions(-) diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/JWPLTimeMachine.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/JWPLTimeMachine.java index 45810b2e..13af4a4b 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/JWPLTimeMachine.java +++ 
b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/JWPLTimeMachine.java @@ -25,60 +25,63 @@ /** * This is the main class of the DBMapping Tool of the JWPL.
- * The main method gets the path of a configuration file as - * argument
+ * The main method gets the path of a configuration file as argument
*
*

* Refactored on 16 April 2009 by Ivan Galkin . */ -public class JWPLTimeMachine { +public class JWPLTimeMachine +{ - private static final IEnvironmentFactory environmentFactory = SpringFactory.getInstance(); + private static final IEnvironmentFactory environmentFactory = SpringFactory.getInstance(); - private static final long startTime = System.currentTimeMillis(); - private static final ILogger logger = environmentFactory.getLogger(); + private static final long startTime = System.currentTimeMillis(); + private static final ILogger logger = environmentFactory.getLogger(); - /** - * Checks given arguments - * - * @param args
- * args[0] the settings file like described in - * {@link SettingsXML}
- * @return true if all necessary arguments are given and false otherwise - * @see SettingsXML - */ - private static boolean checkArgs(String[] args) { - boolean result = (args.length > 0); - if (!result) { - System.out.println("Usage: java -jar JWPLTimeMachine.jar "); + /** + * Checks given arguments + * + * @param args + *
+ * args[0] the settings file like described in {@link SettingsXML}
+ * @return true if all necessary arguments are given and false otherwise + * @see SettingsXML + */ + private static boolean checkArgs(String[] args) + { + boolean result = (args.length > 0); + if (!result) { + System.out.println("Usage: java -jar JWPLTimeMachine.jar "); + } + return result; } - return result; - } - public static void main(String[] args) { + public static void main(String[] args) + { - try { - if (checkArgs(args)) { - logger.log("parsing configuration file...."); - Configuration config = SettingsXML.loadConfiguration(args[0], logger); - TimeMachineFiles files = SettingsXML.loadFiles(args[0], logger); + try { + if (checkArgs(args)) { + logger.log("parsing configuration file...."); + Configuration config = SettingsXML.loadConfiguration(args[0], logger); + TimeMachineFiles files = SettingsXML.loadFiles(args[0], logger); - if (config != null && files != null) { - if (files.checkAll() && config.checkTimeConfig()) { - logger.log("processing data ..."); + if (config != null && files != null) { + if (files.checkAll() && config.checkTimeConfig()) { + logger.log("processing data ..."); - ISnapshotGenerator generator = environmentFactory - .getSnapshotGenerator(); - generator.setConfiguration(config); - generator.setFiles(files); - generator.start(); + ISnapshotGenerator generator = environmentFactory.getSnapshotGenerator(); + generator.setConfiguration(config); + generator.setFiles(files); + generator.start(); - logger.log("End of the application. Working time = " + (System.currentTimeMillis() - startTime) + " ms"); - } + logger.log("End of the application. Working time = " + + (System.currentTimeMillis() - startTime) + " ms"); + } + } + } + } + catch (Exception e) { + logger.log(e); } - } - } catch (Exception e) { - logger.log(e); } - } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/Revision.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/Revision.java index 9baf8bac..1c200bdf 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/Revision.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/Revision.java @@ -20,68 +20,75 @@ /** * Routines for the conversion of the Wikipedia revisions */ -public class Revision { +public class Revision +{ - /** - * Calendar.getInstance().set(2000,0,1) out relative time zero to saving - * memory - */ - private static final Long TIME_ZERO = 946724195435L; - /** - * We measure the time not from 1th January 1900 but from 1th January 2000 - */ - private static final Integer MS_IN_SEC = 1000; + /** + * Calendar.getInstance().set(2000,0,1) out relative time zero to saving memory + */ + private static final Long TIME_ZERO = 946724195435L; + /** + * We measure the time not from 1th January 1900 but from 1th January 2000 + */ + private static final Integer MS_IN_SEC = 1000; - private Revision() { + private Revision() + { - } + } - /** - * Compress time from long- to the integer-format: reduce the resolution to - * "seconds" and zero time to 1th January 2000 - * - * @param date date/time in the long format - * @return date/time in the compressed integer format - */ - public static int compressTime(long date) { - Long lowResolutionDate = (date - TIME_ZERO) / MS_IN_SEC; - return lowResolutionDate.intValue(); - } + /** + * Compress time from long- to the integer-format: reduce the resolution to "seconds" and zero + * time to 1th January 2000 + * + * @param date + * date/time in the long format + * @return date/time in the compressed 
integer format + */ + public static int compressTime(long date) + { + Long lowResolutionDate = (date - TIME_ZERO) / MS_IN_SEC; + return lowResolutionDate.intValue(); + } - /** - * Extract time, that was compressed with - * {@link Revision#compressTime(long)} - * - * @param compressedDate compressed date/time in the integer format - * @return date/time in the long format - */ - public static long extractTime(int compressedDate) { - return (long) compressedDate * MS_IN_SEC + TIME_ZERO; - } + /** + * Extract time, that was compressed with {@link Revision#compressTime(long)} + * + * @param compressedDate + * compressed date/time in the integer format + * @return date/time in the long format + */ + public static long extractTime(int compressedDate) + { + return (long) compressedDate * MS_IN_SEC + TIME_ZERO; + } - /** - * Merge two unsigned integer values (text id and time stamp) to one long - * value (revision) to use GNU Trove container. - */ - public static long createRevision(int textId, int timestamp) { - return (long) textId << 32 | (long) timestamp; - } + /** + * Merge two unsigned integer values (text id and time stamp) to one long value (revision) to + * use GNU Trove container. + */ + public static long createRevision(int textId, int timestamp) + { + return (long) textId << 32 | (long) timestamp; + } - /** - * Extract a time stamp from the revision long. - * - * @return time stamp - */ - public static int getTimestamp(long revision) { - return (int) (revision & 0x00000000FFFFFFFFL); - } + /** + * Extract a time stamp from the revision long. + * + * @return time stamp + */ + public static int getTimestamp(long revision) + { + return (int) (revision & 0x00000000FFFFFFFFL); + } - /** - * Extract a text ID from the revision long - * - * @return text ID - */ - public static int getTextId(long revision) { - return (int) (revision >>> 32); - } + /** + * Extract a text ID from the revision long + * + * @return text ID + */ + public static int getTextId(long revision) + { + return (int) (revision >>> 32); + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/SettingsXML.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/SettingsXML.java index 2474c223..da057e58 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/SettingsXML.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/SettingsXML.java @@ -32,89 +32,93 @@ * This is a utility class that generates a template for the configuration file
* The template must be edited prior to be used for the DBMapping tool.
*/ -public class SettingsXML { +public class SettingsXML +{ + + public static final String OUTPUT_DIRECTORY = "outputDirectory"; + public static final String CATEGORY_LINKS_FILE = "categoryLinksFile"; + public static final String PAGE_LINKS_FILE = "pageLinksFile"; + public static final String META_HISTORY_FILE = "metaHistoryFile"; + public static final String EACH = "each"; + public static final String TO_TIMESTAMP = "toTimestamp"; + public static final String FROM_TIMESTAMP = "fromTimestamp"; + public static final String DISAMBIGUATION_CATEGORY = "disambiguationCategory"; + public static final String MAIN_CATEGORY = "mainCategory"; + public static final String LANGUAGE = "language"; + + private static final String DESCRIPTION = "This a configuration formular for the DBMapping Tool of the JWPL"; + private static final String PLACEHOLDER = "to be edited"; + + public static void generateSample(String outputFileName) throws IOException + { + + Properties p = new Properties(); + p.put(LANGUAGE, PLACEHOLDER); + p.put(MAIN_CATEGORY, PLACEHOLDER); + p.put(DISAMBIGUATION_CATEGORY, PLACEHOLDER); + p.put(FROM_TIMESTAMP, PLACEHOLDER); + p.put(TO_TIMESTAMP, PLACEHOLDER); + p.put(EACH, PLACEHOLDER); + p.put(META_HISTORY_FILE, PLACEHOLDER); + p.put(PAGE_LINKS_FILE, PLACEHOLDER); + p.put(CATEGORY_LINKS_FILE, PLACEHOLDER); + p.put(OUTPUT_DIRECTORY, PLACEHOLDER); + p.storeToXML(new BufferedOutputStream(new FileOutputStream(outputFileName)), DESCRIPTION); - public static final String OUTPUT_DIRECTORY = "outputDirectory"; - public static final String CATEGORY_LINKS_FILE = "categoryLinksFile"; - public static final String PAGE_LINKS_FILE = "pageLinksFile"; - public static final String META_HISTORY_FILE = "metaHistoryFile"; - public static final String EACH = "each"; - public static final String TO_TIMESTAMP = "toTimestamp"; - public static final String FROM_TIMESTAMP = "fromTimestamp"; - public static final String DISAMBIGUATION_CATEGORY = "disambiguationCategory"; - public static final String MAIN_CATEGORY = "mainCategory"; - public static final String LANGUAGE = "language"; - - private static final String DESCRIPTION = "This a configuration formular for the DBMapping Tool of the JWPL"; - private static final String PLACEHOLDER = "to be edited"; - - - public static void generateSample(String outputFileName) throws IOException { - - Properties p = new Properties(); - p.put(LANGUAGE, PLACEHOLDER); - p.put(MAIN_CATEGORY, PLACEHOLDER); - p.put(DISAMBIGUATION_CATEGORY, PLACEHOLDER); - p.put(FROM_TIMESTAMP, PLACEHOLDER); - p.put(TO_TIMESTAMP, PLACEHOLDER); - p.put(EACH, PLACEHOLDER); - p.put(META_HISTORY_FILE, PLACEHOLDER); - p.put(PAGE_LINKS_FILE, PLACEHOLDER); - p.put(CATEGORY_LINKS_FILE, PLACEHOLDER); - p.put(OUTPUT_DIRECTORY, PLACEHOLDER); - p.storeToXML(new BufferedOutputStream(new FileOutputStream(outputFileName)), DESCRIPTION); - - } - - public static Configuration loadConfiguration(String configFile, ILogger logger) { - Configuration result; - try { - result = new Configuration(logger); - Properties properties = new Properties(); - properties.loadFromXML(new BufferedInputStream(new FileInputStream(configFile))); - - result.setLanguage(properties.get(LANGUAGE).toString()); - result.setMainCategory(properties.get(MAIN_CATEGORY).toString()); - result.setDisambiguationCategory(properties.get( - DISAMBIGUATION_CATEGORY).toString()); - result.setFromTimestamp(TimestampUtil.parse(properties.get( - FROM_TIMESTAMP).toString())); - result.setToTimestamp(TimestampUtil.parse(properties.get( - 
TO_TIMESTAMP).toString())); - result.setEach(Integer.parseInt(properties.get(EACH).toString())); - } catch (Exception e) { - result = null; } - return result; - } - - public static TimeMachineFiles loadFiles(String configFile, ILogger logger) { - TimeMachineFiles result; - try { - Properties properties = new Properties(); - properties.loadFromXML(new BufferedInputStream(new FileInputStream(configFile))); - result = new TimeMachineFiles(logger); - result.setMetaHistoryFile(properties.get(META_HISTORY_FILE).toString()); - result.setPageLinksFile(properties.get(PAGE_LINKS_FILE).toString()); - result.setCategoryLinksFile(properties.get(CATEGORY_LINKS_FILE).toString()); - result.setOutputDirectory(properties.get(OUTPUT_DIRECTORY).toString()); - } catch (Exception e) { - logger.log("Could not load config file " + configFile); - result = null; + public static Configuration loadConfiguration(String configFile, ILogger logger) + { + Configuration result; + try { + result = new Configuration(logger); + Properties properties = new Properties(); + properties.loadFromXML(new BufferedInputStream(new FileInputStream(configFile))); + + result.setLanguage(properties.get(LANGUAGE).toString()); + result.setMainCategory(properties.get(MAIN_CATEGORY).toString()); + result.setDisambiguationCategory(properties.get(DISAMBIGUATION_CATEGORY).toString()); + result.setFromTimestamp(TimestampUtil.parse(properties.get(FROM_TIMESTAMP).toString())); + result.setToTimestamp(TimestampUtil.parse(properties.get(TO_TIMESTAMP).toString())); + result.setEach(Integer.parseInt(properties.get(EACH).toString())); + } + catch (Exception e) { + result = null; + } + return result; } - return result; - } - public static void main(String[] args) { - if (args.length > 0) { - try { - generateSample(args[0]); - } catch (IOException e) { - e.printStackTrace(); - } + public static TimeMachineFiles loadFiles(String configFile, ILogger logger) + { + TimeMachineFiles result; + try { + Properties properties = new Properties(); + properties.loadFromXML(new BufferedInputStream(new FileInputStream(configFile))); + result = new TimeMachineFiles(logger); + + result.setMetaHistoryFile(properties.get(META_HISTORY_FILE).toString()); + result.setPageLinksFile(properties.get(PAGE_LINKS_FILE).toString()); + result.setCategoryLinksFile(properties.get(CATEGORY_LINKS_FILE).toString()); + result.setOutputDirectory(properties.get(OUTPUT_DIRECTORY).toString()); + } + catch (Exception e) { + logger.log("Could not load config file " + configFile); + result = null; + } + return result; } - } + public static void main(String[] args) + { + if (args.length > 0) { + try { + generateSample(args[0]); + } + catch (IOException e) { + e.printStackTrace(); + } + } + + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineFiles.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineFiles.java index a108d7ad..2ee0be01 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineFiles.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineFiles.java @@ -24,81 +24,98 @@ import org.dkpro.jwpl.wikimachine.domain.Files; import org.dkpro.jwpl.wikimachine.util.TimestampUtil; -public class TimeMachineFiles extends Files { - - private static final String NO_CATEGORYLINKS = "category links file not found"; - private static final String NO_METAHISTORY = "meta history file not found"; - private static final String NO_PAGELINKS = "page links 
file not found"; - - private String metaHistoryFile; - private String pageLinksFile; - private String categoryLinksFile; - private String timeStamp = ""; - - public TimeMachineFiles(ILogger logger) { - super(logger); - } - - public TimeMachineFiles(TimeMachineFiles files) { - super(files); - this.metaHistoryFile = files.metaHistoryFile; - this.pageLinksFile = files.metaHistoryFile; - this.categoryLinksFile = files.categoryLinksFile; - } - - /** - * Add a sub-directory called "timestamp" to the current output directory - * - * @param timestamp - name of a new sub-directory - */ - public void setTimestamp(Timestamp timestamp) { - - timeStamp = TimestampUtil.toMediaWikiString(timestamp) + File.separator; - } - - public String getMetaHistoryFile() { - return metaHistoryFile; - } - - public void setMetaHistoryFile(String metaHistroyFile) { - this.metaHistoryFile = metaHistroyFile; - } - - public String getPageLinksFile() { - return pageLinksFile; - } - - public void setPageLinksFile(String pageLinksFile) { - this.pageLinksFile = pageLinksFile; - } - - public String getCategoryLinksFile() { - return categoryLinksFile; - } - - public void setCategoryLinksFile(String categoryLinksFile) { - this.categoryLinksFile = categoryLinksFile; - } - - public boolean checkInputFile(String fileName, String errorMessage) { - File inputFile = new File(fileName); - boolean result = inputFile.exists() && inputFile.canRead(); - if (!result) { - logger.log(errorMessage); +public class TimeMachineFiles + extends Files +{ + + private static final String NO_CATEGORYLINKS = "category links file not found"; + private static final String NO_METAHISTORY = "meta history file not found"; + private static final String NO_PAGELINKS = "page links file not found"; + + private String metaHistoryFile; + private String pageLinksFile; + private String categoryLinksFile; + private String timeStamp = ""; + + public TimeMachineFiles(ILogger logger) + { + super(logger); + } + + public TimeMachineFiles(TimeMachineFiles files) + { + super(files); + this.metaHistoryFile = files.metaHistoryFile; + this.pageLinksFile = files.metaHistoryFile; + this.categoryLinksFile = files.categoryLinksFile; + } + + /** + * Add a sub-directory called "timestamp" to the current output directory + * + * @param timestamp + * - name of a new sub-directory + */ + public void setTimestamp(Timestamp timestamp) + { + + timeStamp = TimestampUtil.toMediaWikiString(timestamp) + File.separator; + } + + public String getMetaHistoryFile() + { + return metaHistoryFile; + } + + public void setMetaHistoryFile(String metaHistroyFile) + { + this.metaHistoryFile = metaHistroyFile; + } + + public String getPageLinksFile() + { + return pageLinksFile; + } + + public void setPageLinksFile(String pageLinksFile) + { + this.pageLinksFile = pageLinksFile; + } + + public String getCategoryLinksFile() + { + return categoryLinksFile; + } + + public void setCategoryLinksFile(String categoryLinksFile) + { + this.categoryLinksFile = categoryLinksFile; + } + + public boolean checkInputFile(String fileName, String errorMessage) + { + File inputFile = new File(fileName); + boolean result = inputFile.exists() && inputFile.canRead(); + if (!result) { + logger.log(errorMessage); + } + return result; + } + + @Override + protected String getOutputPath(String fileName) + { + File outputSubDirectory = new File( + outputDirectory.getAbsolutePath() + File.separator + timeStamp); + outputSubDirectory.mkdir(); + return outputDirectory.getAbsolutePath() + File.separator + timeStamp + fileName; + } + + 
@Override + public boolean checkAll() + { + return checkOutputDirectory() && checkInputFile(metaHistoryFile, NO_METAHISTORY) + && checkInputFile(pageLinksFile, NO_PAGELINKS) + && checkInputFile(categoryLinksFile, NO_CATEGORYLINKS); } - return result; - } - - @Override - protected String getOutputPath(String fileName) { - File outputSubDirectory = new File(outputDirectory.getAbsolutePath() + File.separator + timeStamp); - outputSubDirectory.mkdir(); - return outputDirectory.getAbsolutePath() + File.separator + timeStamp + fileName; - } - - @Override - public boolean checkAll() { - return checkOutputDirectory() && checkInputFile(metaHistoryFile, NO_METAHISTORY) - && checkInputFile(pageLinksFile, NO_PAGELINKS) && checkInputFile(categoryLinksFile, NO_CATEGORYLINKS); - } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineGenerator.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineGenerator.java index 0a027295..316ef74c 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineGenerator.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/domain/TimeMachineGenerator.java @@ -36,159 +36,173 @@ import org.dkpro.jwpl.wikimachine.util.TimestampUtil; /** - * Generate dumps as .txt files for the JWPL database from given MediaWiki dump - * files.
- * By specifying a 'from' and a 'to' time stamps and the number of days to take - * as interval
+ * Generate dumps as .txt files for the JWPL database from given MediaWiki dump files.
+ * By specifying 'from' and 'to' time stamps and the number of days to take as the interval,
* this class produces multiple dump versions. */ -public class TimeMachineGenerator extends AbstractSnapshotGenerator { +public class TimeMachineGenerator + extends AbstractSnapshotGenerator +{ - private IDumpVersion[] versions = null; - private TimeMachineFiles initialFiles = null; + private IDumpVersion[] versions = null; + private TimeMachineFiles initialFiles = null; - public TimeMachineGenerator(IEnvironmentFactory environmentFactory) { - super(environmentFactory); - } + public TimeMachineGenerator(IEnvironmentFactory environmentFactory) + { + super(environmentFactory); + } - @Override - public void setFiles(Files files) { - initialFiles = (TimeMachineFiles) files; - } + @Override + public void setFiles(Files files) + { + initialFiles = (TimeMachineFiles) files; + } - private Integer calculateSnapshotsCount(Timestamp from, Timestamp to, - Integer dayInterval) { - Integer result = 0; + private Integer calculateSnapshotsCount(Timestamp from, Timestamp to, Integer dayInterval) + { + Integer result = 0; - for (Timestamp i = from; i.before(to); i = TimestampUtil.getNextTimestamp(i, dayInterval)) { - result++; - } + for (Timestamp i = from; i.before(to); i = TimestampUtil.getNextTimestamp(i, dayInterval)) { + result++; + } - return result; - } + return result; + } - @Override - public void start() throws Exception { + @Override + public void start() throws Exception + { - Timestamp fromTimestamp = configuration.getFromTimestamp(); - Timestamp toTimestamp = configuration.getToTimestamp(); - int each = configuration.getEach(); + Timestamp fromTimestamp = configuration.getFromTimestamp(); + Timestamp toTimestamp = configuration.getToTimestamp(); + int each = configuration.getEach(); - int snapshotsCount = fromTimestamp.equals(toTimestamp) ? 1 - : calculateSnapshotsCount(fromTimestamp, toTimestamp, each); + int snapshotsCount = fromTimestamp.equals(toTimestamp) ? 
1 + : calculateSnapshotsCount(fromTimestamp, toTimestamp, each); - if (snapshotsCount > 0) { + if (snapshotsCount > 0) { - versions = new IDumpVersion[snapshotsCount]; - logger.log("Dumps to be generated:"); + versions = new IDumpVersion[snapshotsCount]; + logger.log("Dumps to be generated:"); - for (int i = 0; i < snapshotsCount; i++) { + for (int i = 0; i < snapshotsCount; i++) { - Timestamp currentTimestamp = TimestampUtil.getNextTimestamp(fromTimestamp, i * each); - logger.log(currentTimestamp); + Timestamp currentTimestamp = TimestampUtil.getNextTimestamp(fromTimestamp, + i * each); + logger.log(currentTimestamp); - MetaData commonMetaData = MetaData.initWithConfig(configuration); - commonMetaData.setTimestamp(currentTimestamp); + MetaData commonMetaData = MetaData.initWithConfig(configuration); + commonMetaData.setTimestamp(currentTimestamp); - IDumpVersion version = environmentFactory.getDumpVersion(); + IDumpVersion version = environmentFactory.getDumpVersion(); - version.initialize(currentTimestamp); - version.setMetaData(commonMetaData); - TimeMachineFiles currentFiles = new TimeMachineFiles( - initialFiles); - currentFiles.setTimestamp(currentTimestamp); - version.setFiles(currentFiles); - versions[i] = version; - } + version.initialize(currentTimestamp); + version.setMetaData(commonMetaData); + TimeMachineFiles currentFiles = new TimeMachineFiles(initialFiles); + currentFiles.setTimestamp(currentTimestamp); + version.setFiles(currentFiles); + versions[i] = version; + } - processInputDumps(); + processInputDumps(); - } else { - logger.log("No timestamps."); + } + else { + logger.log("No timestamps."); + } } - } - private void processInputDumps() throws IOException { + private void processInputDumps() throws IOException + { - dumpVersionProcessor.setDumpVersions(versions); + dumpVersionProcessor.setDumpVersions(versions); - logger.log("Processing the revision table"); - dumpVersionProcessor.processRevision(createRevisionParser()); + logger.log("Processing the revision table"); + dumpVersionProcessor.processRevision(createRevisionParser()); - logger.log("Processing the page table"); - dumpVersionProcessor.processPage(createPageParser()); + logger.log("Processing the page table"); + dumpVersionProcessor.processPage(createPageParser()); - logger.log("Processing the categorylinks table"); - dumpVersionProcessor.processCategorylinks(createCategorylinksParser()); + logger.log("Processing the categorylinks table"); + dumpVersionProcessor.processCategorylinks(createCategorylinksParser()); - logger.log("Processing the pagelinks table"); - dumpVersionProcessor.processPagelinks(createPagelinksParser()); + logger.log("Processing the pagelinks table"); + dumpVersionProcessor.processPagelinks(createPagelinksParser()); - logger.log("Processing the text table"); - dumpVersionProcessor.processText(createTextParser()); + logger.log("Processing the text table"); + dumpVersionProcessor.processText(createTextParser()); - logger.log("Writing meta data"); - dumpVersionProcessor.writeMetaData(); - } + logger.log("Writing meta data"); + dumpVersionProcessor.writeMetaData(); + } - private RevisionParser createRevisionParser() throws IOException { + private RevisionParser createRevisionParser() throws IOException + { - String metahistory = initialFiles.getMetaHistoryFile(); + String metahistory = initialFiles.getMetaHistoryFile(); - DumpTableInputStream revisionTableInputStream = environmentFactory.getDumpTableInputStream(); - revisionTableInputStream.initialize(decompressor.getInputStream(metahistory), 
DumpTableEnum.REVISION); + DumpTableInputStream revisionTableInputStream = environmentFactory + .getDumpTableInputStream(); + revisionTableInputStream.initialize(decompressor.getInputStream(metahistory), + DumpTableEnum.REVISION); - RevisionParser revisionParser = environmentFactory.getRevisionParser(); - revisionParser.setInputStream(revisionTableInputStream); + RevisionParser revisionParser = environmentFactory.getRevisionParser(); + revisionParser.setInputStream(revisionTableInputStream); - return revisionParser; + return revisionParser; - } + } - private PageParser createPageParser() throws IOException { + private PageParser createPageParser() throws IOException + { - String metahistory = initialFiles.getMetaHistoryFile(); + String metahistory = initialFiles.getMetaHistoryFile(); - DumpTableInputStream pageTableInputStream = environmentFactory.getDumpTableInputStream(); - pageTableInputStream.initialize(decompressor.getInputStream(metahistory), DumpTableEnum.PAGE); + DumpTableInputStream pageTableInputStream = environmentFactory.getDumpTableInputStream(); + pageTableInputStream.initialize(decompressor.getInputStream(metahistory), + DumpTableEnum.PAGE); - PageParser pageParser = environmentFactory.getPageParser(); - pageParser.setInputStream(pageTableInputStream); + PageParser pageParser = environmentFactory.getPageParser(); + pageParser.setInputStream(pageTableInputStream); - return pageParser; + return pageParser; - } + } - private CategorylinksParser createCategorylinksParser() throws IOException { + private CategorylinksParser createCategorylinksParser() throws IOException + { - String categorylinks = initialFiles.getCategoryLinksFile(); - InputStream categorylinksStream = decompressor.getInputStream(categorylinks); + String categorylinks = initialFiles.getCategoryLinksFile(); + InputStream categorylinksStream = decompressor.getInputStream(categorylinks); - return new CategorylinksParser(categorylinksStream); + return new CategorylinksParser(categorylinksStream); - } + } - private PagelinksParser createPagelinksParser() throws IOException { + private PagelinksParser createPagelinksParser() throws IOException + { - String pagelinks = initialFiles.getPageLinksFile(); + String pagelinks = initialFiles.getPageLinksFile(); - InputStream pagelinksStream = decompressor.getInputStream(pagelinks); - return new PagelinksParser(pagelinksStream); + InputStream pagelinksStream = decompressor.getInputStream(pagelinks); + return new PagelinksParser(pagelinksStream); - } + } - private TextParser createTextParser() throws IOException { + private TextParser createTextParser() throws IOException + { - String metahistory = initialFiles.getMetaHistoryFile(); + String metahistory = initialFiles.getMetaHistoryFile(); - DumpTableInputStream textTableInputStream = environmentFactory.getDumpTableInputStream(); - textTableInputStream.initialize(decompressor.getInputStream(metahistory), DumpTableEnum.TEXT); + DumpTableInputStream textTableInputStream = environmentFactory.getDumpTableInputStream(); + textTableInputStream.initialize(decompressor.getInputStream(metahistory), + DumpTableEnum.TEXT); - TextParser textParser = environmentFactory.getTextParser(); - textParser.setInputStream(textTableInputStream); + TextParser textParser = environmentFactory.getTextParser(); + textParser.setInputStream(textTableInputStream); - return textParser; + return textParser; - } + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionFastUtilIntKey.java 
b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionFastUtilIntKey.java index d77cd08a..46fbd022 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionFastUtilIntKey.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionFastUtilIntKey.java @@ -37,313 +37,340 @@ import it.unimi.dsi.fastutil.ints.IntArraySet; import it.unimi.dsi.fastutil.ints.IntSet; -public class DumpVersionFastUtilIntKey extends AbstractDumpVersion { - private static final String SQL_NULL = "NULL"; - /** - * maps page id's to Revision objects - */ - private HashMap pageIdRevMap; - /** - * after revision parsing the map will be erased and the keys sorted in the - * array list - */ - private IntSet pageIdRevList; - - /** - * caches the page id's of disambiguation pages. - */ - private IntSet disambiguations; - /** - * maps text id's to the page id's. - */ - private Int2IntOpenHashMap textIdPageIdMap; - /** - * maps page id's of pages to their names - */ - private Map pPageIdNameMap; - /** - * maps names of pages to their page id's. - */ - private Int2IntOpenHashMap pNamePageIdMap; - - /** - * maps names of categories to their page id's. - */ - private Int2IntOpenHashMap cNamePageIdMap; - - /** - * maps page id's of redirects to their names. - */ - private Map rPageIdNameMap; - - @Override - public void freeAfterCategoryLinksParsing() { - String message = "clearing cNamePageIdMap of " + cNamePageIdMap.size() + " objects"; - logger.log(message); - cNamePageIdMap.clear(); - } - - @Override - public void freeAfterPageLinksParsing() { - // nothing to free - } - - @Override - public void freeAfterPageParsing() { - logger.log("clearing pageIdRevSet of " + pageIdRevList.size() + " objects"); - pageIdRevList.clear(); - } - - @Override - public void freeAfterRevisonParsing() { - pageIdRevList = new IntArraySet(pageIdRevMap.keySet().size()); - for (int key : pageIdRevMap.keySet()) { - pageIdRevList.add(key); - } - - pageIdRevMap.clear(); - } - - @Override - public void freeAfterTextParsing() { - pageIdRevMap.clear(); - pageIdRevList.clear(); - disambiguations.clear(); - textIdPageIdMap.clear(); - pPageIdNameMap.clear(); - pNamePageIdMap.clear(); - cNamePageIdMap.clear(); - rPageIdNameMap.clear(); - } - - @Override - public void initialize(Timestamp timestamp) { - this.timestamp = Revision.compressTime(timestamp.getTime()); - - /* - * filled in revisions +public class DumpVersionFastUtilIntKey + extends AbstractDumpVersion +{ + private static final String SQL_NULL = "NULL"; + /** + * maps page id's to Revision objects + */ + private HashMap pageIdRevMap; + /** + * after revision parsing the map will be erased and the keys sorted in the array list */ - pageIdRevMap = new HashMap<>(); - textIdPageIdMap = new Int2IntOpenHashMap(); + private IntSet pageIdRevList; - /* - * filled in pages + /** + * caches the page id's of disambiguation pages. + */ + private IntSet disambiguations; + /** + * maps text id's to the page id's. + */ + private Int2IntOpenHashMap textIdPageIdMap; + /** + * maps page id's of pages to their names + */ + private Map pPageIdNameMap; + /** + * maps names of pages to their page id's. */ - pPageIdNameMap = new HashMap<>(); - pNamePageIdMap = new Int2IntOpenHashMap(); + private Int2IntOpenHashMap pNamePageIdMap; - cNamePageIdMap = new Int2IntOpenHashMap(); - rPageIdNameMap = new HashMap<>(); + /** + * maps names of categories to their page id's. 
+ */ + private Int2IntOpenHashMap cNamePageIdMap; - /* - * filled in categories + /** + * maps page id's of redirects to their names. */ - disambiguations = new IntArraySet(); - } - - @Override - public void processCategoryLinksRow(CategorylinksParser clParser) throws IOException { - String cl_to_text = clParser.getClTo(); - if (cl_to_text != null) { - int cl_to_textHashcode = cl_to_text.hashCode(); - // if category exists - - if (cNamePageIdMap.containsKey(cl_to_textHashcode)) { - int cl_to = cNamePageIdMap.get(cl_to_textHashcode); - // if the link source is a page then write the link in - // category_pages and page_categories - int cl_from = clParser.getClFrom(); - // if exists page - if (pPageIdNameMap.containsKey(cl_from)) { - processCategoryLinksRowPageExists(cl_from, cl_to, - cl_to_text); - } else { - processCategoryLinksRowPageMiss(cl_from, cl_to); + private Map rPageIdNameMap; + + @Override + public void freeAfterCategoryLinksParsing() + { + String message = "clearing cNamePageIdMap of " + cNamePageIdMap.size() + " objects"; + logger.log(message); + cNamePageIdMap.clear(); + } + + @Override + public void freeAfterPageLinksParsing() + { + // nothing to free + } + + @Override + public void freeAfterPageParsing() + { + logger.log("clearing pageIdRevSet of " + pageIdRevList.size() + " objects"); + pageIdRevList.clear(); + } + + @Override + public void freeAfterRevisonParsing() + { + pageIdRevList = new IntArraySet(pageIdRevMap.keySet().size()); + for (int key : pageIdRevMap.keySet()) { + pageIdRevList.add(key); } - } + + pageIdRevMap.clear(); } - } - private void processCategoryLinksRowPageExists(Integer cl_from, Integer cl_to, String cl_to_text) throws IOException { + @Override + public void freeAfterTextParsing() + { + pageIdRevMap.clear(); + pageIdRevList.clear(); + disambiguations.clear(); + textIdPageIdMap.clear(); + pPageIdNameMap.clear(); + pNamePageIdMap.clear(); + cNamePageIdMap.clear(); + rPageIdNameMap.clear(); + } + + @Override + public void initialize(Timestamp timestamp) + { + this.timestamp = Revision.compressTime(timestamp.getTime()); + + /* + * filled in revisions + */ + pageIdRevMap = new HashMap<>(); + textIdPageIdMap = new Int2IntOpenHashMap(); + + /* + * filled in pages + */ + pPageIdNameMap = new HashMap<>(); + pNamePageIdMap = new Int2IntOpenHashMap(); + + cNamePageIdMap = new Int2IntOpenHashMap(); + rPageIdNameMap = new HashMap<>(); + + /* + * filled in categories + */ + disambiguations = new IntArraySet(); + } - categoryPages.addRow(cl_to, cl_from); - pageCategories.addRow(cl_from, cl_to); - if (cl_to_text.equals(metaData.getDisambiguationCategory())) { - disambiguations.add(cl_from.intValue()); - metaData.addDisamb(); + @Override + public void processCategoryLinksRow(CategorylinksParser clParser) throws IOException + { + String cl_to_text = clParser.getClTo(); + if (cl_to_text != null) { + int cl_to_textHashcode = cl_to_text.hashCode(); + // if category exists + + if (cNamePageIdMap.containsKey(cl_to_textHashcode)) { + int cl_to = cNamePageIdMap.get(cl_to_textHashcode); + // if the link source is a page then write the link in + // category_pages and page_categories + int cl_from = clParser.getClFrom(); + // if exists page + if (pPageIdNameMap.containsKey(cl_from)) { + processCategoryLinksRowPageExists(cl_from, cl_to, cl_to_text); + } + else { + processCategoryLinksRowPageMiss(cl_from, cl_to); + } + } + } + } + + private void processCategoryLinksRowPageExists(Integer cl_from, Integer cl_to, + String cl_to_text) + throws IOException + { + + 
categoryPages.addRow(cl_to, cl_from); + pageCategories.addRow(cl_from, cl_to); + if (cl_to_text.equals(metaData.getDisambiguationCategory())) { + disambiguations.add(cl_from.intValue()); + metaData.addDisamb(); + } } - } - private void processCategoryLinksRowPageMiss(Integer cl_from, Integer cl_to) throws IOException { - // if category page id exists - if (cNamePageIdMap.containsValue(cl_from.intValue())) { - categoryOutlinks.addRow(cl_to, cl_from); - categoryInlinks.addRow(cl_from, cl_to); + private void processCategoryLinksRowPageMiss(Integer cl_from, Integer cl_to) throws IOException + { + // if category page id exists + if (cNamePageIdMap.containsValue(cl_from.intValue())) { + categoryOutlinks.addRow(cl_to, cl_from); + categoryInlinks.addRow(cl_from, cl_to); + } } - } - - @Override - public void processPageLinksRow(PagelinksParser plParser) throws IOException { - int pl_from = plParser.getPlFrom(); - String pl_to = plParser.getPlTo(); - if (pl_to != null) { - int pl_toHashcode = pl_to.hashCode(); - // if page name and page page id exists - - if ((!skipPage || pPageIdNameMap.containsKey(pl_from)) - && pNamePageIdMap.containsKey(pl_toHashcode)) { - int id = pNamePageIdMap.get(pl_toHashcode); - pageOutlinks.addRow(pl_from, id); - pageInlinks.addRow(id, pl_from); - } + + @Override + public void processPageLinksRow(PagelinksParser plParser) throws IOException + { + int pl_from = plParser.getPlFrom(); + String pl_to = plParser.getPlTo(); + if (pl_to != null) { + int pl_toHashcode = pl_to.hashCode(); + // if page name and page page id exists + + if ((!skipPage || pPageIdNameMap.containsKey(pl_from)) + && pNamePageIdMap.containsKey(pl_toHashcode)) { + int id = pNamePageIdMap.get(pl_toHashcode); + pageOutlinks.addRow(pl_from, id); + pageInlinks.addRow(id, pl_from); + } + } } - } - - @Override - public void processPageRow(PageParser pageParser) throws IOException { - switch (pageParser.getPageNamespace()) { - case NS_CATEGORY: { - processPageRowCategory(pageParser); - break; - } - case NS_MAIN: { - processPageRowPage(pageParser); - break; - } + + @Override + public void processPageRow(PageParser pageParser) throws IOException + { + switch (pageParser.getPageNamespace()) { + case NS_CATEGORY: { + processPageRowCategory(pageParser); + break; + } + case NS_MAIN: { + processPageRowPage(pageParser); + break; + } + } } - } - - private void processPageRowCategory(PageParser pageParser) throws IOException { - if (!(skipCategory && pageParser.getPageIsRedirect())) { - // retrieve page id and page title - int page_id = pageParser.getPageId(); - // ignore categories, which have no revisions before the time stamp - String page_title = pageParser.getPageTitle(); - if (page_title != null && pageIdRevList.contains(page_id)) { - // cache the retrieved values - // record category - cNamePageIdMap.put(page_title.hashCode(), page_id); - // write a new row in the table Category. 
- // Note that we also consider the page_id as id - txtFW.addRow(page_id, page_id, page_title); - metaData.addCategory(); - } + + private void processPageRowCategory(PageParser pageParser) throws IOException + { + if (!(skipCategory && pageParser.getPageIsRedirect())) { + // retrieve page id and page title + int page_id = pageParser.getPageId(); + // ignore categories, which have no revisions before the time stamp + String page_title = pageParser.getPageTitle(); + if (page_title != null && pageIdRevList.contains(page_id)) { + // cache the retrieved values + // record category + cNamePageIdMap.put(page_title.hashCode(), page_id); + // write a new row in the table Category. + // Note that we also consider the page_id as id + txtFW.addRow(page_id, page_id, page_title); + metaData.addCategory(); + } + } } - } - - private void processPageRowPage(PageParser pageParser) throws IOException { - // retrieve page id and title - int page_id = pageParser.getPageId(); - // ignore pages, which have no revisions prior to the time stamp - String page_title = pageParser.getPageTitle(); - if (page_title != null && pageIdRevList.contains(page_id)) { - // distinguish redirects - if (pageParser.getPageIsRedirect()) { - // record redirect - rPageIdNameMap.put(page_id, page_title); - } else { - // record page - pPageIdNameMap.put(page_id, page_title); - pNamePageIdMap.put(page_title.hashCode(), page_id); - } + + private void processPageRowPage(PageParser pageParser) throws IOException + { + // retrieve page id and title + int page_id = pageParser.getPageId(); + // ignore pages, which have no revisions prior to the time stamp + String page_title = pageParser.getPageTitle(); + if (page_title != null && pageIdRevList.contains(page_id)) { + // distinguish redirects + if (pageParser.getPageIsRedirect()) { + // record redirect + rPageIdNameMap.put(page_id, page_title); + } + else { + // record page + pPageIdNameMap.put(page_id, page_title); + pNamePageIdMap.put(page_title.hashCode(), page_id); + } + } } - } - - @Override - public void processRevisionRow(RevisionParser revisionParser) { - // get the time stamp of the revision - int rev_timestamp = revisionParser.getRevTimestamp(); - if (rev_timestamp < timestamp) { - // get the rev_page (corresponds to page_id in the table page) - int rev_page = revisionParser.getRevPage(); - if (pageIdRevMap.containsKey(rev_page)) { - processRevisionRowContainsKey(revisionParser, rev_page, - rev_timestamp); - } else { - processRevisionRowMissKey(revisionParser, rev_page, - rev_timestamp); - } + + @Override + public void processRevisionRow(RevisionParser revisionParser) + { + // get the time stamp of the revision + int rev_timestamp = revisionParser.getRevTimestamp(); + if (rev_timestamp < timestamp) { + // get the rev_page (corresponds to page_id in the table page) + int rev_page = revisionParser.getRevPage(); + if (pageIdRevMap.containsKey(rev_page)) { + processRevisionRowContainsKey(revisionParser, rev_page, rev_timestamp); + } + else { + processRevisionRowMissKey(revisionParser, rev_page, rev_timestamp); + } + } + } - } + private void processRevisionRowContainsKey(RevisionParser revisionParser, int rev_page, + int rev_timestamp) + { - private void processRevisionRowContainsKey(RevisionParser revisionParser, int rev_page, int rev_timestamp) { + long revisionRecord = pageIdRevMap.get(rev_page); + int old_timestamp = Revision.getTimestamp(revisionRecord); - long revisionRecord = pageIdRevMap.get(rev_page); - int old_timestamp = Revision.getTimestamp(revisionRecord); + // is it a better 
time stamp ? + if (rev_timestamp > old_timestamp) { + int old_text_id = Revision.getTextId(revisionRecord); + pageIdRevMap.put(rev_page, + Revision.createRevision(revisionParser.getRevTextId(), rev_timestamp)); + textIdPageIdMap.remove(old_text_id); + textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); + } + } - // is it a better time stamp ? - if (rev_timestamp > old_timestamp) { - int old_text_id = Revision.getTextId(revisionRecord); - pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser - .getRevTextId(), rev_timestamp)); - textIdPageIdMap.remove(old_text_id); - textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); + private void processRevisionRowMissKey(RevisionParser revisionParser, int rev_page, + int rev_timestamp) + { + // this is the first recorded time stamp for that page id + pageIdRevMap.put(rev_page, + Revision.createRevision(revisionParser.getRevTextId(), rev_timestamp)); + textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); } - } - - private void processRevisionRowMissKey(RevisionParser revisionParser, int rev_page, int rev_timestamp) { - // this is the first recorded time stamp for that page id - pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser.getRevTextId(), rev_timestamp)); - textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); - } - - @Override - public void processTextRow(TextParser textParser) throws IOException { - int text_id = textParser.getOldId(); - - if (textIdPageIdMap.containsKey(text_id)) { - int page_id = textIdPageIdMap.get(text_id); - // if exists page page id -> page - if (pPageIdNameMap.containsKey(page_id)) { - processTextRowPage(textParser, page_id); - } else if (rPageIdNameMap.containsKey(page_id)) { - // if exists redirect -> redirect - processTextRowRedirect(textParser, page_id); - } + + @Override + public void processTextRow(TextParser textParser) throws IOException + { + int text_id = textParser.getOldId(); + + if (textIdPageIdMap.containsKey(text_id)) { + int page_id = textIdPageIdMap.get(text_id); + // if exists page page id -> page + if (pPageIdNameMap.containsKey(page_id)) { + processTextRowPage(textParser, page_id); + } + else if (rPageIdNameMap.containsKey(page_id)) { + // if exists redirect -> redirect + processTextRowRedirect(textParser, page_id); + } + } } - } - private void processTextRowPage(TextParser textParser, int page_id) throws IOException { - // get page name - String pageName = pPageIdNameMap.get(page_id); + private void processTextRowPage(TextParser textParser, int page_id) throws IOException + { + // get page name + String pageName = pPageIdNameMap.get(page_id); + + page.addRow(page_id, page_id, pageName, textParser.getOldText(), + formatBoolean(disambiguations.contains(page_id))); - page.addRow(page_id, page_id, pageName, textParser.getOldText(), - formatBoolean(disambiguations.contains(page_id))); + pageMapLine.addRow(page_id, pageName, page_id, SQL_NULL, SQL_NULL); + metaData.addPage(); + } - pageMapLine.addRow(page_id, pageName, page_id, SQL_NULL, SQL_NULL); - metaData.addPage(); - } + private void processTextRowRedirect(TextParser textParser, int page_id) throws IOException + { + String destination = Redirects.getRedirectDestination(textParser.getOldText()); - private void processTextRowRedirect(TextParser textParser, int page_id) - throws IOException { - String destination = Redirects.getRedirectDestination(textParser.getOldText()); + if (destination != null) { + // if page name exists - if (destination != null) { - // if page name exists + int 
destinationHashcode = destination.hashCode(); - int destinationHashcode = destination.hashCode(); + if (pNamePageIdMap.containsKey(destinationHashcode)) { + int id = pNamePageIdMap.get(destinationHashcode); + String redirectName = rPageIdNameMap.get(page_id); + pageRedirects.addRow(id, redirectName); + pageMapLine.addRow(page_id, redirectName, id, SQL_NULL, SQL_NULL); + metaData.addRedirect(); + } + } + } - if (pNamePageIdMap.containsKey(destinationHashcode)) { - int id = pNamePageIdMap.get(destinationHashcode); - String redirectName = rPageIdNameMap.get(page_id); - pageRedirects.addRow(id, redirectName); - pageMapLine.addRow(page_id, redirectName, id, SQL_NULL, SQL_NULL); - metaData.addRedirect(); - } + @Override + public void writeMetaData() throws IOException + { + TxtFileWriter outputFile = new TxtFileWriter(versionFiles.getOutputMetadata()); + // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories,timestamp + outputFile.addRow(metaData.getId(), metaData.getLanguage(), + metaData.getDisambiguationCategory(), metaData.getMainCategory(), + metaData.getNrOfPages(), metaData.getNrOfRedirects(), + metaData.getNrOfDisambiguations(), metaData.getNrOfCategories(), + TimestampUtil.toMediaWikiString(metaData.getTimestamp())); + outputFile.flush(); + outputFile.close(); } - } - - @Override - public void writeMetaData() throws IOException { - TxtFileWriter outputFile = new TxtFileWriter(versionFiles.getOutputMetadata()); - // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories,timestamp - outputFile.addRow(metaData.getId(), metaData.getLanguage(), metaData.getDisambiguationCategory(), - metaData.getMainCategory(), metaData.getNrOfPages(), metaData.getNrOfRedirects(), - metaData.getNrOfDisambiguations(), metaData.getNrOfCategories(), - TimestampUtil.toMediaWikiString(metaData.getTimestamp())); - outputFile.flush(); - outputFile.close(); - } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKGeneric.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKGeneric.java index ebbef69b..6d464a61 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKGeneric.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKGeneric.java @@ -38,336 +38,366 @@ import org.dkpro.jwpl.wikimachine.util.TxtFileWriter; /** - * Please be sure, that {@link HashAlgorithm#hashCode(String)} of the provided HashAlgorithm type returns the - * same type as KeyType + * Please be sure, that {@link HashAlgorithm#hashCode(String)} of the provided HashAlgorithm type + * returns the same type as KeyType * - * @param the type of the HashMap's key - * @param hashing algorithm, returning KeyType
+ * @param <KeyType> + * the type of the HashMap's key + * @param <HashAlgorithm> + * hashing algorithm, returning KeyType
*/ -public class DumpVersionJDKGeneric extends AbstractDumpVersion { - - private static final String SQL_NULL = "NULL"; - - /** - * maps page id's to Revision objects - */ - private HashMap pageIdRevMap; - /** - * after revision parsing the map will be erased and the keys sorted in the - * array list - */ - private Set pageIdRevList; - - /** - * caches the page id's of disambiguation pages. - */ - private Set disambiguations; - /** - * maps text id's to the page id's. - */ - private Map textIdPageIdMap; - /** - * maps page id's of pages to their names - */ - private Map pPageIdNameMap; - /** - * maps names of pages to their page id's. - */ - private Map pNamePageIdMap; - - /** - * maps names of categories to their page id's. - */ - private Map cNamePageIdMap; - - /** - * maps page id's of redirects to their names. - */ - private Map rPageIdNameMap; - - private final IStringHashCode hashAlgorithm; - - @SuppressWarnings("unchecked") - public DumpVersionJDKGeneric(Class hashAlgorithmClass) - throws InstantiationException, IllegalAccessException, NoSuchMethodException, InvocationTargetException { - - hashAlgorithm = hashAlgorithmClass.getDeclaredConstructor().newInstance(); - @SuppressWarnings("unused") - KeyType hashAlgorithmResult = (KeyType) hashAlgorithm.hashCode("test"); - } - - @Override - public void freeAfterCategoryLinksParsing() { - logger.log("clearing cNamePageIdMap of " + cNamePageIdMap.size() + " objects"); - cNamePageIdMap.clear(); - } - - @Override - public void freeAfterPageLinksParsing() { - // nothing to free - } - - @Override - public void freeAfterPageParsing() { - logger.log("clearing pageIdRevSet of " + pageIdRevList.size() + " objects"); - pageIdRevList.clear(); - } - - @Override - public void freeAfterRevisonParsing() { - pageIdRevList = new HashSet<>(pageIdRevMap.keySet().size()); - pageIdRevList.addAll(pageIdRevMap.keySet()); - pageIdRevMap.clear(); - } - - @Override - public void freeAfterTextParsing() { - pageIdRevMap.clear(); - pageIdRevList.clear(); - disambiguations.clear(); - textIdPageIdMap.clear(); - pPageIdNameMap.clear(); - pNamePageIdMap.clear(); - cNamePageIdMap.clear(); - rPageIdNameMap.clear(); - } - - @Override - public void initialize(Timestamp timestamp) { - this.timestamp = Revision.compressTime(timestamp.getTime()); - - /* - * filled in revisions +public class DumpVersionJDKGeneric + extends AbstractDumpVersion +{ + + private static final String SQL_NULL = "NULL"; + + /** + * maps page id's to Revision objects + */ + private HashMap pageIdRevMap; + /** + * after revision parsing the map will be erased and the keys sorted in the array list */ - pageIdRevMap = new HashMap<>(); - textIdPageIdMap = new HashMap<>(); + private Set pageIdRevList; - /* - * filled in pages + /** + * caches the page id's of disambiguation pages. + */ + private Set disambiguations; + /** + * maps text id's to the page id's. + */ + private Map textIdPageIdMap; + /** + * maps page id's of pages to their names + */ + private Map pPageIdNameMap; + /** + * maps names of pages to their page id's. */ - pPageIdNameMap = new HashMap<>(); - pNamePageIdMap = new HashMap<>(); + private Map pNamePageIdMap; - cNamePageIdMap = new HashMap<>(); - rPageIdNameMap = new HashMap<>(); + /** + * maps names of categories to their page id's. + */ + private Map cNamePageIdMap; - /* - * filled in categories + /** + * maps page id's of redirects to their names. 
*/ - disambiguations = new HashSet<>(); - } - - @SuppressWarnings("unchecked") - @Override - public void processCategoryLinksRow(CategorylinksParser clParser) - throws IOException { - String cl_to_text = clParser.getClTo(); - if (cl_to_text != null) { - KeyType cl_to_textHashcode = (KeyType) hashAlgorithm.hashCode(cl_to_text); - // if category exists - - Integer cl_to = cNamePageIdMap.get(cl_to_textHashcode); - if (cl_to != null) { - // if the link source is a page then write the link in - // category_pages and page_categories - int cl_from = clParser.getClFrom(); - // if exists page - if (pPageIdNameMap.containsKey(cl_from)) { - processCategoryLinksRowPageExists(cl_from, cl_to, cl_to_text); - } else { - processCateforyLinksRowPageMiss(cl_from, cl_to); - } - } + private Map rPageIdNameMap; + + private final IStringHashCode hashAlgorithm; + + @SuppressWarnings("unchecked") + public DumpVersionJDKGeneric(Class hashAlgorithmClass) + throws InstantiationException, IllegalAccessException, NoSuchMethodException, + InvocationTargetException + { + + hashAlgorithm = hashAlgorithmClass.getDeclaredConstructor().newInstance(); + @SuppressWarnings("unused") + KeyType hashAlgorithmResult = (KeyType) hashAlgorithm.hashCode("test"); + } + + @Override + public void freeAfterCategoryLinksParsing() + { + logger.log("clearing cNamePageIdMap of " + cNamePageIdMap.size() + " objects"); + cNamePageIdMap.clear(); } - } - - private void processCategoryLinksRowPageExists(Integer cl_from, Integer cl_to, String cl_to_text) throws IOException { - categoryPages.addRow(cl_to, cl_from); - pageCategories.addRow(cl_from, cl_to); - if (cl_to_text.equals(metaData.getDisambiguationCategory())) { - disambiguations.add(cl_from); - metaData.addDisamb(); + + @Override + public void freeAfterPageLinksParsing() + { + // nothing to free } - } - private void processCateforyLinksRowPageMiss(Integer cl_from, Integer cl_to) throws IOException { - // if category page id exists - if (cNamePageIdMap.containsValue(cl_from)) { - categoryOutlinks.addRow(cl_to, cl_from); - categoryInlinks.addRow(cl_from, cl_to); + @Override + public void freeAfterPageParsing() + { + logger.log("clearing pageIdRevSet of " + pageIdRevList.size() + " objects"); + pageIdRevList.clear(); } - } - - @SuppressWarnings("unchecked") - @Override - public void processPageLinksRow(PagelinksParser plParser) throws IOException { - int pl_from = plParser.getPlFrom(); - String pl_to = plParser.getPlTo(); - if (pl_to != null) { - KeyType pl_toHashcode = (KeyType) hashAlgorithm.hashCode(pl_to); - - // if page name and page page id exists - Integer id = pNamePageIdMap.get(pl_toHashcode); - if (id != null - && (!skipPage || pPageIdNameMap.containsKey(pl_from))) { - pageOutlinks.addRow(pl_from, id); - pageInlinks.addRow(id, pl_from); - } + + @Override + public void freeAfterRevisonParsing() + { + pageIdRevList = new HashSet<>(pageIdRevMap.keySet().size()); + pageIdRevList.addAll(pageIdRevMap.keySet()); + pageIdRevMap.clear(); } - } - - @Override - public void processPageRow(PageParser pageParser) throws IOException { - switch (pageParser.getPageNamespace()) { - case NS_CATEGORY: { - processPageRowCategory(pageParser); - break; - } - case NS_MAIN: { - processPageRowPage(pageParser); - break; - } + + @Override + public void freeAfterTextParsing() + { + pageIdRevMap.clear(); + pageIdRevList.clear(); + disambiguations.clear(); + textIdPageIdMap.clear(); + pPageIdNameMap.clear(); + pNamePageIdMap.clear(); + cNamePageIdMap.clear(); + rPageIdNameMap.clear(); } - } + @Override + public 
void initialize(Timestamp timestamp) + { + this.timestamp = Revision.compressTime(timestamp.getTime()); + + /* + * filled in revisions + */ + pageIdRevMap = new HashMap<>(); + textIdPageIdMap = new HashMap<>(); + + /* + * filled in pages + */ + pPageIdNameMap = new HashMap<>(); + pNamePageIdMap = new HashMap<>(); + + cNamePageIdMap = new HashMap<>(); + rPageIdNameMap = new HashMap<>(); + + /* + * filled in categories + */ + disambiguations = new HashSet<>(); + } - @SuppressWarnings("unchecked") - private void processPageRowCategory(PageParser pageParser) throws IOException { - if (!(skipCategory && pageParser.getPageIsRedirect())) { - // retrieve page id and page title - int page_id = pageParser.getPageId(); - // ignore categories, which have no revisions before the time stamp - if (pageIdRevList.contains(page_id)) { - String page_title = pageParser.getPageTitle(); - // cache the retrieved values - // record category - if (page_title != null) { - KeyType page_titleHashcode = (KeyType) hashAlgorithm - .hashCode(page_title); - cNamePageIdMap.put(page_titleHashcode, page_id); - // write a new row in the table Category. - // Note that we also consider the page_id as id - txtFW.addRow(page_id, page_id, page_title); - metaData.addCategory(); + @SuppressWarnings("unchecked") + @Override + public void processCategoryLinksRow(CategorylinksParser clParser) throws IOException + { + String cl_to_text = clParser.getClTo(); + if (cl_to_text != null) { + KeyType cl_to_textHashcode = (KeyType) hashAlgorithm.hashCode(cl_to_text); + // if category exists + + Integer cl_to = cNamePageIdMap.get(cl_to_textHashcode); + if (cl_to != null) { + // if the link source is a page then write the link in + // category_pages and page_categories + int cl_from = clParser.getClFrom(); + // if exists page + if (pPageIdNameMap.containsKey(cl_from)) { + processCategoryLinksRowPageExists(cl_from, cl_to, cl_to_text); + } + else { + processCateforyLinksRowPageMiss(cl_from, cl_to); + } + } + } + } + + private void processCategoryLinksRowPageExists(Integer cl_from, Integer cl_to, + String cl_to_text) + throws IOException + { + categoryPages.addRow(cl_to, cl_from); + pageCategories.addRow(cl_from, cl_to); + if (cl_to_text.equals(metaData.getDisambiguationCategory())) { + disambiguations.add(cl_from); + metaData.addDisamb(); + } + } + + private void processCateforyLinksRowPageMiss(Integer cl_from, Integer cl_to) throws IOException + { + // if category page id exists + if (cNamePageIdMap.containsValue(cl_from)) { + categoryOutlinks.addRow(cl_to, cl_from); + categoryInlinks.addRow(cl_from, cl_to); } - } } - } - - @SuppressWarnings("unchecked") - private void processPageRowPage(PageParser pageParser) throws IOException { - // retrieve page id and title - int page_id = pageParser.getPageId(); - // ignore pages, which have no revisions prior to the time stamp - String page_title = pageParser.getPageTitle(); - if (page_title != null && pageIdRevList.contains(page_id)) { - // distinguish redirects - if (pageParser.getPageIsRedirect()) { - // record redirect - rPageIdNameMap.put(page_id, page_title); - } else { - // record page - KeyType page_titleHashcode = (KeyType) hashAlgorithm - .hashCode(page_title); - pPageIdNameMap.put(page_id, page_title); - pNamePageIdMap.put(page_titleHashcode, page_id); - } + + @SuppressWarnings("unchecked") + @Override + public void processPageLinksRow(PagelinksParser plParser) throws IOException + { + int pl_from = plParser.getPlFrom(); + String pl_to = plParser.getPlTo(); + if (pl_to != null) { + KeyType 
pl_toHashcode = (KeyType) hashAlgorithm.hashCode(pl_to); + + // if page name and page page id exists + Integer id = pNamePageIdMap.get(pl_toHashcode); + if (id != null && (!skipPage || pPageIdNameMap.containsKey(pl_from))) { + pageOutlinks.addRow(pl_from, id); + pageInlinks.addRow(id, pl_from); + } + } + } + + @Override + public void processPageRow(PageParser pageParser) throws IOException + { + switch (pageParser.getPageNamespace()) { + case NS_CATEGORY: { + processPageRowCategory(pageParser); + break; + } + case NS_MAIN: { + processPageRowPage(pageParser); + break; + } + } + + } + + @SuppressWarnings("unchecked") + private void processPageRowCategory(PageParser pageParser) throws IOException + { + if (!(skipCategory && pageParser.getPageIsRedirect())) { + // retrieve page id and page title + int page_id = pageParser.getPageId(); + // ignore categories, which have no revisions before the time stamp + if (pageIdRevList.contains(page_id)) { + String page_title = pageParser.getPageTitle(); + // cache the retrieved values + // record category + if (page_title != null) { + KeyType page_titleHashcode = (KeyType) hashAlgorithm.hashCode(page_title); + cNamePageIdMap.put(page_titleHashcode, page_id); + // write a new row in the table Category. + // Note that we also consider the page_id as id + txtFW.addRow(page_id, page_id, page_title); + metaData.addCategory(); + } + } + } } - } - - @Override - public void processRevisionRow(RevisionParser revisionParser) { - // get the time stamp of the revision - int rev_timestamp = revisionParser.getRevTimestamp(); - if (rev_timestamp < timestamp) { - // get the rev_page (corresponds to page_id in the table page) - int rev_page = revisionParser.getRevPage(); - if (pageIdRevMap.containsKey(rev_page)) { - processRevisionRowContainsKey(revisionParser, rev_page, - rev_timestamp); - } else { - processRevisionRowMissKey(revisionParser, rev_page, - rev_timestamp); - } + + @SuppressWarnings("unchecked") + private void processPageRowPage(PageParser pageParser) throws IOException + { + // retrieve page id and title + int page_id = pageParser.getPageId(); + // ignore pages, which have no revisions prior to the time stamp + String page_title = pageParser.getPageTitle(); + if (page_title != null && pageIdRevList.contains(page_id)) { + // distinguish redirects + if (pageParser.getPageIsRedirect()) { + // record redirect + rPageIdNameMap.put(page_id, page_title); + } + else { + // record page + KeyType page_titleHashcode = (KeyType) hashAlgorithm.hashCode(page_title); + pPageIdNameMap.put(page_id, page_title); + pNamePageIdMap.put(page_titleHashcode, page_id); + } + } } - } + @Override + public void processRevisionRow(RevisionParser revisionParser) + { + // get the time stamp of the revision + int rev_timestamp = revisionParser.getRevTimestamp(); + if (rev_timestamp < timestamp) { + // get the rev_page (corresponds to page_id in the table page) + int rev_page = revisionParser.getRevPage(); + if (pageIdRevMap.containsKey(rev_page)) { + processRevisionRowContainsKey(revisionParser, rev_page, rev_timestamp); + } + else { + processRevisionRowMissKey(revisionParser, rev_page, rev_timestamp); + } + } + + } - private void processRevisionRowContainsKey(RevisionParser revisionParser, int rev_page, int rev_timestamp) { + private void processRevisionRowContainsKey(RevisionParser revisionParser, int rev_page, + int rev_timestamp) + { - long revisionRecord = pageIdRevMap.get(rev_page); - int old_timestamp = Revision.getTimestamp(revisionRecord); + long revisionRecord = 
pageIdRevMap.get(rev_page); + int old_timestamp = Revision.getTimestamp(revisionRecord); - // is it a better time stamp ? - if (rev_timestamp > old_timestamp) { - int old_text_id = Revision.getTextId(revisionRecord); - pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser - .getRevTextId(), rev_timestamp)); - textIdPageIdMap.remove(old_text_id); - textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); + // is it a better time stamp ? + if (rev_timestamp > old_timestamp) { + int old_text_id = Revision.getTextId(revisionRecord); + pageIdRevMap.put(rev_page, + Revision.createRevision(revisionParser.getRevTextId(), rev_timestamp)); + textIdPageIdMap.remove(old_text_id); + textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); + } } - } - - private void processRevisionRowMissKey(RevisionParser revisionParser, int rev_page, int rev_timestamp) { - // this is the first recorded time stamp for that page id - pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser.getRevTextId(), rev_timestamp)); - textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); - } - - @Override - public void processTextRow(TextParser textParser) throws IOException { - int text_id = textParser.getOldId(); - - if (textIdPageIdMap.containsKey(text_id)) { - int page_id = textIdPageIdMap.get(text_id); - // if exists page page id -> page - if (pPageIdNameMap.containsKey(page_id)) { - processTextRowPage(textParser, page_id); - } else if (rPageIdNameMap.containsKey(page_id)) { - // if exists redirect -> redirect - processTextRowRedirect(textParser, page_id); - } + + private void processRevisionRowMissKey(RevisionParser revisionParser, int rev_page, + int rev_timestamp) + { + // this is the first recorded time stamp for that page id + pageIdRevMap.put(rev_page, + Revision.createRevision(revisionParser.getRevTextId(), rev_timestamp)); + textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); } - } + @Override + public void processTextRow(TextParser textParser) throws IOException + { + int text_id = textParser.getOldId(); + + if (textIdPageIdMap.containsKey(text_id)) { + int page_id = textIdPageIdMap.get(text_id); + // if exists page page id -> page + if (pPageIdNameMap.containsKey(page_id)) { + processTextRowPage(textParser, page_id); + } + else if (rPageIdNameMap.containsKey(page_id)) { + // if exists redirect -> redirect + processTextRowRedirect(textParser, page_id); + } + } - private void processTextRowPage(TextParser textParser, int page_id) throws IOException { - // get page name - String pageName = pPageIdNameMap.get(page_id); + } - page.addRow(page_id, page_id, pageName, textParser.getOldText(), formatBoolean(disambiguations.contains(page_id))); - pageMapLine.addRow(page_id, pageName, page_id, SQL_NULL, SQL_NULL); - metaData.addPage(); - } + private void processTextRowPage(TextParser textParser, int page_id) throws IOException + { + // get page name + String pageName = pPageIdNameMap.get(page_id); - @SuppressWarnings("unchecked") - private void processTextRowRedirect(TextParser textParser, int page_id) throws IOException { - String destination = Redirects.getRedirectDestination(textParser.getOldText()); + page.addRow(page_id, page_id, pageName, textParser.getOldText(), + formatBoolean(disambiguations.contains(page_id))); + pageMapLine.addRow(page_id, pageName, page_id, SQL_NULL, SQL_NULL); + metaData.addPage(); + } - if (destination != null) { - // if page name exists + @SuppressWarnings("unchecked") + private void processTextRowRedirect(TextParser textParser, int page_id) 
throws IOException + { + String destination = Redirects.getRedirectDestination(textParser.getOldText()); + + if (destination != null) { + // if page name exists + + KeyType destinationHashcode = (KeyType) hashAlgorithm.hashCode(destination); + Integer id = pNamePageIdMap.get(destinationHashcode); + if (id != null) { + String redirectName = rPageIdNameMap.get(page_id); + pageRedirects.addRow(id, redirectName); + pageMapLine.addRow(page_id, redirectName, id, SQL_NULL, SQL_NULL); + metaData.addRedirect(); + } + } + } - KeyType destinationHashcode = (KeyType) hashAlgorithm.hashCode(destination); - Integer id = pNamePageIdMap.get(destinationHashcode); - if (id != null) { - String redirectName = rPageIdNameMap.get(page_id); - pageRedirects.addRow(id, redirectName); - pageMapLine.addRow(page_id, redirectName, id, SQL_NULL, SQL_NULL); - metaData.addRedirect(); - } + @Override + public void writeMetaData() throws IOException + { + TxtFileWriter outputFile = new TxtFileWriter(versionFiles.getOutputMetadata()); + // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories,timestamp + outputFile.addRow(metaData.getId(), metaData.getLanguage(), + metaData.getDisambiguationCategory(), metaData.getMainCategory(), + metaData.getNrOfPages(), metaData.getNrOfRedirects(), + metaData.getNrOfDisambiguations(), metaData.getNrOfCategories(), + TimestampUtil.toMediaWikiString(metaData.getTimestamp())); + outputFile.flush(); + outputFile.close(); } - } - - @Override - public void writeMetaData() throws IOException { - TxtFileWriter outputFile = new TxtFileWriter(versionFiles.getOutputMetadata()); - // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories,timestamp - outputFile.addRow(metaData.getId(), metaData.getLanguage(), metaData.getDisambiguationCategory(), - metaData.getMainCategory(), metaData.getNrOfPages(), metaData.getNrOfRedirects(), - metaData.getNrOfDisambiguations(), metaData.getNrOfCategories(), - TimestampUtil.toMediaWikiString(metaData.getTimestamp())); - outputFile.flush(); - outputFile.close(); - } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKIntKeyFactory.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKIntKeyFactory.java index 990b52a3..6681f52c 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKIntKeyFactory.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKIntKeyFactory.java @@ -21,16 +21,21 @@ import org.dkpro.jwpl.wikimachine.dump.version.IDumpVersionFactory; import org.dkpro.jwpl.wikimachine.hashing.StringHashCodeJDK; -public class DumpVersionJDKIntKeyFactory implements IDumpVersionFactory { +public class DumpVersionJDKIntKeyFactory + implements IDumpVersionFactory +{ - @Override - public IDumpVersion getDumpVersion() { - IDumpVersion dumpVersion; - try { - dumpVersion = new DumpVersionJDKGeneric(StringHashCodeJDK.class); - } catch (Exception e) { - dumpVersion = null; + @Override + public IDumpVersion getDumpVersion() + { + IDumpVersion dumpVersion; + try { + dumpVersion = new DumpVersionJDKGeneric( + StringHashCodeJDK.class); + } + catch (Exception e) { + dumpVersion = null; + } + return dumpVersion; } - return dumpVersion; - } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKLongKeyFactory.java 
b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKLongKeyFactory.java index cd636aa7..6a5f34fb 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKLongKeyFactory.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKLongKeyFactory.java @@ -21,17 +21,22 @@ import org.dkpro.jwpl.wikimachine.dump.version.IDumpVersionFactory; import org.dkpro.jwpl.wikimachine.hashing.StringHashCodeJBoss; -public class DumpVersionJDKLongKeyFactory implements IDumpVersionFactory { +public class DumpVersionJDKLongKeyFactory + implements IDumpVersionFactory +{ - @Override - public IDumpVersion getDumpVersion() { - IDumpVersion dumpVersion; - try { - dumpVersion = new DumpVersionJDKGeneric(StringHashCodeJBoss.class); - } catch (Exception e) { - dumpVersion = null; + @Override + public IDumpVersion getDumpVersion() + { + IDumpVersion dumpVersion; + try { + dumpVersion = new DumpVersionJDKGeneric( + StringHashCodeJBoss.class); + } + catch (Exception e) { + dumpVersion = null; + } + return dumpVersion; } - return dumpVersion; - } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKStringKeyFactory.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKStringKeyFactory.java index ac8ac04d..f529412d 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKStringKeyFactory.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/DumpVersionJDKStringKeyFactory.java @@ -21,17 +21,22 @@ import org.dkpro.jwpl.wikimachine.dump.version.IDumpVersionFactory; import org.dkpro.jwpl.wikimachine.hashing.StringHashCodeDisabled; -public class DumpVersionJDKStringKeyFactory implements IDumpVersionFactory { +public class DumpVersionJDKStringKeyFactory + implements IDumpVersionFactory +{ - @Override - public IDumpVersion getDumpVersion() { - IDumpVersion dumpVersion; - try { - dumpVersion = new DumpVersionJDKGeneric(StringHashCodeDisabled.class); - } catch (Exception e) { - dumpVersion = null; + @Override + public IDumpVersion getDumpVersion() + { + IDumpVersion dumpVersion; + try { + dumpVersion = new DumpVersionJDKGeneric( + StringHashCodeDisabled.class); + } + catch (Exception e) { + dumpVersion = null; + } + return dumpVersion; } - return dumpVersion; - } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/OriginalDumpVersion.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/OriginalDumpVersion.java index 4d1c34ac..148689c3 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/OriginalDumpVersion.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/version/OriginalDumpVersion.java @@ -42,468 +42,523 @@ /** * This class holds the data for a specific dump version. */ -public class OriginalDumpVersion implements IDumpVersion { - - private Timestamp timestamp; - private MetaData metaData; - - // XXX ivan.galkin - @SuppressWarnings("unused") - private String outputPath; - // XXX ivan.galkin - // private Map pageIdRevMap; // maps page id's to - // Revision - // objects - private final Set disambiguations; // caches the page id's of - // disambiguation pages. - private final Map textIdPageIdMap;// maps text id's to the page - // id's. 
- private final Map pPageIdNameMap;// maps page id's of pages to - // their names - private final Map cPageIdNameMap;// maps page id's of categories - // to their names - private final Map pNamePageIdMap;// maps names of pages to their - // page id's. - private final Map cNamePageIdMap;// maps names of categories to - // their page id's. - private final Map rPageIdNameMap;// maps page id's of redirects - - // to their names. - - // XXX ivan.galkin - private Files versionFiles; - private final Map pageIdRevMap; - private boolean skipCategory = true; - private boolean skipPage = true; - - /** - * Creates a new DumpVersion that corresponds to the specified time stamp. - * - * @param timestamp - */ - public OriginalDumpVersion(Timestamp timestamp) { +public class OriginalDumpVersion + implements IDumpVersion +{ + + private Timestamp timestamp; + private MetaData metaData; + // XXX ivan.galkin - // this.timestamp = timestamp; - // pageIdRevMap = new HashMap(); - pageIdRevMap = new HashMap<>(); - disambiguations = new HashSet<>(); - textIdPageIdMap = new HashMap<>(); - pPageIdNameMap = new HashMap<>(); - cPageIdNameMap = new HashMap<>(); - pNamePageIdMap = new HashMap<>(); - cNamePageIdMap = new HashMap<>(); - rPageIdNameMap = new HashMap<>(); - - } - - @Override - public void setMetaData(MetaData metaData) { - this.metaData = metaData; - } - - public void setOutputPath(String outputPath) throws IOException { - this.outputPath = outputPath; - File directory = new File(outputPath); - directory.mkdir(); - } - - @Override - public void processRevisionRow(RevisionParser revisionParser) { - int rev_page; - Timestamp rev_timestamp; - Timestamp old_timestamp; - int old_text_id; - // get the rev_page (corresponds to page_id in the table page) - rev_page = revisionParser.getRevPage(); - // get the timestamp of the revision + @SuppressWarnings("unused") + private String outputPath; + // XXX ivan.galkin + // private Map pageIdRevMap; // maps page id's to + // Revision + // objects + private final Set disambiguations; // caches the page id's of + // disambiguation pages. + private final Map textIdPageIdMap;// maps text id's to the page + // id's. + private final Map pPageIdNameMap;// maps page id's of pages to + // their names + private final Map cPageIdNameMap;// maps page id's of categories + // to their names + private final Map pNamePageIdMap;// maps names of pages to their + // page id's. + private final Map cNamePageIdMap;// maps names of categories to + // their page id's. + private final Map rPageIdNameMap;// maps page id's of redirects + + // to their names. // XXX ivan.galkin - rev_timestamp = new Timestamp(Revision.extractTime(revisionParser.getRevTimestamp())); - - if (rev_timestamp.before(timestamp)) { - - if (pageIdRevMap.containsKey(rev_page)) { - // XXX ivan.galkin go back to the time stamp classes - old_timestamp = new Timestamp(Revision.extractTime(Revision.getTimestamp(pageIdRevMap.get(rev_page)))); - old_text_id = Revision.getTextId(pageIdRevMap.get(rev_page)); - // is it a better time stamp ? 
- if (rev_timestamp.after(old_timestamp)) { - pageIdRevMap.remove(rev_page); - pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser.getRevTextId(), Revision - .compressTime(rev_timestamp.getTime()))); - textIdPageIdMap.remove(old_text_id); - textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); + private Files versionFiles; + private final Map pageIdRevMap; + private boolean skipCategory = true; + private boolean skipPage = true; + + /** + * Creates a new DumpVersion that corresponds to the specified time stamp. + * + * @param timestamp + */ + public OriginalDumpVersion(Timestamp timestamp) + { + // XXX ivan.galkin + // this.timestamp = timestamp; + // pageIdRevMap = new HashMap(); + pageIdRevMap = new HashMap<>(); + disambiguations = new HashSet<>(); + textIdPageIdMap = new HashMap<>(); + pPageIdNameMap = new HashMap<>(); + cPageIdNameMap = new HashMap<>(); + pNamePageIdMap = new HashMap<>(); + cNamePageIdMap = new HashMap<>(); + rPageIdNameMap = new HashMap<>(); + + } + + @Override + public void setMetaData(MetaData metaData) + { + this.metaData = metaData; + } + + public void setOutputPath(String outputPath) throws IOException + { + this.outputPath = outputPath; + File directory = new File(outputPath); + directory.mkdir(); + } + + @Override + public void processRevisionRow(RevisionParser revisionParser) + { + int rev_page; + Timestamp rev_timestamp; + Timestamp old_timestamp; + int old_text_id; + // get the rev_page (corresponds to page_id in the table page) + rev_page = revisionParser.getRevPage(); + // get the timestamp of the revision + + // XXX ivan.galkin + rev_timestamp = new Timestamp(Revision.extractTime(revisionParser.getRevTimestamp())); + + if (rev_timestamp.before(timestamp)) { + + if (pageIdRevMap.containsKey(rev_page)) { + // XXX ivan.galkin go back to the time stamp classes + old_timestamp = new Timestamp( + Revision.extractTime(Revision.getTimestamp(pageIdRevMap.get(rev_page)))); + old_text_id = Revision.getTextId(pageIdRevMap.get(rev_page)); + // is it a better time stamp ? 
+ if (rev_timestamp.after(old_timestamp)) { + pageIdRevMap.remove(rev_page); + pageIdRevMap.put(rev_page, + Revision.createRevision(revisionParser.getRevTextId(), + Revision.compressTime(rev_timestamp.getTime()))); + textIdPageIdMap.remove(old_text_id); + textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); + } + } + else { + // this is the first recorded time stamp for that page id + pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser.getRevTextId(), + Revision.compressTime(rev_timestamp.getTime()))); + textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); + } + + } + } + + TxtFileWriter txtFW = null; + + @Override + public void initPageParsing() throws IOException + { + // XXX ivan.galkin + // txtFW = new TxtFileWriter(outputPath + "/Category.txt"); + txtFW = new TxtFileWriter(versionFiles.getOutputCategory()); + } + + @Override + public void processPageRow(PageParser pageParser) throws IOException + { + + int page_id; + int page_namespace; + String page_title; + page_namespace = pageParser.getPageNamespace(); + // handle categories + if (page_namespace == 14) { + if (skipCategory && pageParser.getPageIsRedirect()) + // skip categories that are redirects + return; + // retrieve page id and page title + page_id = pageParser.getPageId(); + // ignore categories, which have no revisions before the timestamp + if (!pageIdRevMap.containsKey(page_id)) + return; + + page_title = pageParser.getPageTitle(); + + // cache the retrieved values + recordCategory(page_id, page_title); + // write a new row in the table Category. + // Note that we also consider the page_id as id + txtFW.addRow(page_id, page_id, page_title); + metaData.addCategory(); + return; + } + // handle pages + if (page_namespace == 0) { + // retrieve page id and title + page_id = pageParser.getPageId(); + page_title = pageParser.getPageTitle(); + // ignore pages, which habe no revisions prior to the timestamp + if (!pageIdRevMap.containsKey(page_id)) + return; + // distinguish redirects + if (pageParser.getPageIsRedirect()) { + recordRedirect(page_id, page_title); + } + else { + recordPage(page_id, page_title); + } } - } else { - // this is the first recorded time stamp for that page id - pageIdRevMap.put(rev_page, Revision.createRevision(revisionParser.getRevTextId(), Revision - .compressTime(rev_timestamp.getTime()))); - textIdPageIdMap.put(revisionParser.getRevTextId(), rev_page); - } } - } - TxtFileWriter txtFW = null; + @Override + public void exportAfterPageParsing() throws IOException + { + txtFW.export(); + } - @Override - public void initPageParsing() throws IOException { - // XXX ivan.galkin - // txtFW = new TxtFileWriter(outputPath + "/Category.txt"); - txtFW = new TxtFileWriter(versionFiles.getOutputCategory()); - } - - @Override - public void processPageRow(PageParser pageParser) throws IOException { - - int page_id; - int page_namespace; - String page_title; - page_namespace = pageParser.getPageNamespace(); - // handle categories - if (page_namespace == 14) { - if (skipCategory && pageParser.getPageIsRedirect()) - // skip categories that are redirects - return; - // retrieve page id and page title - page_id = pageParser.getPageId(); - // ignore categories, which have no revisions before the timestamp - if (!pageIdRevMap.containsKey(page_id)) - return; - - page_title = pageParser.getPageTitle(); - - // cache the retrieved values - recordCategory(page_id, page_title); - // write a new row in the table Category. 
- // Note that we also consider the page_id as id - txtFW.addRow(page_id, page_id, page_title); - metaData.addCategory(); - return; - } - // handle pages - if (page_namespace == 0) { - // retrieve page id and title - page_id = pageParser.getPageId(); - page_title = pageParser.getPageTitle(); - // ignore pages, which habe no revisions prior to the timestamp - if (!pageIdRevMap.containsKey(page_id)) - return; - // distinguish redirects - if (pageParser.getPageIsRedirect()) { - recordRedirect(page_id, page_title); - } else { - recordPage(page_id, page_title); - } - } - - } - - @Override - public void exportAfterPageParsing() throws IOException { - txtFW.export(); - } - - private TxtFileWriter pageCategories = null; - private TxtFileWriter categoryPages = null; - private TxtFileWriter categoryInlinks = null; - private TxtFileWriter categoryOutlinks = null; - - @Override - public void initCategoryLinksParsing() throws IOException { - // XXX ivan.galkin - // pageCategories = new TxtFileWriter(outputPath + File.separator - // + "page_categories.txt"); - // categoryPages = new TxtFileWriter(outputPath + File.separator - // + "category_pages.txt"); - // categoryInlinks = new TxtFileWriter(outputPath + File.separator - // + "category_inlinks.txt"); - // categoryOutlinks = new TxtFileWriter(outputPath + File.separator - // + "category_outlinks.txt"); - - pageCategories = new TxtFileWriter(versionFiles.getOutputPageCategories()); - categoryPages = new TxtFileWriter(versionFiles.getOutputCategoryPages()); - categoryInlinks = new TxtFileWriter(versionFiles.getOutputCategoryInlinks()); - categoryOutlinks = new TxtFileWriter(versionFiles.getOutputCategoryOutlinks()); - - } - - @Override - public void processCategoryLinksRow(CategorylinksParser clParser) throws IOException { - int cl_from; - String cl_to; - - cl_from = clParser.getClFrom(); - cl_to = clParser.getClTo(); - if (!existsCategory(cl_to)) {// discard links with non registred targets - return; - } - // if the link source is a page then write the link in category_pages - // and - // page_categories - if (existsPage(cl_from)) { - - categoryPages.addRow(getCategoryPageId(cl_to), cl_from); - pageCategories.addRow(cl_from, getCategoryPageId(cl_to)); - if (cl_to.equals(metaData.getDisambiguationCategory())) { - disambiguations.add(cl_from); - metaData.addDisamb(); - } - } else { - // if the link source is a category than write the link in - // category_inlinks and category_outlinks - if (existsCategoryPageId(cl_from)) { - categoryOutlinks.addRow(getCategoryPageId(cl_to), cl_from); - categoryInlinks.addRow(cl_from, getCategoryPageId(cl_to)); - } - } - } - - @Override - public void exportAfterCategoryLinksParsing() throws IOException { - // Export the written tables - pageCategories.export(); - categoryPages.export(); - categoryInlinks.export(); - categoryOutlinks.export(); - } - - private TxtFileWriter pageInlinks = null; - private TxtFileWriter pageOutlinks = null; - - @Override - public void initPageLinksParsing() throws IOException { - // XXX ivan.galkin - // pageInlinks = new TxtFileWriter(outputPath + File.separator - // + "page_inlinks.txt"); - // pageOutlinks = new TxtFileWriter(outputPath + File.separator - // + "page_outlinks.txt"); - pageInlinks = new TxtFileWriter(versionFiles.getOutputPageInlinks()); - pageOutlinks = new TxtFileWriter(versionFiles.getOutputPageOutlinks()); - } - - @Override - public void processPageLinksRow(PagelinksParser plParser) throws IOException { - int pl_from; - String pl_to; - pl_from = plParser.getPlFrom(); - pl_to 
= plParser.getPlTo(); - // skip redirects or page with other namespace than 0 - if (skipPage && !existsPagePageId(pl_from) || !existsPageName(pl_to)) { - return; - } - pageOutlinks.addRow(pl_from, getPagePageId(pl_to)); - pageInlinks.addRow(getPagePageId(pl_to), pl_from); - } - - public void exportAfterPageLinksProcessing() throws IOException { - // export the written tables - pageInlinks.export(); - pageOutlinks.export(); - } - - private TxtFileWriter page = null; - private TxtFileWriter pageMapLine = null; - private TxtFileWriter pageRedirects = null; - - @Override - public void initTextParsing() throws IOException { - // XXX ivan.galkin - // page = new TxtFileWriter(outputPath + File.separator + "Page.txt"); - // pageMapLine = new TxtFileWriter(outputPath + File.separator - // + "PageMapLine.txt"); - // pageRedirects = new TxtFileWriter(outputPath + File.separator - // + "page_redirects.txt"); - page = new TxtFileWriter(versionFiles.getOutputPage()); - pageMapLine = new TxtFileWriter(versionFiles.getOutputPageMapLine()); - pageRedirects = new TxtFileWriter(versionFiles.getOutputPageRedirects()); - } - - @Override - public void processTextRow(TextParser textParser) throws IOException { - String destination; - int text_id; - int page_id; - text_id = textParser.getOldId(); - if (!textIdPageIdMap.containsKey(text_id)) - return; - page_id = textIdPageIdMap.get(text_id); - if (existsPagePageId(page_id)) {// pages - page.addRow(page_id, page_id, getPageName(page_id), textParser.getOldText(), - formatBoolean(disambiguations.contains(page_id))); - pageMapLine.addRow(page_id, getPageName(page_id), page_id, "NULL", "NULL"); - metaData.addPage(); - return; - } - if (existsRedirect(page_id)) {// Redirects - destination = Redirects.getRedirectDestination(textParser.getOldText()); - if (!existsPageName(destination)) - return; - pageRedirects.addRow(getPagePageId(destination), getRedirectName(page_id)); - pageMapLine.addRow(page_id, getRedirectName(page_id), getPagePageId(destination), "NULL", "NULL"); - metaData.addRedirect(); - } - } - - @Override - public void exportAfterTextParsing() throws IOException { - // export the written tables - page.export(); - pageRedirects.export(); - pageMapLine.export(); - } - - @Override - public void writeMetaData() throws IOException { - // XXX ivan.galkin - // TxtFileWriter metaData_ = new TxtFileWriter(outputPath + File.separator + "MetaData.txt"); - try (TxtFileWriter metaData_ = new TxtFileWriter(versionFiles.getOutputMetadata())) { - // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories,timestamp - metaData_.addRow(metaData.getId(), metaData.getLanguage(), metaData.getDisambiguationCategory(), - metaData.getMainCategory(), metaData.getNrOfPages(), metaData.getNrOfRedirects(), - metaData.getNrOfDisambiguations(), metaData.getNrOfCategories(), - TimestampUtil.toMediaWikiString(metaData.getTimestamp())); - System.out.println("-------------------------------"); - System.out.println("Timestamp : " + timestamp.toString()); - System.out.println("nrOfCategories : " + metaData.getNrOfCategories()); - System.out.println("nrOfPages : " + metaData.getNrOfPages()); - System.out.println("nrOfRedirects : " + metaData.getNrOfRedirects()); - System.out.println("nrOfDisambiguations: " + metaData.getNrOfDisambiguations()); - metaData_.export(); - } - } - - /** - * Returns the String value of the bit 1 if the given boolean is true
- * and an empty String otherwise. This is the way bit values are written
- * in .txt dump files. - * - * @param b - * @return - */ - private String formatBoolean(boolean b) { - return b ? new String(new byte[]{1}) : ""; - } - - public void recordCategory(int page_id, String page_title) { - cPageIdNameMap.put(page_id, page_title); - cNamePageIdMap.put(page_title, page_id); - } - - public void recordPage(int page_id, String page_title) { - pPageIdNameMap.put(page_id, page_title); - pNamePageIdMap.put(page_title, page_id); - } - - public void recordRedirect(int page_id, String page_title) { - rPageIdNameMap.put(page_id, page_title); - } - - public boolean existsCategory(String name) { - return cNamePageIdMap.containsKey(name); - } - - public boolean existsPageName(String name) { - return pNamePageIdMap.containsKey(name); - } - - public boolean existsPage(int page_id) { - return pPageIdNameMap.containsKey(page_id); - } - - public boolean existsCategoryPageId(int page_id) { - return cPageIdNameMap.containsKey(page_id); - } - - public boolean existsPagePageId(int page_id) { - return pPageIdNameMap.containsKey(page_id); - } - - public int getPagePageId(String name) { - return pNamePageIdMap.get(name); - } - - public int getCategoryPageId(String name) { - return cNamePageIdMap.get(name); - } - - public String getPageName(int page_id) { - return pPageIdNameMap.get(page_id); - } - - public boolean existsRedirect(int page_id) { - return rPageIdNameMap.containsKey(page_id); - } - - public String getRedirectName(int page_id) { - return rPageIdNameMap.get(page_id); - } - - /* - * implemented methods from IDumpVersion interface - */ - - @Override - public void initialize(Timestamp timestamp) { - this.timestamp = timestamp; - } - - @Override - public void setFiles(Files versionFiles) { - this.versionFiles = versionFiles; - } - - /* - * not implemented methods - */ - - @Override - public void exportAfterPageLinksParsing() throws IOException { - } - - @Override - public void exportAfterRevisionParsing() throws IOException { - } - - @Override - public void flushByTextParsing() throws IOException { - } - - @Override - public void freeAfterCategoryLinksParsing() { - } - - @Override - public void freeAfterPageLinksParsing() { - } - - @Override - public void freeAfterPageParsing() { - } - - @Override - public void freeAfterRevisonParsing() { - } - - @Override - public void freeAfterTextParsing() { - } - - @Override - public void initRevisionParsion() { - } - - @Override - public void setLogger(ILogger logger) { - } - - @Override - public void setCategoryRedirectsSkip(boolean skipCategory) { - this.skipCategory = skipCategory; - } - - @Override - public void setPageRedirectsSkip(boolean skipPage) { - this.skipPage = skipPage; - } + private TxtFileWriter pageCategories = null; + private TxtFileWriter categoryPages = null; + private TxtFileWriter categoryInlinks = null; + private TxtFileWriter categoryOutlinks = null; + + @Override + public void initCategoryLinksParsing() throws IOException + { + // XXX ivan.galkin + // pageCategories = new TxtFileWriter(outputPath + File.separator + // + "page_categories.txt"); + // categoryPages = new TxtFileWriter(outputPath + File.separator + // + "category_pages.txt"); + // categoryInlinks = new TxtFileWriter(outputPath + File.separator + // + "category_inlinks.txt"); + // categoryOutlinks = new TxtFileWriter(outputPath + File.separator + // + "category_outlinks.txt"); + + pageCategories = new TxtFileWriter(versionFiles.getOutputPageCategories()); + categoryPages = new TxtFileWriter(versionFiles.getOutputCategoryPages()); + categoryInlinks = new 
TxtFileWriter(versionFiles.getOutputCategoryInlinks()); + categoryOutlinks = new TxtFileWriter(versionFiles.getOutputCategoryOutlinks()); + + } + + @Override + public void processCategoryLinksRow(CategorylinksParser clParser) throws IOException + { + int cl_from; + String cl_to; + + cl_from = clParser.getClFrom(); + cl_to = clParser.getClTo(); + if (!existsCategory(cl_to)) {// discard links with non registred targets + return; + } + // if the link source is a page then write the link in category_pages + // and + // page_categories + if (existsPage(cl_from)) { + + categoryPages.addRow(getCategoryPageId(cl_to), cl_from); + pageCategories.addRow(cl_from, getCategoryPageId(cl_to)); + if (cl_to.equals(metaData.getDisambiguationCategory())) { + disambiguations.add(cl_from); + metaData.addDisamb(); + } + } + else { + // if the link source is a category than write the link in + // category_inlinks and category_outlinks + if (existsCategoryPageId(cl_from)) { + categoryOutlinks.addRow(getCategoryPageId(cl_to), cl_from); + categoryInlinks.addRow(cl_from, getCategoryPageId(cl_to)); + } + } + } + + @Override + public void exportAfterCategoryLinksParsing() throws IOException + { + // Export the written tables + pageCategories.export(); + categoryPages.export(); + categoryInlinks.export(); + categoryOutlinks.export(); + } + + private TxtFileWriter pageInlinks = null; + private TxtFileWriter pageOutlinks = null; + + @Override + public void initPageLinksParsing() throws IOException + { + // XXX ivan.galkin + // pageInlinks = new TxtFileWriter(outputPath + File.separator + // + "page_inlinks.txt"); + // pageOutlinks = new TxtFileWriter(outputPath + File.separator + // + "page_outlinks.txt"); + pageInlinks = new TxtFileWriter(versionFiles.getOutputPageInlinks()); + pageOutlinks = new TxtFileWriter(versionFiles.getOutputPageOutlinks()); + } + + @Override + public void processPageLinksRow(PagelinksParser plParser) throws IOException + { + int pl_from; + String pl_to; + pl_from = plParser.getPlFrom(); + pl_to = plParser.getPlTo(); + // skip redirects or page with other namespace than 0 + if (skipPage && !existsPagePageId(pl_from) || !existsPageName(pl_to)) { + return; + } + pageOutlinks.addRow(pl_from, getPagePageId(pl_to)); + pageInlinks.addRow(getPagePageId(pl_to), pl_from); + } + + public void exportAfterPageLinksProcessing() throws IOException + { + // export the written tables + pageInlinks.export(); + pageOutlinks.export(); + } + + private TxtFileWriter page = null; + private TxtFileWriter pageMapLine = null; + private TxtFileWriter pageRedirects = null; + + @Override + public void initTextParsing() throws IOException + { + // XXX ivan.galkin + // page = new TxtFileWriter(outputPath + File.separator + "Page.txt"); + // pageMapLine = new TxtFileWriter(outputPath + File.separator + // + "PageMapLine.txt"); + // pageRedirects = new TxtFileWriter(outputPath + File.separator + // + "page_redirects.txt"); + page = new TxtFileWriter(versionFiles.getOutputPage()); + pageMapLine = new TxtFileWriter(versionFiles.getOutputPageMapLine()); + pageRedirects = new TxtFileWriter(versionFiles.getOutputPageRedirects()); + } + + @Override + public void processTextRow(TextParser textParser) throws IOException + { + String destination; + int text_id; + int page_id; + text_id = textParser.getOldId(); + if (!textIdPageIdMap.containsKey(text_id)) + return; + page_id = textIdPageIdMap.get(text_id); + if (existsPagePageId(page_id)) {// pages + page.addRow(page_id, page_id, getPageName(page_id), textParser.getOldText(), + 
formatBoolean(disambiguations.contains(page_id))); + pageMapLine.addRow(page_id, getPageName(page_id), page_id, "NULL", "NULL"); + metaData.addPage(); + return; + } + if (existsRedirect(page_id)) {// Redirects + destination = Redirects.getRedirectDestination(textParser.getOldText()); + if (!existsPageName(destination)) + return; + pageRedirects.addRow(getPagePageId(destination), getRedirectName(page_id)); + pageMapLine.addRow(page_id, getRedirectName(page_id), getPagePageId(destination), + "NULL", "NULL"); + metaData.addRedirect(); + } + } + + @Override + public void exportAfterTextParsing() throws IOException + { + // export the written tables + page.export(); + pageRedirects.export(); + pageMapLine.export(); + } + + @Override + public void writeMetaData() throws IOException + { + // XXX ivan.galkin + // TxtFileWriter metaData_ = new TxtFileWriter(outputPath + File.separator + + // "MetaData.txt"); + try (TxtFileWriter metaData_ = new TxtFileWriter(versionFiles.getOutputMetadata())) { + // ID,LANGUAGE,DISAMBIGUATION_CATEGORY,MAIN_CATEGORY,nrOfPages,nrOfRedirects,nrOfDisambiguationPages,nrOfCategories,timestamp + metaData_.addRow(metaData.getId(), metaData.getLanguage(), + metaData.getDisambiguationCategory(), metaData.getMainCategory(), + metaData.getNrOfPages(), metaData.getNrOfRedirects(), + metaData.getNrOfDisambiguations(), metaData.getNrOfCategories(), + TimestampUtil.toMediaWikiString(metaData.getTimestamp())); + System.out.println("-------------------------------"); + System.out.println("Timestamp : " + timestamp.toString()); + System.out.println("nrOfCategories : " + metaData.getNrOfCategories()); + System.out.println("nrOfPages : " + metaData.getNrOfPages()); + System.out.println("nrOfRedirects : " + metaData.getNrOfRedirects()); + System.out.println("nrOfDisambiguations: " + metaData.getNrOfDisambiguations()); + metaData_.export(); + } + } + + /** + * Returns the String value of the bit 1 if the given boolean is true
+ * and an empty String otherwise. This is the way bit values are written
+ * in .txt dump files. + * + * @param b + * @return + */ + private String formatBoolean(boolean b) + { + return b ? new String(new byte[] { 1 }) : ""; + } + + public void recordCategory(int page_id, String page_title) + { + cPageIdNameMap.put(page_id, page_title); + cNamePageIdMap.put(page_title, page_id); + } + + public void recordPage(int page_id, String page_title) + { + pPageIdNameMap.put(page_id, page_title); + pNamePageIdMap.put(page_title, page_id); + } + + public void recordRedirect(int page_id, String page_title) + { + rPageIdNameMap.put(page_id, page_title); + } + + public boolean existsCategory(String name) + { + return cNamePageIdMap.containsKey(name); + } + + public boolean existsPageName(String name) + { + return pNamePageIdMap.containsKey(name); + } + + public boolean existsPage(int page_id) + { + return pPageIdNameMap.containsKey(page_id); + } + + public boolean existsCategoryPageId(int page_id) + { + return cPageIdNameMap.containsKey(page_id); + } + + public boolean existsPagePageId(int page_id) + { + return pPageIdNameMap.containsKey(page_id); + } + + public int getPagePageId(String name) + { + return pNamePageIdMap.get(name); + } + + public int getCategoryPageId(String name) + { + return cNamePageIdMap.get(name); + } + + public String getPageName(int page_id) + { + return pPageIdNameMap.get(page_id); + } + + public boolean existsRedirect(int page_id) + { + return rPageIdNameMap.containsKey(page_id); + } + + public String getRedirectName(int page_id) + { + return rPageIdNameMap.get(page_id); + } + + /* + * implemented methods from IDumpVersion interface + */ + + @Override + public void initialize(Timestamp timestamp) + { + this.timestamp = timestamp; + } + + @Override + public void setFiles(Files versionFiles) + { + this.versionFiles = versionFiles; + } + + /* + * not implemented methods + */ + + @Override + public void exportAfterPageLinksParsing() throws IOException + { + } + + @Override + public void exportAfterRevisionParsing() throws IOException + { + } + + @Override + public void flushByTextParsing() throws IOException + { + } + + @Override + public void freeAfterCategoryLinksParsing() + { + } + + @Override + public void freeAfterPageLinksParsing() + { + } + + @Override + public void freeAfterPageParsing() + { + } + + @Override + public void freeAfterRevisonParsing() + { + } + + @Override + public void freeAfterTextParsing() + { + } + + @Override + public void initRevisionParsion() + { + } + + @Override + public void setLogger(ILogger logger) + { + } + + @Override + public void setCategoryRedirectsSkip(boolean skipCategory) + { + this.skipCategory = skipCategory; + } + + @Override + public void setPageRedirectsSkip(boolean skipPage) + { + this.skipPage = skipPage; + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageReader.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageReader.java index eaf15139..c79bb882 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageReader.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageReader.java @@ -23,36 +23,41 @@ import org.dkpro.jwpl.wikimachine.dump.xml.AbstractXmlDumpReader; /** - * This class is a specified variant of XmlDumpReader. Please see its source for more - * information about a functionality and a license.
+ * This class is a specified variant of XmlDumpReader. Please see its source for more information + * about a functionality and a license.
*/ -public class PageReader extends AbstractXmlDumpReader { +public class PageReader + extends AbstractXmlDumpReader +{ - public PageReader(InputStream inputStream, DumpWriter writer) { - super(inputStream, writer); - } + public PageReader(InputStream inputStream, DumpWriter writer) + { + super(inputStream, writer); + } - @Override - protected void setupStartElements() { - startElements.put(REVISION, REVISION); - startElements.put(CONTRIBUTOR, CONTRIBUTOR); - startElements.put(PAGE, PAGE); - startElements.put(SITEINFO, SITEINFO); - startElements.put(NAMESPACES, NAMESPACES); - startElements.put(NAMESPACE, NAMESPACE); - } + @Override + protected void setupStartElements() + { + startElements.put(REVISION, REVISION); + startElements.put(CONTRIBUTOR, CONTRIBUTOR); + startElements.put(PAGE, PAGE); + startElements.put(SITEINFO, SITEINFO); + startElements.put(NAMESPACES, NAMESPACES); + startElements.put(NAMESPACE, NAMESPACE); + } - @Override - protected void setupEndElements() { - endElements.put(REVISION, REVISION); - endElements.put(TIMESTAMP, TIMESTAMP); - endElements.put(TEXT, TEXT); - endElements.put(CONTRIBUTOR, CONTRIBUTOR); - endElements.put(ID, ID); - endElements.put(PAGE, PAGE); - endElements.put(TITLE, TITLE); - endElements.put(SITEINFO, SITEINFO); - endElements.put(NAMESPACES, NAMESPACES); - endElements.put(NAMESPACE, NAMESPACE); - } + @Override + protected void setupEndElements() + { + endElements.put(REVISION, REVISION); + endElements.put(TIMESTAMP, TIMESTAMP); + endElements.put(TEXT, TEXT); + endElements.put(CONTRIBUTOR, CONTRIBUTOR); + endElements.put(ID, ID); + endElements.put(PAGE, PAGE); + endElements.put(TITLE, TITLE); + endElements.put(SITEINFO, SITEINFO); + endElements.put(NAMESPACES, NAMESPACES); + endElements.put(NAMESPACE, NAMESPACE); + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageWriter.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageWriter.java index 8c10d4aa..6a8ba030 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageWriter.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/PageWriter.java @@ -28,64 +28,75 @@ import org.dkpro.jwpl.wikimachine.util.Redirects; import org.dkpro.jwpl.wikimachine.util.UTFDataOutputStream; -public class PageWriter implements DumpWriter { +public class PageWriter + implements DumpWriter +{ - private Page currentPage; - private Revision lastRevision; - private final UTFDataOutputStream stream; + private Page currentPage; + private Revision lastRevision; + private final UTFDataOutputStream stream; - public PageWriter(OutputStream output) throws IOException { - this.stream = new UTFDataOutputStream(output); - } - - @Override - public void close() throws IOException { - stream.close(); - } + public PageWriter(OutputStream output) throws IOException + { + this.stream = new UTFDataOutputStream(output); + } - @Override - public void writeEndPage() throws IOException { - if (lastRevision != null) { - updatePage(currentPage, lastRevision); + @Override + public void close() throws IOException + { + stream.close(); } - currentPage = null; - lastRevision = null; - } + @Override + public void writeEndPage() throws IOException + { + if (lastRevision != null) { + updatePage(currentPage, lastRevision); + } + currentPage = null; + lastRevision = null; - @Override - public void writeEndWiki() throws IOException { - stream.flush(); - } + } + + @Override + public void writeEndWiki() throws IOException + { + 
stream.flush(); + } - @Override - public void writeRevision(Revision revision) throws IOException { + @Override + public void writeRevision(Revision revision) throws IOException + { - lastRevision = revision; + lastRevision = revision; - } + } - @Override - public void writeSiteinfo(Siteinfo info) throws IOException { + @Override + public void writeSiteinfo(Siteinfo info) throws IOException + { - } + } - @Override - public void writeStartPage(Page page) throws IOException { - currentPage = page; - lastRevision = null; - } + @Override + public void writeStartPage(Page page) throws IOException + { + currentPage = page; + lastRevision = null; + } - @Override - public void writeStartWiki() throws IOException { - } + @Override + public void writeStartWiki() throws IOException + { + } - private void updatePage(Page page, Revision revision) throws IOException { - stream.writeInt(page.Id); - stream.writeInt(page.Title.Namespace); - String wellformedTitle = SQLEscape.titleFormat(page.Title.Text); - stream.writeUTFAsArray(SQLEscape.escape(wellformedTitle)); - // stream.writeBoolean(revision.isRedirect()); - stream.writeBoolean(Redirects.isRedirect(revision.Text)); - } + private void updatePage(Page page, Revision revision) throws IOException + { + stream.writeInt(page.Id); + stream.writeInt(page.Title.Namespace); + String wellformedTitle = SQLEscape.titleFormat(page.Title.Text); + stream.writeUTFAsArray(SQLEscape.escape(wellformedTitle)); + // stream.writeBoolean(revision.isRedirect()); + stream.writeBoolean(Redirects.isRedirect(revision.Text)); + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionReader.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionReader.java index 53f39e51..83921063 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionReader.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionReader.java @@ -23,36 +23,41 @@ import org.dkpro.jwpl.wikimachine.dump.xml.AbstractXmlDumpReader; /** - * This class is a specified variant of XmlDumpReader. Please see its source for more - * information about a functionality and a license.
+ * This class is a specified variant of XmlDumpReader. Please see its source for more information + * about a functionality and a license.
*/ -public class RevisionReader extends AbstractXmlDumpReader { +public class RevisionReader + extends AbstractXmlDumpReader +{ - public RevisionReader(InputStream inputStream, DumpWriter writer) { - super(inputStream, writer); - } + public RevisionReader(InputStream inputStream, DumpWriter writer) + { + super(inputStream, writer); + } - @Override - protected void setupStartElements() { - startElements.put(REVISION, REVISION); - startElements.put(CONTRIBUTOR, CONTRIBUTOR); - startElements.put(PAGE, PAGE); - startElements.put(SITEINFO, SITEINFO); - startElements.put(NAMESPACES, NAMESPACES); - startElements.put(NAMESPACE, NAMESPACE); - } + @Override + protected void setupStartElements() + { + startElements.put(REVISION, REVISION); + startElements.put(CONTRIBUTOR, CONTRIBUTOR); + startElements.put(PAGE, PAGE); + startElements.put(SITEINFO, SITEINFO); + startElements.put(NAMESPACES, NAMESPACES); + startElements.put(NAMESPACE, NAMESPACE); + } - @Override - protected void setupEndElements() { - endElements.put(REVISION, REVISION); - endElements.put(TIMESTAMP, TIMESTAMP); - endElements.put(TEXT, TEXT); - endElements.put(CONTRIBUTOR, CONTRIBUTOR); - endElements.put(ID, ID); - endElements.put(PAGE, PAGE); - endElements.put(TITLE, TITLE); - endElements.put(SITEINFO, SITEINFO); - endElements.put(NAMESPACES, NAMESPACES); - endElements.put(NAMESPACE, NAMESPACE); - } + @Override + protected void setupEndElements() + { + endElements.put(REVISION, REVISION); + endElements.put(TIMESTAMP, TIMESTAMP); + endElements.put(TEXT, TEXT); + endElements.put(CONTRIBUTOR, CONTRIBUTOR); + endElements.put(ID, ID); + endElements.put(PAGE, PAGE); + endElements.put(TITLE, TITLE); + endElements.put(SITEINFO, SITEINFO); + endElements.put(NAMESPACES, NAMESPACES); + endElements.put(NAMESPACE, NAMESPACE); + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionWriter.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionWriter.java index 785a7d81..008d6d73 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionWriter.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/RevisionWriter.java @@ -26,48 +26,58 @@ import org.dkpro.jwpl.mwdumper.importer.Revision; import org.dkpro.jwpl.mwdumper.importer.Siteinfo; -public class RevisionWriter implements DumpWriter { +public class RevisionWriter + implements DumpWriter +{ - private Page currentPage; - private final DataOutputStream stream; + private Page currentPage; + private final DataOutputStream stream; - public RevisionWriter(OutputStream output) throws IOException { - this.stream = new DataOutputStream(output); - } + public RevisionWriter(OutputStream output) throws IOException + { + this.stream = new DataOutputStream(output); + } - @Override - public void close() throws IOException { - stream.close(); - } + @Override + public void close() throws IOException + { + stream.close(); + } - @Override - public void writeEndPage() throws IOException { - currentPage = null; - } + @Override + public void writeEndPage() throws IOException + { + currentPage = null; + } - @Override - public void writeEndWiki() throws IOException { - stream.flush(); - } + @Override + public void writeEndWiki() throws IOException + { + stream.flush(); + } - @Override - public void writeRevision(Revision revision) throws IOException { - stream.writeInt(currentPage.Id); - stream.writeInt(revision.Id); - stream.writeLong(revision.Timestamp.getTimeInMillis()); - } + 
@Override + public void writeRevision(Revision revision) throws IOException + { + stream.writeInt(currentPage.Id); + stream.writeInt(revision.Id); + stream.writeLong(revision.Timestamp.getTimeInMillis()); + } - @Override - public void writeSiteinfo(Siteinfo info) throws IOException { + @Override + public void writeSiteinfo(Siteinfo info) throws IOException + { - } + } - @Override - public void writeStartPage(Page page) throws IOException { - currentPage = page; - } + @Override + public void writeStartPage(Page page) throws IOException + { + currentPage = page; + } - @Override - public void writeStartWiki() throws IOException { - } + @Override + public void writeStartWiki() throws IOException + { + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextReader.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextReader.java index b7c44aab..0ff60926 100644 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextReader.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextReader.java @@ -23,35 +23,40 @@ import org.dkpro.jwpl.wikimachine.dump.xml.AbstractXmlDumpReader; /** - * This class is a specified variant of XmlDumpReader. Please see its source for more - * information about a functionality and a license.
+ * This class is a specified variant of XmlDumpReader. Please see its source for more information + * about a functionality and a license.
*/ -public class TextReader extends AbstractXmlDumpReader { +public class TextReader + extends AbstractXmlDumpReader +{ - public TextReader(InputStream inputStream, DumpWriter writer) { - super(inputStream, writer); - } + public TextReader(InputStream inputStream, DumpWriter writer) + { + super(inputStream, writer); + } - @Override - protected void setupStartElements() { - startElements.put(REVISION, REVISION); - startElements.put(CONTRIBUTOR, CONTRIBUTOR); - startElements.put(PAGE, PAGE); - startElements.put(SITEINFO, SITEINFO); - startElements.put(NAMESPACES, NAMESPACES); - startElements.put(NAMESPACE, NAMESPACE); - } + @Override + protected void setupStartElements() + { + startElements.put(REVISION, REVISION); + startElements.put(CONTRIBUTOR, CONTRIBUTOR); + startElements.put(PAGE, PAGE); + startElements.put(SITEINFO, SITEINFO); + startElements.put(NAMESPACES, NAMESPACES); + startElements.put(NAMESPACE, NAMESPACE); + } - @Override - protected void setupEndElements() { - endElements.put(REVISION, REVISION); - endElements.put(TEXT, TEXT); - endElements.put(CONTRIBUTOR, CONTRIBUTOR); - endElements.put(ID, ID); - endElements.put(PAGE, PAGE); - endElements.put(TITLE, TITLE); - endElements.put(SITEINFO, SITEINFO); - endElements.put(NAMESPACES, NAMESPACES); - endElements.put(NAMESPACE, NAMESPACE); - } + @Override + protected void setupEndElements() + { + endElements.put(REVISION, REVISION); + endElements.put(TEXT, TEXT); + endElements.put(CONTRIBUTOR, CONTRIBUTOR); + endElements.put(ID, ID); + endElements.put(PAGE, PAGE); + endElements.put(TITLE, TITLE); + endElements.put(SITEINFO, SITEINFO); + endElements.put(NAMESPACES, NAMESPACES); + endElements.put(NAMESPACE, NAMESPACE); + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextWriter.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextWriter.java index 0f055c49..936ac81d 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextWriter.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TextWriter.java @@ -27,43 +27,53 @@ import org.dkpro.jwpl.wikimachine.dump.sql.SQLEscape; import org.dkpro.jwpl.wikimachine.util.UTFDataOutputStream; -public class TextWriter implements DumpWriter { +public class TextWriter + implements DumpWriter +{ - private final UTFDataOutputStream stream; + private final UTFDataOutputStream stream; - public TextWriter(OutputStream output) throws IOException { - this.stream = new UTFDataOutputStream(output); - } + public TextWriter(OutputStream output) throws IOException + { + this.stream = new UTFDataOutputStream(output); + } - @Override - public void close() throws IOException { - stream.close(); - } + @Override + public void close() throws IOException + { + stream.close(); + } - @Override - public void writeEndPage() throws IOException { - } + @Override + public void writeEndPage() throws IOException + { + } - @Override - public void writeEndWiki() throws IOException { - stream.flush(); - } + @Override + public void writeEndWiki() throws IOException + { + stream.flush(); + } - @Override - public void writeRevision(Revision revision) throws IOException { - stream.writeInt(revision.Id); - stream.writeUTFAsArray(SQLEscape.escape(revision.Text)); - } + @Override + public void writeRevision(Revision revision) throws IOException + { + stream.writeInt(revision.Id); + stream.writeUTFAsArray(SQLEscape.escape(revision.Text)); + } - @Override - public void writeSiteinfo(Siteinfo info) throws 
IOException { - } + @Override + public void writeSiteinfo(Siteinfo info) throws IOException + { + } - @Override - public void writeStartPage(Page page) throws IOException { - } + @Override + public void writeStartPage(Page page) throws IOException + { + } - @Override - public void writeStartWiki() throws IOException { - } + @Override + public void writeStartWiki() throws IOException + { + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TimeMachineRevisionParser.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TimeMachineRevisionParser.java index da78bc33..9f28b259 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TimeMachineRevisionParser.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/TimeMachineRevisionParser.java @@ -23,21 +23,25 @@ import org.dkpro.jwpl.timemachine.domain.Revision; import org.dkpro.jwpl.wikimachine.dump.xml.RevisionParser; -public class TimeMachineRevisionParser extends RevisionParser { +public class TimeMachineRevisionParser + extends RevisionParser +{ - @Override - public boolean next() throws IOException { - boolean hasNext = true; - try { - revPage = stream.readInt(); - revTextId = stream.readInt(); - revTimestamp = Revision.compressTime(stream.readLong()); - } catch (EOFException e) { - hasNext = false; - } + @Override + public boolean next() throws IOException + { + boolean hasNext = true; + try { + revPage = stream.readInt(); + revTextId = stream.readInt(); + revTimestamp = Revision.compressTime(stream.readLong()); + } + catch (EOFException e) { + hasNext = false; + } - return hasNext; + return hasNext; - } + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStream.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStream.java index c7227ee2..05831564 100755 --- a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStream.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStream.java @@ -27,80 +27,89 @@ import org.dkpro.jwpl.wikimachine.dump.xml.DumpTableInputStream; /** - * Decorator for an {@link InputStream}. Converts an XML source to SQL - * result in a separated thread via - * org.mediawiki.importer.XmlDumpReader + * Decorator for an {@link InputStream}. Converts an XML source to SQL result in a separated thread + * via org.mediawiki.importer.XmlDumpReader * *

 *
 * update 18.11.2009 : constructor is replaced by initialize method
 *
*/ -public class XMLDumpTableInputStream extends DumpTableInputStream { +public class XMLDumpTableInputStream + extends DumpTableInputStream +{ - private static final int BUFFERSIZE = 8192; - /** - * piped result stream, that is buffered for better performance - */ - private BufferedInputStream result; - /** - * thread where the conversion algorithm should run - */ - private XMLDumpTableInputStreamThread xmlInputThread; - - /** - * Decorator for InputStream, which allows to convert an XML input stream to - * SQL - * - * @param inputStream XML input stream - * @throws IOException - */ - @Override - public void initialize(InputStream inputStream, DumpTableEnum table) throws IOException { - - /* - * piped input stream, that allows to read from a decodedStream + private static final int BUFFERSIZE = 8192; + /** + * piped result stream, that is buffered for better performance */ - PipedInputStream unbufferedResult = new PipedInputStream(); - /* - * piped output stream where the conversion thread XMLInputStreamThread is writing in + private BufferedInputStream result; + /** + * thread where the conversion algorithm should run */ - PipedOutputStream decodedStream = new PipedOutputStream(unbufferedResult); - result = new BufferedInputStream(unbufferedResult, BUFFERSIZE); + private XMLDumpTableInputStreamThread xmlInputThread; + + /** + * Decorator for InputStream, which allows to convert an XML input stream to SQL + * + * @param inputStream + * XML input stream + * @throws IOException + */ + @Override + public void initialize(InputStream inputStream, DumpTableEnum table) throws IOException + { + + /* + * piped input stream, that allows to read from a decodedStream + */ + PipedInputStream unbufferedResult = new PipedInputStream(); + /* + * piped output stream where the conversion thread XMLInputStreamThread is + * writing in + */ + PipedOutputStream decodedStream = new PipedOutputStream(unbufferedResult); + result = new BufferedInputStream(unbufferedResult, BUFFERSIZE); - xmlInputThread = new XMLDumpTableInputStreamThread(inputStream, decodedStream, table); - xmlInputThread.start(); + xmlInputThread = new XMLDumpTableInputStreamThread(inputStream, decodedStream, table); + xmlInputThread.start(); - } + } - @Override - public int read() throws IOException { - return result.read(); - } + @Override + public int read() throws IOException + { + return result.read(); + } - @Override - public int available() throws IOException { - return result.available(); - } + @Override + public int available() throws IOException + { + return result.available(); + } - @Override - public void close() throws IOException { - result.close(); - xmlInputThread.abort(); - } + @Override + public void close() throws IOException + { + result.close(); + xmlInputThread.abort(); + } - @Override - public void mark(int readlimit) { - result.mark(readlimit); - } + @Override + public void mark(int readlimit) + { + result.mark(readlimit); + } - @Override - public void reset() throws IOException { - result.reset(); - } + @Override + public void reset() throws IOException + { + result.reset(); + } - @Override - public boolean markSupported() { - return result.markSupported(); - } + @Override + public boolean markSupported() + { + return result.markSupported(); + } } diff --git a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStreamThread.java b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStreamThread.java index 83560296..c45e0146 100755 --- 
a/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStreamThread.java +++ b/dkpro-jwpl-timemachine/src/main/java/org/dkpro/jwpl/timemachine/dump/xml/XMLDumpTableInputStreamThread.java @@ -31,69 +31,83 @@ /** * Thread for converting of XML stream to SQL stream. */ -class XMLDumpTableInputStreamThread extends Thread { +class XMLDumpTableInputStreamThread + extends Thread +{ - private static final Logger logger = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private static final Logger logger = LoggerFactory + .getLogger(MethodHandles.lookup().lookupClass()); - /** - * Enable the main and category pages as well as discussions - */ - private static final String ENABLED_NAMESPACES = "NS_MAIN,NS_TALK,NS_CATEGORY"; + /** + * Enable the main and category pages as well as discussions + */ + private static final String ENABLED_NAMESPACES = "NS_MAIN,NS_TALK,NS_CATEGORY"; - /** - * Generalization {@link org.dkpro.jwpl.mwdumper.importer.XmlDumpReader} - * that parses the XML dump - */ - private AbstractXmlDumpReader xmlReader; + /** + * Generalization {@link org.dkpro.jwpl.mwdumper.importer.XmlDumpReader} that parses the XML + * dump + */ + private AbstractXmlDumpReader xmlReader; - /** - * completion flag for a conversion process - */ - private boolean isComplete; + /** + * completion flag for a conversion process + */ + private boolean isComplete; - /** - * Initiate input and output streams - * - * @param iStream XML input stream - * @param oStream SQL output stream - * @throws IOException Thrown in case errors occurred. - */ - public XMLDumpTableInputStreamThread(InputStream iStream, OutputStream oStream, DumpTableEnum table) - throws IOException { - super("xml2sql"); + /** + * Initiate input and output streams + * + * @param iStream + * XML input stream + * @param oStream + * SQL output stream + * @throws IOException + * Thrown in case errors occurred. 
+ */ + public XMLDumpTableInputStreamThread(InputStream iStream, OutputStream oStream, + DumpTableEnum table) + throws IOException + { + super("xml2sql"); - switch (table) { - case PAGE: - xmlReader = new PageReader(iStream, new NamespaceFilter(new PageWriter(oStream), ENABLED_NAMESPACES)); - break; - case REVISION: - xmlReader = new RevisionReader(iStream, new NamespaceFilter(new RevisionWriter(oStream), ENABLED_NAMESPACES)); - break; - case TEXT: - xmlReader = new TextReader(iStream, new NamespaceFilter(new TextWriter(oStream), ENABLED_NAMESPACES)); - break; + switch (table) { + case PAGE: + xmlReader = new PageReader(iStream, + new NamespaceFilter(new PageWriter(oStream), ENABLED_NAMESPACES)); + break; + case REVISION: + xmlReader = new RevisionReader(iStream, + new NamespaceFilter(new RevisionWriter(oStream), ENABLED_NAMESPACES)); + break; + case TEXT: + xmlReader = new TextReader(iStream, + new NamespaceFilter(new TextWriter(oStream), ENABLED_NAMESPACES)); + break; + } } - } - @Override - public synchronized void run() { - try { - isComplete = false; - xmlReader.readDump(); - isComplete = true; - } catch (IOException e) { - logger.error(e.getMessage(), e); - throw new RuntimeException(e); + @Override + public synchronized void run() + { + try { + isComplete = false; + xmlReader.readDump(); + isComplete = true; + } + catch (IOException e) { + logger.error(e.getMessage(), e); + throw new RuntimeException(e); + } } - } - /** - * Abort a conversion - */ - public synchronized void abort() { - if (!isComplete) { - xmlReader.abort(); - isComplete = true; + /** + * Abort a conversion + */ + public synchronized void abort() + { + if (!isComplete) { + xmlReader.abort(); + isComplete = true; + } } - } } From e0436181ba00773585f9eed66d9b52911eeb8949 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 31 Oct 2023 14:27:39 +0100 Subject: [PATCH 11/14] #164 - Introduce checkstyle - Auto-format dkpro-jwpl-tutorial --- .../jwpl/tutorial/api/T1a_HelloWorld.java | 38 +- .../jwpl/tutorial/api/T1b_HelloWorld.java | 46 +- .../jwpl/tutorial/api/T1c_HelloWorld.java | 64 +-- .../dkpro/jwpl/tutorial/api/T2_PageInfo.java | 74 +-- .../jwpl/tutorial/api/T3_PageDetails.java | 100 ++-- .../jwpl/tutorial/api/T4_Categories.java | 103 ++-- .../dkpro/jwpl/tutorial/api/T5_TownList.java | 85 ++-- .../jwpl/tutorial/api/T6_HelperMethods.java | 42 +- .../tutorial/parser/T1_SimpleParserDemo.java | 42 +- .../tutorial/parser/T2_InternalLinks.java | 59 ++- .../jwpl/tutorial/parser/T3_LinkContexts.java | 34 +- .../parser/T4_InterfacingWithWikipedia.java | 46 +- .../parser/T5_CleaningTemplateImage.java | 53 ++- .../jwpl/tutorial/parser/T6_NestedLists.java | 102 ++-- .../jwpl/tutorial/parser/T7_HtmlFileDemo.java | 28 +- .../dkpro/jwpl/tutorial/parser/TestFile.java | 446 +++++++++--------- 16 files changed, 718 insertions(+), 644 deletions(-) diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1a_HelloWorld.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1a_HelloWorld.java index 7e338058..1cdaacf5 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1a_HelloWorld.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1a_HelloWorld.java @@ -26,30 +26,32 @@ /** * Tutorial 1a *

- * Get the text of a wikipedia article. - * The text will be formatted with MediaWiki markup. + * Get the text of a wikipedia article. The text will be formatted with MediaWiki markup. *

* Throws an exception, if no page with the given title exists. */ -public class T1a_HelloWorld implements WikiConstants { +public class T1a_HelloWorld + implements WikiConstants +{ - public static void main(String[] args) throws WikiApiException { + public static void main(String[] args) throws WikiApiException + { - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); - // Create a new German wikipedia. - Wikipedia wiki = new Wikipedia(dbConfig); + // Create a new German wikipedia. + Wikipedia wiki = new Wikipedia(dbConfig); - // Get the page with title "Hello world". - // May throw an exception, if the page does not exist. - Page page = wiki.getPage("Hello world"); - System.out.println(page.getText()); + // Get the page with title "Hello world". + // May throw an exception, if the page does not exist. + Page page = wiki.getPage("Hello world"); + System.out.println(page.getText()); - } + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1b_HelloWorld.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1b_HelloWorld.java index 0d2b3726..c90bf49a 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1b_HelloWorld.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1b_HelloWorld.java @@ -26,32 +26,36 @@ /** * Tutorial 1b *

- * Get the text of a wikipedia article. - * The text will be formatted with MediaWiki markup. + * Get the text of a wikipedia article. The text will be formatted with MediaWiki markup. *

- * If you do not care about exception handling, but want to avoid crashes on every page that does not exist. + * If you do not care about exception handling, but want to avoid crashes on every page that does + * not exist. */ -public class T1b_HelloWorld implements WikiConstants { +public class T1b_HelloWorld + implements WikiConstants +{ - public static void main(String[] args) throws WikiApiException { + public static void main(String[] args) throws WikiApiException + { - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); - // Create a new German wikipedia. - Wikipedia wiki = new Wikipedia(dbConfig); + // Create a new German wikipedia. + Wikipedia wiki = new Wikipedia(dbConfig); - String title = "Hello world"; - if (wiki.existsPage(title)) { - Page page = wiki.getPage(title); - System.out.println(page.getText()); - } else { - System.out.println("Page " + title + " does not exist"); + String title = "Hello world"; + if (wiki.existsPage(title)) { + Page page = wiki.getPage(title); + System.out.println(page.getText()); + } + else { + System.out.println("Page " + title + " does not exist"); + } } - } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1c_HelloWorld.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1c_HelloWorld.java index aa504b77..0990dab8 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1c_HelloWorld.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T1c_HelloWorld.java @@ -27,42 +27,46 @@ /** * Tutorial 1c *

- * Get the text of a wikipedia article. - * The text will be formatted with MediaWiki markup. + * Get the text of a wikipedia article. The text will be formatted with MediaWiki markup. *

* Handle exceptions. */ -public class T1c_HelloWorld implements WikiConstants { +public class T1c_HelloWorld + implements WikiConstants +{ - public static void main(String[] args) { + public static void main(String[] args) + { - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); - // Create a new German wikipedia. - Wikipedia wiki = null; - try { - wiki = new Wikipedia(dbConfig); - } catch (WikiInitializationException e1) { - System.out.println("Could not initialize Wikipedia."); - e1.printStackTrace(); - System.exit(1); - } + // Create a new German wikipedia. + Wikipedia wiki = null; + try { + wiki = new Wikipedia(dbConfig); + } + catch (WikiInitializationException e1) { + System.out.println("Could not initialize Wikipedia."); + e1.printStackTrace(); + System.exit(1); + } - // Get the page with title "Hello world". - String title = "Hello world"; - try { - Page page = wiki.getPage(title); - System.out.println(page.getText()); - } catch (WikiApiException e) { - System.out.println("Page " + title + " does not exist"); - e.printStackTrace(); - System.exit(1); + // Get the page with title "Hello world". + String title = "Hello world"; + try { + Page page = wiki.getPage(title); + System.out.println(page.getText()); + } + catch (WikiApiException e) { + System.out.println("Page " + title + " does not exist"); + e.printStackTrace(); + System.exit(1); + } } - } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T2_PageInfo.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T2_PageInfo.java index fdfb0aa0..f2cae4de 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T2_PageInfo.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T2_PageInfo.java @@ -24,54 +24,58 @@ import org.dkpro.jwpl.api.exception.WikiApiException; import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; - /** * Tutorial 2 *

* A page provides a number of informative methods. */ -public class T2_PageInfo implements WikiConstants { +public class T2_PageInfo + implements WikiConstants +{ - public static void main(String[] args) throws WikiApiException { + public static void main(String[] args) throws WikiApiException + { - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); - // Create a new German wikipedia - Wikipedia wiki = new Wikipedia(dbConfig); + // Create a new German wikipedia + Wikipedia wiki = new Wikipedia(dbConfig); - String title = "Hello world"; - Page page; - try { - page = wiki.getPage(title); - } catch (WikiPageNotFoundException e) { - throw new WikiApiException("Page " + title + " does not exist"); - } + String title = "Hello world"; + Page page; + try { + page = wiki.getPage(title); + } + catch (WikiPageNotFoundException e) { + throw new WikiApiException("Page " + title + " does not exist"); + } - // the title of the page - System.out.println("Queried string : " + title); - System.out.println("Title : " + page.getTitle()); + // the title of the page + System.out.println("Queried string : " + title); + System.out.println("Title : " + page.getTitle()); - // whether the page is a disambiguation page - System.out.println("IsDisambiguationPage : " + page.isDisambiguation()); + // whether the page is a disambiguation page + System.out.println("IsDisambiguationPage : " + page.isDisambiguation()); - // whether the page is a redirect - // If a page is a redirect, we can use it like a normal page. - // The other infos in this example are transparently served by the page that the redirect points to. - System.out.println("redirect page query : " + page.isRedirect()); + // whether the page is a redirect + // If a page is a redirect, we can use it like a normal page. + // The other infos in this example are transparently served by the page that the redirect + // points to. 
+ System.out.println("redirect page query : " + page.isRedirect()); - // the number of links pointing to this page - System.out.println("# of ingoing links : " + page.getNumberOfInlinks()); + // the number of links pointing to this page + System.out.println("# of ingoing links : " + page.getNumberOfInlinks()); - // the number of links in this page pointing to other pages - System.out.println("# of outgoing links : " + page.getNumberOfOutlinks()); + // the number of links in this page pointing to other pages + System.out.println("# of outgoing links : " + page.getNumberOfOutlinks()); - // the number of categories that are assigned to this page - System.out.println("# of categories : " + page.getNumberOfCategories()); - } + // the number of categories that are assigned to this page + System.out.println("# of categories : " + page.getNumberOfCategories()); + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T3_PageDetails.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T3_PageDetails.java index 6edd3624..b6d23b47 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T3_PageDetails.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T3_PageDetails.java @@ -31,63 +31,67 @@ *

* Even more things to do with a Wikipedia page. */ -public class T3_PageDetails implements WikiConstants { +public class T3_PageDetails + implements WikiConstants +{ - public static void main(String[] args) throws WikiApiException { + public static void main(String[] args) throws WikiApiException + { - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); - // Create a new German wikipedia. - Wikipedia wiki = new Wikipedia(dbConfig); + // Create a new German wikipedia. + Wikipedia wiki = new Wikipedia(dbConfig); - String title = "Hello world"; - Page page; - try { - page = wiki.getPage(title); - } catch (WikiPageNotFoundException e) { - throw new WikiApiException("Page " + title + " does not exist"); - } + String title = "Hello world"; + Page page; + try { + page = wiki.getPage(title); + } + catch (WikiPageNotFoundException e) { + throw new WikiApiException("Page " + title + " does not exist"); + } - StringBuilder sb = new StringBuilder(); + StringBuilder sb = new StringBuilder(); - // the title of the page - sb.append("Queried string : " + title + LF); - sb.append("Title : " + page.getTitle() + LF); - sb.append(LF); + // the title of the page + sb.append("Queried string : " + title + LF); + sb.append("Title : " + page.getTitle() + LF); + sb.append(LF); - // output the page's redirects - sb.append("Redirects" + LF); - for (String redirect : page.getRedirects()) { - sb.append(" " + new Title(redirect).getPlainTitle() + LF); - } - sb.append(LF); + // output the page's redirects + sb.append("Redirects" + LF); + for (String redirect : page.getRedirects()) { + sb.append(" " + new Title(redirect).getPlainTitle() + LF); + } + sb.append(LF); - // output the page's categories - sb.append("Categories" + LF); - for (Category category : page.getCategories()) { - sb.append(" " + category.getTitle() + LF); - } - sb.append(LF); + // output the page's categories + sb.append("Categories" + LF); + for (Category category : page.getCategories()) { + sb.append(" " + category.getTitle() + LF); + } + sb.append(LF); - // output the ingoing links - sb.append("In-Links" + LF); - for (Page inLinkPage : page.getInlinks()) { - sb.append(" " + inLinkPage.getTitle() + LF); - } - sb.append(LF); + // output the ingoing links + sb.append("In-Links" + LF); + for (Page inLinkPage : page.getInlinks()) { + sb.append(" " + inLinkPage.getTitle() + LF); + } + sb.append(LF); - // output the outgoing links - sb.append("Out-Links" + LF); - for (Page outLinkPage : page.getOutlinks()) { - sb.append(" " + outLinkPage.getTitle() + LF); - } + // output the outgoing links + sb.append("Out-Links" + LF); + for (Page outLinkPage : page.getOutlinks()) { + sb.append(" " + outLinkPage.getTitle() + LF); + } - System.out.println(sb); - } + System.out.println(sb); + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T4_Categories.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T4_Categories.java index ca6c538c..b800188f 100644 --- 
a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T4_Categories.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T4_Categories.java @@ -28,65 +28,70 @@ /** * Tutorial 4 *

- * Wikipedia categories are used as a kind of semantic tag for pages. - * They are organized in a thesaurus like structure. + * Wikipedia categories are used as a kind of semantic tag for pages. They are organized in a + * thesaurus like structure. */ -public class T4_Categories implements WikiConstants { +public class T4_Categories + implements WikiConstants +{ - public static void main(String[] args) throws WikiApiException { + public static void main(String[] args) throws WikiApiException + { - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); - // Create a new German wikipedia. - Wikipedia wiki = new Wikipedia(dbConfig); + // Create a new German wikipedia. + Wikipedia wiki = new Wikipedia(dbConfig); - // Get the category "Säugetiere" (mammals) - String title = "Säugetiere"; - Category cat; - try { - cat = wiki.getCategory(title); - } catch (WikiPageNotFoundException e) { - throw new WikiApiException("Category " + title + " does not exist"); - } + // Get the category "Säugetiere" (mammals) + String title = "Säugetiere"; + Category cat; + try { + cat = wiki.getCategory(title); + } + catch (WikiPageNotFoundException e) { + throw new WikiApiException("Category " + title + " does not exist"); + } - StringBuilder sb = new StringBuilder(); + StringBuilder sb = new StringBuilder(); - // the title of the category - sb.append("Title : " + cat.getTitle() + LF); - sb.append(LF); + // the title of the category + sb.append("Title : " + cat.getTitle() + LF); + sb.append(LF); - // the number of links pointing to this page (number of superordinate categories) - sb.append("# super categories : " + cat.getParents().size() + LF); - for (Category parent : cat.getParents()) { - sb.append(" " + parent.getTitle() + LF); - } - sb.append(LF); + // the number of links pointing to this page (number of superordinate categories) + sb.append("# super categories : " + cat.getParents().size() + LF); + for (Category parent : cat.getParents()) { + sb.append(" " + parent.getTitle() + LF); + } + sb.append(LF); - // the number of links in this page pointing to other pages (number of subordinate categories) - sb.append("# sub categories : " + cat.getChildren().size() + LF); - for (Category child : cat.getChildren()) { - sb.append(" " + child.getTitle() + LF); - } - sb.append(LF); + // the number of links in this page pointing to other pages (number of subordinate + // categories) + sb.append("# sub categories : " + cat.getChildren().size() + LF); + for (Category child : cat.getChildren()) { + sb.append(" " + child.getTitle() + LF); + } + sb.append(LF); - // the number of pages that are categorized under this category - sb.append("# pages : " + cat.getArticles().size() + LF); - for (Page page : cat.getArticles()) { - sb.append(" " + page.getTitle() + LF); - } + // the number of pages that are categorized under this category + sb.append("# pages : " + cat.getArticles().size() + LF); + for (Page page : cat.getArticles()) { + sb.append(" " + page.getTitle() + LF); + } - // extract only the pageIDs of pages that 
are categorized under this category - sb.append("# pageIDs : " + cat.getArticleIds().size() + LF); - for (int pageID : cat.getArticleIds()) { - sb.append(" " + pageID + LF); - } + // extract only the pageIDs of pages that are categorized under this category + sb.append("# pageIDs : " + cat.getArticleIds().size() + LF); + for (int pageID : cat.getArticleIds()) { + sb.append(" " + pageID + LF); + } - System.out.println(sb); - } + System.out.println(sb); + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T5_TownList.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T5_TownList.java index 33c8f7e7..1df56dcd 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T5_TownList.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T5_TownList.java @@ -28,58 +28,61 @@ import org.dkpro.jwpl.api.exception.WikiApiException; import org.dkpro.jwpl.api.exception.WikiPageNotFoundException; - /** * Tutorial 5 *

- * Wikipedia categories are used as a kind of semantic tag for pages. - * They are organized in a thesaurus like structure. + * Wikipedia categories are used as a kind of semantic tag for pages. They are organized in a + * thesaurus like structure. *

- * If we get all pages assigned to categories in the sub-tree under the category for "Towns in Germany", - * we can get a quite long list of towns in Germany. + * If we get all pages assigned to categories in the sub-tree under the category for "Towns in + * Germany", we can get a quite long list of towns in Germany. */ -public class T5_TownList implements WikiConstants { +public class T5_TownList + implements WikiConstants +{ - public static void main(String[] args) throws WikiApiException { + public static void main(String[] args) throws WikiApiException + { - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); - // Create a new German wikipedia. - Wikipedia wiki = new Wikipedia(dbConfig); + // Create a new German wikipedia. + Wikipedia wiki = new Wikipedia(dbConfig); - // Get the category "Towns in Germany" - String title = "Towns in Germany"; - Category topCat; - try { - topCat = wiki.getCategory(title); - } catch (WikiPageNotFoundException e) { - throw new WikiApiException("Category " + title + " does not exist"); - } + // Get the category "Towns in Germany" + String title = "Towns in Germany"; + Category topCat; + try { + topCat = wiki.getCategory(title); + } + catch (WikiPageNotFoundException e) { + throw new WikiApiException("Category " + title + " does not exist"); + } - // Add the pages categorized under "Towns in Germany". - Set towns = new TreeSet<>(); - for (Page p : topCat.getArticles()) { - towns.add(p.getTitle().getPlainTitle()); - } + // Add the pages categorized under "Towns in Germany". + Set towns = new TreeSet<>(); + for (Page p : topCat.getArticles()) { + towns.add(p.getTitle().getPlainTitle()); + } - // Get the pages categorized under each subcategory of "Towns in Germany". - for (Category townCategory : topCat.getDescendants()) { - for (Page p : townCategory.getArticles()) { - towns.add(p.getTitle().getPlainTitle()); - } - System.out.println("Number of towns: " + towns.size()); - } + // Get the pages categorized under each subcategory of "Towns in Germany". 
+ for (Category townCategory : topCat.getDescendants()) { + for (Page p : townCategory.getArticles()) { + towns.add(p.getTitle().getPlainTitle()); + } + System.out.println("Number of towns: " + towns.size()); + } - // Output the pages - for (String town : towns) { - System.out.println(town); - } + // Output the pages + for (String town : towns) { + System.out.println(town); + } - } + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T6_HelperMethods.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T6_HelperMethods.java index 6103dc99..77ebeffe 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T6_HelperMethods.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/api/T6_HelperMethods.java @@ -26,26 +26,28 @@ import org.dkpro.jwpl.api.Wikipedia; import org.dkpro.jwpl.api.exception.WikiInitializationException; -public class T6_HelperMethods { - - public static Set getUniqueArticleTitles() throws WikiInitializationException { - // configure the database connection parameters - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setHost("SERVER_URL"); - dbConfig.setDatabase("DATABASE"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.german); - - // Create a new German wikipedia. - Wikipedia wiki = new Wikipedia(dbConfig); - - Set uniqueArticleTitles = new TreeSet<>(); - for (Title title : wiki.getTitles()) { - uniqueArticleTitles.add(title.getPlainTitle()); +public class T6_HelperMethods +{ + + public static Set getUniqueArticleTitles() throws WikiInitializationException + { + // configure the database connection parameters + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setHost("SERVER_URL"); + dbConfig.setDatabase("DATABASE"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.german); + + // Create a new German wikipedia. + Wikipedia wiki = new Wikipedia(dbConfig); + + Set uniqueArticleTitles = new TreeSet<>(); + for (Title title : wiki.getTitles()) { + uniqueArticleTitles.add(title.getPlainTitle()); + } + + return uniqueArticleTitles; } - return uniqueArticleTitles; - } - } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T1_SimpleParserDemo.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T1_SimpleParserDemo.java index f71cfa83..8afa950b 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T1_SimpleParserDemo.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T1_SimpleParserDemo.java @@ -27,29 +27,31 @@ /** * Displays informations about the inner structure of a page. 
*/ -public class T1_SimpleParserDemo { +public class T1_SimpleParserDemo +{ - /** - * @param args - * @throws IOException - */ - public static void main(String[] args) throws IOException { + /** + * @param args + * @throws IOException + */ + public static void main(String[] args) throws IOException + { - // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") - String documentText = TestFile.getFileText(); + // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") + String documentText = TestFile.getFileText(); - //get a ParsedPage object - MediaWikiParserFactory pf = new MediaWikiParserFactory(); - MediaWikiParser parser = pf.createParser(); - ParsedPage pp = parser.parse(documentText); + // get a ParsedPage object + MediaWikiParserFactory pf = new MediaWikiParserFactory(); + MediaWikiParser parser = pf.createParser(); + ParsedPage pp = parser.parse(documentText); - //get the sections - for (Section section : pp.getSections()) { - System.out.println("section : " + section.getTitle()); - System.out.println(" nr of paragraphs : " + section.nrOfParagraphs()); - System.out.println(" nr of tables : " + section.nrOfTables()); - System.out.println(" nr of nested lists : " + section.nrOfNestedLists()); - System.out.println(" nr of definition lists: " + section.nrOfDefinitionLists()); + // get the sections + for (Section section : pp.getSections()) { + System.out.println("section : " + section.getTitle()); + System.out.println(" nr of paragraphs : " + section.nrOfParagraphs()); + System.out.println(" nr of tables : " + section.nrOfTables()); + System.out.println(" nr of nested lists : " + section.nrOfNestedLists()); + System.out.println(" nr of definition lists: " + section.nrOfDefinitionLists()); + } } - } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T2_InternalLinks.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T2_InternalLinks.java index f3ef1fb9..d6059f67 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T2_InternalLinks.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T2_InternalLinks.java @@ -27,38 +27,45 @@ /** * This class shows how to get the internal links from a parsed page.
* Internal links point to other pages and categories in the current
- *

Wikipedia
. + * + *
+ * Wikipedia
+ * 
+ * + * . */ -public class T2_InternalLinks { +public class T2_InternalLinks +{ - /** - * Prints the targets of the internal links found in the page Germany. - * - * @param args - * @throws WikiApiException - */ - public static void main(String[] args) throws WikiApiException { + /** + * Prints the targets of the internal links found in the page Germany. + * + * @param args + * @throws WikiApiException + */ + public static void main(String[] args) throws WikiApiException + { - // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") - String documentText = TestFile.getFileText(); + // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") + String documentText = TestFile.getFileText(); - // get a ParsedPage object - MediaWikiParserFactory pf = new MediaWikiParserFactory(); - MediaWikiParser parser = pf.createParser(); - ParsedPage pp = parser.parse(documentText); + // get a ParsedPage object + MediaWikiParserFactory pf = new MediaWikiParserFactory(); + MediaWikiParser parser = pf.createParser(); + ParsedPage pp = parser.parse(documentText); - // only the links to other Wikipedia language editions - for (Link language : pp.getLanguages()) { - System.out.println(language.getTarget()); - } + // only the links to other Wikipedia language editions + for (Link language : pp.getLanguages()) { + System.out.println(language.getTarget()); + } - //get the internal links of each section - for (Section section : pp.getSections()) { - System.out.println("Section: " + section.getTitle()); + // get the internal links of each section + for (Section section : pp.getSections()) { + System.out.println("Section: " + section.getTitle()); - for (Link link : section.getLinks(Link.type.INTERNAL)) { - System.out.println(" " + link.getTarget()); - } + for (Link link : section.getLinks(Link.type.INTERNAL)) { + System.out.println(" " + link.getTarget()); + } + } } - } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T3_LinkContexts.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T3_LinkContexts.java index 81354eee..766809ce 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T3_LinkContexts.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T3_LinkContexts.java @@ -23,29 +23,27 @@ import org.dkpro.jwpl.parser.mediawiki.MediaWikiParserFactory; /** - * This is a little demo, to show how the parsedpage and parsedpage.parser package - * works. + * This is a little demo, to show how the parsedpage and parsedpage.parser package works. 
*/ -public class T3_LinkContexts { +public class T3_LinkContexts +{ - public static void main(String[] args) { + public static void main(String[] args) + { - // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") - String documentText = TestFile.getFileText(); + // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") + String documentText = TestFile.getFileText(); - // get a ParsedPage object - MediaWikiParserFactory pf = new MediaWikiParserFactory(); - MediaWikiParser parser = pf.createParser(); - ParsedPage pp = parser.parse(documentText); + // get a ParsedPage object + MediaWikiParserFactory pf = new MediaWikiParserFactory(); + MediaWikiParser parser = pf.createParser(); + ParsedPage pp = parser.parse(documentText); - // Link Context (return 1 token left, 2 token right of the link) - for (Link link : pp.getLinks()) { - System.out.println( - link.getContext(1, 0) + "<" + - link.getText().toString().toUpperCase() + ">" + - link.getContext(0, 2) - ); + // Link Context (return 1 token left, 2 token right of the link) + for (Link link : pp.getLinks()) { + System.out.println(link.getContext(1, 0) + "<" + link.getText().toString().toUpperCase() + + ">" + link.getContext(0, 2)); + } } - } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T4_InterfacingWithWikipedia.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T4_InterfacingWithWikipedia.java index 25653b9c..82324236 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T4_InterfacingWithWikipedia.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T4_InterfacingWithWikipedia.java @@ -32,33 +32,35 @@ /** * Displays the titles of the sections found in the page Dog.
*/ -public class T4_InterfacingWithWikipedia { +public class T4_InterfacingWithWikipedia +{ - public static void main(String[] args) throws WikiApiException { - //db connection settings - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setDatabase("DATABASE"); - dbConfig.setHost("HOST"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.english); + public static void main(String[] args) throws WikiApiException + { + // db connection settings + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setDatabase("DATABASE"); + dbConfig.setHost("HOST"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.english); - //initialize a wiki - Wikipedia wiki = new Wikipedia(dbConfig); + // initialize a wiki + Wikipedia wiki = new Wikipedia(dbConfig); - //get the page 'Dog' - Page p = wiki.getPage("Dog"); + // get the page 'Dog' + Page p = wiki.getPage("Dog"); - //get a ParsedPage object - MediaWikiParserFactory pf = new MediaWikiParserFactory(); - MediaWikiParser parser = pf.createParser(); - ParsedPage pp = parser.parse(p.getText()); + // get a ParsedPage object + MediaWikiParserFactory pf = new MediaWikiParserFactory(); + MediaWikiParser parser = pf.createParser(); + ParsedPage pp = parser.parse(p.getText()); - //get the sections of the page - List
sections = pp.getSections(); + // get the sections of the page + List
sections = pp.getSections(); - for (Section section : sections) { - System.out.println(section.getTitle()); + for (Section section : sections) { + System.out.println(section.getTitle()); + } } - } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T5_CleaningTemplateImage.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T5_CleaningTemplateImage.java index 972773d8..46dfda4c 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T5_CleaningTemplateImage.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T5_CleaningTemplateImage.java @@ -31,38 +31,41 @@ * Shows how to clean an article text from "TEMPLATE" and "Image" elements */ -public class T5_CleaningTemplateImage { +public class T5_CleaningTemplateImage +{ - public static void main(String[] args) throws WikiApiException { + public static void main(String[] args) throws WikiApiException + { - //db connection settings - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setDatabase("DATABASE"); - dbConfig.setHost("HOST"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.english); + // db connection settings + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setDatabase("DATABASE"); + dbConfig.setHost("HOST"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.english); - //initialize a wiki - Wikipedia wiki = new Wikipedia(dbConfig); + // initialize a wiki + Wikipedia wiki = new Wikipedia(dbConfig); - //get the page 'Dog' - Page p = wiki.getPage("Dog"); + // get the page 'Dog' + Page p = wiki.getPage("Dog"); - //get a ParsedPage object - MediaWikiParserFactory pf = new MediaWikiParserFactory(); - pf.setTemplateParserClass(FlushTemplates.class); // Filtering TEMPLATE-Elements + // get a ParsedPage object + MediaWikiParserFactory pf = new MediaWikiParserFactory(); + pf.setTemplateParserClass(FlushTemplates.class); // Filtering TEMPLATE-Elements - String IMAGE = "Image"; // Replace it with the image template name in your Wiki language edition, - // e.g. "Image" in English + String IMAGE = "Image"; // Replace it with the image template name in your Wiki language + // edition, + // e.g. "Image" in English - // filtering Image-Elements - pf.getImageIdentifers().add(IMAGE); + // filtering Image-Elements + pf.getImageIdentifers().add(IMAGE); - // parse page text - MediaWikiParser parser = pf.createParser(); - ParsedPage pp = parser.parse(p.getText()); + // parse page text + MediaWikiParser parser = pf.createParser(); + ParsedPage pp = parser.parse(p.getText()); - System.out.println(pp.getText()); - } + System.out.println(pp.getText()); + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T6_NestedLists.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T6_NestedLists.java index b257b220..7fcc1c0a 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T6_NestedLists.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T6_NestedLists.java @@ -32,63 +32,69 @@ * Displays all nested lists of a page. 
*/ -public class T6_NestedLists { +public class T6_NestedLists +{ - public static void main(String[] args) throws WikiApiException { + public static void main(String[] args) throws WikiApiException + { - //db connection settings - DatabaseConfiguration dbConfig = new DatabaseConfiguration(); - dbConfig.setDatabase("DATABASE"); - dbConfig.setHost("HOST"); - dbConfig.setUser("USER"); - dbConfig.setPassword("PASSWORD"); - dbConfig.setLanguage(Language.english); + // db connection settings + DatabaseConfiguration dbConfig = new DatabaseConfiguration(); + dbConfig.setDatabase("DATABASE"); + dbConfig.setHost("HOST"); + dbConfig.setUser("USER"); + dbConfig.setPassword("PASSWORD"); + dbConfig.setLanguage(Language.english); - //initialize a wiki - Wikipedia wiki = new Wikipedia(dbConfig); + // initialize a wiki + Wikipedia wiki = new Wikipedia(dbConfig); - MediaWikiParserFactory pf = new MediaWikiParserFactory(Language.english); - MediaWikiParser parser = pf.createParser(); + MediaWikiParserFactory pf = new MediaWikiParserFactory(Language.english); + MediaWikiParser parser = pf.createParser(); - //get the page 'House_(disambiguation)' - ParsedPage pp = parser.parse(wiki.getPage("House_(disambiguation)").getText()); + // get the page 'House_(disambiguation)' + ParsedPage pp = parser.parse(wiki.getPage("House_(disambiguation)").getText()); - int i = 1; - // print out all nested lists of the page - for (NestedList nl : pp.getNestedLists()) { - System.out.println(i + ": \n" + outputNestedList(nl, 0)); - i++; + int i = 1; + // print out all nested lists of the page + for (NestedList nl : pp.getNestedLists()) { + System.out.println(i + ": \n" + outputNestedList(nl, 0)); + i++; + } } - } - /** - * Returns String with all elements of a NestedList - * - * @param nl NestedList - * @param depth Current depth of the Nestedlist - * @return - */ - public static String outputNestedList(NestedList nl, int depth) { - String result = ""; - if (nl == null) { - return result; // If null return empty string - } + /** + * Returns String with all elements of a NestedList + * + * @param nl + * NestedList + * @param depth + * Current depth of the Nestedlist + * @return + */ + public static String outputNestedList(NestedList nl, int depth) + { + String result = ""; + if (nl == null) { + return result; // If null return empty string + } - for (int i = 0; i < depth; i++) { - result += " "; // insert indentation according to depth - } + for (int i = 0; i < depth; i++) { + result += " "; // insert indentation according to depth + } - if (nl.getClass() == NestedListElement.class) { // If it is a NestedListElement, - // we reached a leaf, return its contents - result += nl.getText(); - } else { - result += "---"; // If it is not a NestedListElement, it is a NestedListContainer - // print out all its childs, increment depth - for (NestedList nl2 : ((NestedListContainer) nl).getNestedLists()) { - result += "\n" + outputNestedList(nl2, depth + 1); - } - } + if (nl.getClass() == NestedListElement.class) { // If it is a NestedListElement, + // we reached a leaf, return its contents + result += nl.getText(); + } + else { + result += "---"; // If it is not a NestedListElement, it is a NestedListContainer + // print out all its childs, increment depth + for (NestedList nl2 : ((NestedListContainer) nl).getNestedLists()) { + result += "\n" + outputNestedList(nl2, depth + 1); + } + } - return result; - } + return result; + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T7_HtmlFileDemo.java 
b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T7_HtmlFileDemo.java index 8e5bec69..bec8b36a 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T7_HtmlFileDemo.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/T7_HtmlFileDemo.java @@ -26,23 +26,25 @@ * This class shows how to use the HtmlTools.class...
* Mainly, you can create an HtmlFile of a {@link ParsedPage}. */ -public class T7_HtmlFileDemo { +public class T7_HtmlFileDemo +{ - public static void main(String[] argv) { + public static void main(String[] argv) + { - // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") - String documentText = TestFile.getFileText(); + // load a sample document (the contents are equal to "DarmstadtWikipediaArticle.txt") + String documentText = TestFile.getFileText(); - // set up an individually parametrized MediaWikiParser - MediaWikiParserFactory pf = new MediaWikiParserFactory(); - pf.getImageIdentifers().add("Image"); - MediaWikiParser parser = pf.createParser(); + // set up an individually parametrized MediaWikiParser + MediaWikiParserFactory pf = new MediaWikiParserFactory(); + pf.getImageIdentifers().add("Image"); + MediaWikiParser parser = pf.createParser(); - ParsedPage pp = parser.parse(documentText); + ParsedPage pp = parser.parse(documentText); - String outFileName = "htmlFileDemo.html"; - HtmlWriter.writeFile(outFileName, "UTF8", HtmlWriter.parsedPageToHtml(pp)); + String outFileName = "htmlFileDemo.html"; + HtmlWriter.writeFile(outFileName, "UTF8", HtmlWriter.parsedPageToHtml(pp)); - System.out.println("Writing output to file: " + outFileName); - } + System.out.println("Writing output to file: " + outFileName); + } } diff --git a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/TestFile.java b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/TestFile.java index 93d13fca..5f64a324 100644 --- a/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/TestFile.java +++ b/dkpro-jwpl-tutorial/src/main/java/org/dkpro/jwpl/tutorial/parser/TestFile.java @@ -17,217 +17,243 @@ */ package org.dkpro.jwpl.tutorial.parser; -public class TestFile { +public class TestFile +{ - private static final String LF = "\n"; + private static final String LF = "\n"; - public static String getFileText() { - StringBuilder sb = new StringBuilder(); - sb.append("'''Darmstadt''' is a city in the [[States of Germany|Bundesland]] (federal state) of [[Hesse]]n in [[Germany]]. As of 2005, its population was 139,000. The city is located in the southern part of the [[Frankfurt Rhine Main Area|Rhine Main Metropolitan Area]]."); - sb.append(LF); - sb.append(LF); - sb.append("== History =="); - sb.append(LF); - sb.append("The name Darmstadt first appears towards the end of the [[11th century]], then ''Darmundestat''; Darmstadt was chartered as a city by the [[Holy Roman Emperor]] [[Louis IV, Holy Roman Emperor|Ludwig the Bavarian]] in 1330. The seat of the ruling [[Landgraf|landgraves]] (1567-1806) and thereafter (to 1918) to the [[Grand Duke of Hesse and by Rhine|Grand Dukes of Hesse]], the city grew in population during the [[19th century]] from little over 10,000 to 72,000 inhabitants. A polytechnical school, which later became a Technical University now known as [[Darmstadt University of Technology|TU Darmstadt]], was established in 1877. "); - sb.append(LF); - sb.append("In the beginning of the 20th Century Darmstadt was an important centre for the art movement of [[Art Nouveau|Jugendstil]], the German variant of [[Art Nouveau]]. Annual architectural competitions led to the building of many architectural treasures of this period. 
Also during this period, in [[1912]] the chemist [[Anton Kollisch]], working for the pharmaceutical company [[Merck]], first synthesised the chemical [[MDMA]] (ecstasy) in Darmstadt."); - sb.append(LF); - sb.append("Darmstadt's municipal area was extended in 1937 to include the neighbouring localities of Arheilgen [not Arheil''i''gen] and Eberstadt, and in 1938 the city was separated administratively from the surrounding district (''Kreis''). Its old city centre was largely destroyed in a [[Bombing of Darmstadt in World War II|British bombing raid]] of [[September 11]] [[1944]], which killed an estimated 12,300 inhabitants and rendered 66,000 homeless. Most of Darmstadt's 3000 [[Jew]]s were killed by the [[Nazism|Nazi]] regime between 1933 and 1945. "); - sb.append(LF); - sb.append("Darmstadt is home to many technology companies and research institutes, and has been promoting itself as a \"city of science\" since 1997. It is well known as the high-tech centre in the vicinity of [[Frankfurt International Airport|Frankfurt Airport]], with important activities in spacecraft operations, chemistry, pharmacy, information technology, biotechnology, telecommunications and mechatronics. The [[Darmstadt University of Technology|TU Darmstadt]] is one of the important technical institutes in Germany and is well known for its research and teaching in the Electrical, Mechanical and Civil Engineering disciplines."); - sb.append(LF); - sb.append(LF); - sb.append("== Institutions =="); - sb.append(LF); - sb.append("Darmstadt is the site of one of the leading German universities, the [[Darmstadt University of Technology]], renowned for its engineering departments and the [[Darmstadt University of Applied Sciences]]. Related institutes are the [[Gesellschaft für Schwerionenforschung]] (see also 'Trivia', below) and the four Institutes of the [[Fraunhofer Society]]. The European Space Operations Center ([[ESOC]]) of the [[European Space Agency]] is located in Darmstadt, as is [[EUMETSAT]], which operates [[meteorological]] [[satellite]]s. Darmstadt is a centre for the pharmaceutical and chemical industry, with [[Merck KGaA|Merck]], Röhm and Schenck RoTec (part of The Dürr Group) having their main plants and centres here."); - sb.append(LF); - sb.append("The [http://www.jazzinstitut.de Jazz-Institut Darmstadt] is Germany's largest publicly accessible [[Jazz]] archive."); - sb.append(LF); - sb.append("The [http://www.imd.darmstadt.de/ Internationales Musikinstitut Darmstadt], harboring one of the world's largest collections of [[post-war]] [[sheet music]], also hosts the biannual ''[[International Summer Courses for New Music|Internationale Ferienkurse für Neue Musik]]'', a summer school in [[contemporary classical music]] founded by [[Wolfgang Steinecke]]. A large number of avant-garde [[composer]]s have attended and given lectures there, including [[Olivier Messiaen]], [[Luciano Berio]], [[Milton Babbitt]], [[Pierre Boulez]], [[John Cage]], [[György Ligeti]], [[Iannis Xenakis]], [[Karlheinz Stockhausen]] and [[Mauricio Kagel]]."); - sb.append(LF); - sb.append("The [http://www.deutscheakademie.de/ Deutsche Akademie für Sprache und Dichtung] (German Academy for Language and Poetry) provides writers and scholars with a place to research the German language. 
The Academy's annual ''Georg-Büchner-Preis'', named in memory of [[Georg Büchner]], is considered the most renowned literary award for writers of German language."); - sb.append(LF); - sb.append(LF); - sb.append("== Sons and Daughters of the City =="); - sb.append(LF); - sb.append("* Justus von Liebig, Chemist "); - sb.append(LF); - sb.append("* Georg Büchner, German Poet"); - sb.append(LF); - sb.append("* [[Dr Walter Köbel]], German politician"); - sb.append(LF); - sb.append("* [[August Anton Ullrich]], German industrialist (1865-1919)"); - sb.append(LF); - sb.append("* [[Fabian Scheuermann]], World traveller"); - sb.append(LF); - sb.append("* [[Björn Phau]], Tennis player"); - sb.append(LF); - sb.append("* [[Friedrich August Kekulé von Stradonitz]], Organic Chemist"); - sb.append(LF); - sb.append(LF); - sb.append("== Military =="); - sb.append(LF); - sb.append("There are still [[U.S. Army]] personnel stationed in the Darmstadt area. Just outside the Darmstadt centre is the U.S. Army Garrison Darmstadt on Cambrai-Fritsch Kaserne. The barracks was originally built in the 1930s as two separate German Army barracks (Cambrai Kaserne and Freiherr von Fritsch Kaserne). "); - sb.append(LF); - sb.append("It is possible to listen to the military entertainment radio for the American troops in the region. The station is called [[American Forces Network|AFN Europe]] and broadcasts from Frankfurt on FM 98.7 or AM 873."); - sb.append(LF); - sb.append("The base has already started deactivation and will be closed around 2008-2010, at that time AFN Europe will be moved to Mannheim."); - sb.append(LF); - sb.append(LF); - sb.append("== Trivia =="); - sb.append(LF); - sb.append("Literally translated, the German name \"Darmstadt\" means \"City of the intestine\". But that is just a coincidence, as the name derives from the medieval name \"darmundestat\". The Darm(bach) is a small creek running through the city."); - sb.append(LF); - sb.append("The [[chemical element]] [[Darmstadtium]] ([[atomic number]]: 110), first discovered at the [[Gesellschaft für Schwerionenforschung]] was named after the city in 2003, making Darmstadt only the sixth city with an element named after it (the other five cities are [[Ytterby]] in [[Sweden]] (four elements); [[Strontian]] in [[Scotland]]; [[Copenhagen]] in [[Denmark]] (whose latin name gives [[Hafnium]]); [[Berkeley, California]]; and [[Dubna]] in [[Russia]]). 
[[Meitnerium]] ([[atomic number]]: 109) (1982), [[Hassium]] ([[atomic number]]: 108) (1984) and [[Roentgenium]] ([[atomic number]]: 111) (1994) and [[Ununbium]] ([[atomic number]]: 112) (1996) were also synthesized in this facility."); - sb.append(LF); - sb.append("Darmstadt also happens to be one of the small number of cities worldwide which do not lie close to a river or coast."); - sb.append(LF); - sb.append("Darmstadt is the home of [[Software AG]], a software company."); - sb.append(LF); - sb.append("Frankenstein Castle, ''[http://de.wikipedia.org/wiki/Burg_Frankenstein_%28Bergstrasse%29 Burg Frankenstein]'' (in German), possibly Mary Shelley's inspiration for the title of her famous 1818 novel ''[[Frankenstein | Frankenstein; or, The Modern Prometheus]]'', is located nearby."); - sb.append(LF); - sb.append(LF); - sb.append("== Twinning =="); - sb.append(LF); - sb.append("Darmstadt is [[twinned]] with:"); - sb.append(LF); - sb.append(LF); - sb.append("*{{flagicon|Netherlands}}[[Alkmaar]], [[Netherlands]]"); - sb.append(LF); - sb.append("*{{flagicon|Italy}}[[Brescia]], [[Italy]]"); - sb.append(LF); - sb.append("*{{flagicon|Turkey}}[[Bursa, Turkey|Bursa]], [[Turkey]]"); - sb.append(LF); - sb.append("*{{flagicon|United Kingdom}}[[Chesterfield]], [[United Kingdom|UK]]"); - sb.append(LF); - sb.append("*{{flagicon|Austria}}[[Graz]], [[Austria]]"); - sb.append(LF); - sb.append("*{{flagicon|Latvia}}[[Liepaja]], [[Latvia]]"); - sb.append(LF); - sb.append("*{{flagicon|Spain}}[[Logroño]], [[Spain]]"); - sb.append(LF); - sb.append("*{{flagicon|Poland}}[[Płock]], [[Poland]]"); - sb.append(LF); - sb.append("*{{flagicon|Hungary}}[[Szeged]], [[Hungary]]"); - sb.append(LF); - sb.append("*{{flagicon|Norway}}[[Trondheim]], [[Norway]]"); - sb.append(LF); - sb.append("*{{flagicon|France}}[[Troyes]], [[France]]"); - sb.append(LF); - sb.append("*{{flagicon|Ukraine}}[[Uzhhorod]], [[Ukraine]]"); - sb.append(LF); - sb.append("*{{flagicon|Switzerland}}[[Saanen]], [[Switzerland]]"); - sb.append(LF); - sb.append(LF); - sb.append("==External links=="); - sb.append(LF); - sb.append("{{commonscat|Darmstadt, Germany}}"); - sb.append(LF); - sb.append("*[http://www.darmstadt.de/ Official site of the city of Darmstadt] (German, English)"); - sb.append(LF); - sb.append("*[[wikitravel:Darmstadt|Darmstadt]] on [[wikitravel:Main Page|Wikitravel]]"); - sb.append(LF); - sb.append("*[http://www.mathildenhoehe.info Mathildenhoehe]"); - sb.append(LF); - sb.append("*[http://public-transport.net/bim/Darmstadt.htm Details of Trams and Buses used in Darmstadt]"); - sb.append(LF); - sb.append("*[http://www.rmv.de/ Public Transport in Darmstadt - Maps, Timetables, Fares]"); - sb.append(LF); - sb.append("*[http://sites-of-memory.de/main/location.html#darmstadt War memorials in Darmstadt]"); - sb.append(LF); - sb.append("*[http://www.darmstadt.army.mil/ Webpage of the U.S. 
army in Darmstadt]"); - sb.append(LF); - sb.append(LF); - sb.append("===Notable institutions==="); - sb.append(LF); - sb.append("* [http://www.tu-darmstadt.de/index.en.html Darmstadt University of Technology]"); - sb.append(LF); - sb.append("* [http://www.hochschule-darmstadt.de/engl/engl.htm University of Applied Sciences Darmstadt]"); - sb.append(LF); - sb.append("* [http://www.igd.fraunhofer.de/ Fraunhofer Institute for Computer Graphics]"); - sb.append(LF); - sb.append("* [http://www.sit.fraunhofer.de/ Fraunhofer Institute for Secure Information Technology]"); - sb.append(LF); - sb.append("* [http://www.ipsi.fraunhofer.de/ Fraunhofer Institute for Integrated Publication and Information Systems]"); - sb.append(LF); - sb.append("* [http://www.lbf.fhg.de/ Fraunhofer Institute for Structural Durability]"); - sb.append(LF); - sb.append("* [http://www.deutscheakademie.de/ Deutsche Akademie für Sprache und Dichtung]"); - sb.append(LF); - sb.append("* [http://www.gsi.de/ Gesellschaft für Schwerionenforschung]"); - sb.append(LF); - sb.append("* [http://www.esa.int/SPECIALS/ESOC/ European Space Operations Centre] (ESOC)"); - sb.append(LF); - sb.append("* [http://www.eumetsat.int/ European Organisation for the Exploitation of Meteorological Satellites (EUMETSAT)]"); - sb.append(LF); - sb.append(LF); - sb.append("[[Category:Cities in Hesse]]"); - sb.append(LF); - sb.append("[[Category:Merck]]"); - sb.append(LF); - sb.append("[[ar:دارمشتادت]]"); - sb.append(LF); - sb.append("[[an:Darmstadt]]"); - sb.append(LF); - sb.append("[[bg:Дармщат]]"); - sb.append(LF); - sb.append("[[ca:Darmstadt]]"); - sb.append(LF); - sb.append("[[cs:Darmstadt]]"); - sb.append(LF); - sb.append("[[da:Darmstadt]]"); - sb.append(LF); - sb.append("[[de:Darmstadt]]"); - sb.append(LF); - sb.append("[[et:Darmstadt]]"); - sb.append(LF); - sb.append("[[el:Ντάρμστατ]]"); - sb.append(LF); - sb.append("[[es:Darmstadt]]"); - sb.append(LF); - sb.append("[[eo:Darmstadt]]"); - sb.append(LF); - sb.append("[[fr:Darmstadt]]"); - sb.append(LF); - sb.append("[[ko:다름슈타트]]"); - sb.append(LF); - sb.append("[[id:Darmstadt]]"); - sb.append(LF); - sb.append("[[it:Darmstadt]]"); - sb.append(LF); - sb.append("[[la:Darmstadium]]"); - sb.append(LF); - sb.append("[[hu:Darmstadt]]"); - sb.append(LF); - sb.append("[[nl:Darmstadt]]"); - sb.append(LF); - sb.append("[[ja:ダルムシュタット]]"); - sb.append(LF); - sb.append("[[no:Darmstadt]]"); - sb.append(LF); - sb.append("[[nds:Darmstadt]]"); - sb.append(LF); - sb.append("[[pl:Darmstadt]]"); - sb.append(LF); - sb.append("[[pt:Darmstadt]]"); - sb.append(LF); - sb.append("[[ro:Darmstadt]]"); - sb.append(LF); - sb.append("[[ru:Дармштадт]]"); - sb.append(LF); - sb.append("[[simple:Darmstadt]]"); - sb.append(LF); - sb.append("[[fi:Darmstadt]]"); - sb.append(LF); - sb.append("[[sv:Darmstadt]]"); - sb.append(LF); - sb.append("[[tr:Darmstadt]]"); - sb.append(LF); - sb.append("[[vo:Darmstadt]]"); - sb.append(LF); - sb.append("[[zh:达姆施塔特]]"); - sb.append(LF); + public static String getFileText() + { + StringBuilder sb = new StringBuilder(); + sb.append( + "'''Darmstadt''' is a city in the [[States of Germany|Bundesland]] (federal state) of [[Hesse]]n in [[Germany]]. As of 2005, its population was 139,000. 
The city is located in the southern part of the [[Frankfurt Rhine Main Area|Rhine Main Metropolitan Area]]."); + sb.append(LF); + sb.append(LF); + sb.append("== History =="); + sb.append(LF); + sb.append( + "The name Darmstadt first appears towards the end of the [[11th century]], then ''Darmundestat''; Darmstadt was chartered as a city by the [[Holy Roman Emperor]] [[Louis IV, Holy Roman Emperor|Ludwig the Bavarian]] in 1330. The seat of the ruling [[Landgraf|landgraves]] (1567-1806) and thereafter (to 1918) to the [[Grand Duke of Hesse and by Rhine|Grand Dukes of Hesse]], the city grew in population during the [[19th century]] from little over 10,000 to 72,000 inhabitants. A polytechnical school, which later became a Technical University now known as [[Darmstadt University of Technology|TU Darmstadt]], was established in 1877. "); + sb.append(LF); + sb.append( + "In the beginning of the 20th Century Darmstadt was an important centre for the art movement of [[Art Nouveau|Jugendstil]], the German variant of [[Art Nouveau]]. Annual architectural competitions led to the building of many architectural treasures of this period. Also during this period, in [[1912]] the chemist [[Anton Kollisch]], working for the pharmaceutical company [[Merck]], first synthesised the chemical [[MDMA]] (ecstasy) in Darmstadt."); + sb.append(LF); + sb.append( + "Darmstadt's municipal area was extended in 1937 to include the neighbouring localities of Arheilgen [not Arheil''i''gen] and Eberstadt, and in 1938 the city was separated administratively from the surrounding district (''Kreis''). Its old city centre was largely destroyed in a [[Bombing of Darmstadt in World War II|British bombing raid]] of [[September 11]] [[1944]], which killed an estimated 12,300 inhabitants and rendered 66,000 homeless. Most of Darmstadt's 3000 [[Jew]]s were killed by the [[Nazism|Nazi]] regime between 1933 and 1945. "); + sb.append(LF); + sb.append( + "Darmstadt is home to many technology companies and research institutes, and has been promoting itself as a \"city of science\" since 1997. It is well known as the high-tech centre in the vicinity of [[Frankfurt International Airport|Frankfurt Airport]], with important activities in spacecraft operations, chemistry, pharmacy, information technology, biotechnology, telecommunications and mechatronics. The [[Darmstadt University of Technology|TU Darmstadt]] is one of the important technical institutes in Germany and is well known for its research and teaching in the Electrical, Mechanical and Civil Engineering disciplines."); + sb.append(LF); + sb.append(LF); + sb.append("== Institutions =="); + sb.append(LF); + sb.append( + "Darmstadt is the site of one of the leading German universities, the [[Darmstadt University of Technology]], renowned for its engineering departments and the [[Darmstadt University of Applied Sciences]]. Related institutes are the [[Gesellschaft für Schwerionenforschung]] (see also 'Trivia', below) and the four Institutes of the [[Fraunhofer Society]]. The European Space Operations Center ([[ESOC]]) of the [[European Space Agency]] is located in Darmstadt, as is [[EUMETSAT]], which operates [[meteorological]] [[satellite]]s. 
Darmstadt is a centre for the pharmaceutical and chemical industry, with [[Merck KGaA|Merck]], Röhm and Schenck RoTec (part of The Dürr Group) having their main plants and centres here."); + sb.append(LF); + sb.append( + "The [http://www.jazzinstitut.de Jazz-Institut Darmstadt] is Germany's largest publicly accessible [[Jazz]] archive."); + sb.append(LF); + sb.append( + "The [http://www.imd.darmstadt.de/ Internationales Musikinstitut Darmstadt], harboring one of the world's largest collections of [[post-war]] [[sheet music]], also hosts the biannual ''[[International Summer Courses for New Music|Internationale Ferienkurse für Neue Musik]]'', a summer school in [[contemporary classical music]] founded by [[Wolfgang Steinecke]]. A large number of avant-garde [[composer]]s have attended and given lectures there, including [[Olivier Messiaen]], [[Luciano Berio]], [[Milton Babbitt]], [[Pierre Boulez]], [[John Cage]], [[György Ligeti]], [[Iannis Xenakis]], [[Karlheinz Stockhausen]] and [[Mauricio Kagel]]."); + sb.append(LF); + sb.append( + "The [http://www.deutscheakademie.de/ Deutsche Akademie für Sprache und Dichtung] (German Academy for Language and Poetry) provides writers and scholars with a place to research the German language. The Academy's annual ''Georg-Büchner-Preis'', named in memory of [[Georg Büchner]], is considered the most renowned literary award for writers of German language."); + sb.append(LF); + sb.append(LF); + sb.append("== Sons and Daughters of the City =="); + sb.append(LF); + sb.append("* Justus von Liebig, Chemist "); + sb.append(LF); + sb.append("* Georg Büchner, German Poet"); + sb.append(LF); + sb.append("* [[Dr Walter Köbel]], German politician"); + sb.append(LF); + sb.append("* [[August Anton Ullrich]], German industrialist (1865-1919)"); + sb.append(LF); + sb.append("* [[Fabian Scheuermann]], World traveller"); + sb.append(LF); + sb.append("* [[Björn Phau]], Tennis player"); + sb.append(LF); + sb.append("* [[Friedrich August Kekulé von Stradonitz]], Organic Chemist"); + sb.append(LF); + sb.append(LF); + sb.append("== Military =="); + sb.append(LF); + sb.append( + "There are still [[U.S. Army]] personnel stationed in the Darmstadt area. Just outside the Darmstadt centre is the U.S. Army Garrison Darmstadt on Cambrai-Fritsch Kaserne. The barracks was originally built in the 1930s as two separate German Army barracks (Cambrai Kaserne and Freiherr von Fritsch Kaserne). "); + sb.append(LF); + sb.append( + "It is possible to listen to the military entertainment radio for the American troops in the region. The station is called [[American Forces Network|AFN Europe]] and broadcasts from Frankfurt on FM 98.7 or AM 873."); + sb.append(LF); + sb.append( + "The base has already started deactivation and will be closed around 2008-2010, at that time AFN Europe will be moved to Mannheim."); + sb.append(LF); + sb.append(LF); + sb.append("== Trivia =="); + sb.append(LF); + sb.append( + "Literally translated, the German name \"Darmstadt\" means \"City of the intestine\". But that is just a coincidence, as the name derives from the medieval name \"darmundestat\". 
The Darm(bach) is a small creek running through the city."); + sb.append(LF); + sb.append( + "The [[chemical element]] [[Darmstadtium]] ([[atomic number]]: 110), first discovered at the [[Gesellschaft für Schwerionenforschung]] was named after the city in 2003, making Darmstadt only the sixth city with an element named after it (the other five cities are [[Ytterby]] in [[Sweden]] (four elements); [[Strontian]] in [[Scotland]]; [[Copenhagen]] in [[Denmark]] (whose latin name gives [[Hafnium]]); [[Berkeley, California]]; and [[Dubna]] in [[Russia]]). [[Meitnerium]] ([[atomic number]]: 109) (1982), [[Hassium]] ([[atomic number]]: 108) (1984) and [[Roentgenium]] ([[atomic number]]: 111) (1994) and [[Ununbium]] ([[atomic number]]: 112) (1996) were also synthesized in this facility."); + sb.append(LF); + sb.append( + "Darmstadt also happens to be one of the small number of cities worldwide which do not lie close to a river or coast."); + sb.append(LF); + sb.append("Darmstadt is the home of [[Software AG]], a software company."); + sb.append(LF); + sb.append( + "Frankenstein Castle, ''[http://de.wikipedia.org/wiki/Burg_Frankenstein_%28Bergstrasse%29 Burg Frankenstein]'' (in German), possibly Mary Shelley's inspiration for the title of her famous 1818 novel ''[[Frankenstein | Frankenstein; or, The Modern Prometheus]]'', is located nearby."); + sb.append(LF); + sb.append(LF); + sb.append("== Twinning =="); + sb.append(LF); + sb.append("Darmstadt is [[twinned]] with:"); + sb.append(LF); + sb.append(LF); + sb.append("*{{flagicon|Netherlands}}[[Alkmaar]], [[Netherlands]]"); + sb.append(LF); + sb.append("*{{flagicon|Italy}}[[Brescia]], [[Italy]]"); + sb.append(LF); + sb.append("*{{flagicon|Turkey}}[[Bursa, Turkey|Bursa]], [[Turkey]]"); + sb.append(LF); + sb.append("*{{flagicon|United Kingdom}}[[Chesterfield]], [[United Kingdom|UK]]"); + sb.append(LF); + sb.append("*{{flagicon|Austria}}[[Graz]], [[Austria]]"); + sb.append(LF); + sb.append("*{{flagicon|Latvia}}[[Liepaja]], [[Latvia]]"); + sb.append(LF); + sb.append("*{{flagicon|Spain}}[[Logroño]], [[Spain]]"); + sb.append(LF); + sb.append("*{{flagicon|Poland}}[[Płock]], [[Poland]]"); + sb.append(LF); + sb.append("*{{flagicon|Hungary}}[[Szeged]], [[Hungary]]"); + sb.append(LF); + sb.append("*{{flagicon|Norway}}[[Trondheim]], [[Norway]]"); + sb.append(LF); + sb.append("*{{flagicon|France}}[[Troyes]], [[France]]"); + sb.append(LF); + sb.append("*{{flagicon|Ukraine}}[[Uzhhorod]], [[Ukraine]]"); + sb.append(LF); + sb.append("*{{flagicon|Switzerland}}[[Saanen]], [[Switzerland]]"); + sb.append(LF); + sb.append(LF); + sb.append("==External links=="); + sb.append(LF); + sb.append("{{commonscat|Darmstadt, Germany}}"); + sb.append(LF); + sb.append( + "*[http://www.darmstadt.de/ Official site of the city of Darmstadt] (German, English)"); + sb.append(LF); + sb.append("*[[wikitravel:Darmstadt|Darmstadt]] on [[wikitravel:Main Page|Wikitravel]]"); + sb.append(LF); + sb.append("*[http://www.mathildenhoehe.info Mathildenhoehe]"); + sb.append(LF); + sb.append( + "*[http://public-transport.net/bim/Darmstadt.htm Details of Trams and Buses used in Darmstadt]"); + sb.append(LF); + sb.append("*[http://www.rmv.de/ Public Transport in Darmstadt - Maps, Timetables, Fares]"); + sb.append(LF); + sb.append( + "*[http://sites-of-memory.de/main/location.html#darmstadt War memorials in Darmstadt]"); + sb.append(LF); + sb.append("*[http://www.darmstadt.army.mil/ Webpage of the U.S. 
army in Darmstadt]"); + sb.append(LF); + sb.append(LF); + sb.append("===Notable institutions==="); + sb.append(LF); + sb.append( + "* [http://www.tu-darmstadt.de/index.en.html Darmstadt University of Technology]"); + sb.append(LF); + sb.append( + "* [http://www.hochschule-darmstadt.de/engl/engl.htm University of Applied Sciences Darmstadt]"); + sb.append(LF); + sb.append("* [http://www.igd.fraunhofer.de/ Fraunhofer Institute for Computer Graphics]"); + sb.append(LF); + sb.append( + "* [http://www.sit.fraunhofer.de/ Fraunhofer Institute for Secure Information Technology]"); + sb.append(LF); + sb.append( + "* [http://www.ipsi.fraunhofer.de/ Fraunhofer Institute for Integrated Publication and Information Systems]"); + sb.append(LF); + sb.append("* [http://www.lbf.fhg.de/ Fraunhofer Institute for Structural Durability]"); + sb.append(LF); + sb.append("* [http://www.deutscheakademie.de/ Deutsche Akademie für Sprache und Dichtung]"); + sb.append(LF); + sb.append("* [http://www.gsi.de/ Gesellschaft für Schwerionenforschung]"); + sb.append(LF); + sb.append("* [http://www.esa.int/SPECIALS/ESOC/ European Space Operations Centre] (ESOC)"); + sb.append(LF); + sb.append( + "* [http://www.eumetsat.int/ European Organisation for the Exploitation of Meteorological Satellites (EUMETSAT)]"); + sb.append(LF); + sb.append(LF); + sb.append("[[Category:Cities in Hesse]]"); + sb.append(LF); + sb.append("[[Category:Merck]]"); + sb.append(LF); + sb.append("[[ar:دارمشتادت]]"); + sb.append(LF); + sb.append("[[an:Darmstadt]]"); + sb.append(LF); + sb.append("[[bg:Дармщат]]"); + sb.append(LF); + sb.append("[[ca:Darmstadt]]"); + sb.append(LF); + sb.append("[[cs:Darmstadt]]"); + sb.append(LF); + sb.append("[[da:Darmstadt]]"); + sb.append(LF); + sb.append("[[de:Darmstadt]]"); + sb.append(LF); + sb.append("[[et:Darmstadt]]"); + sb.append(LF); + sb.append("[[el:Ντάρμστατ]]"); + sb.append(LF); + sb.append("[[es:Darmstadt]]"); + sb.append(LF); + sb.append("[[eo:Darmstadt]]"); + sb.append(LF); + sb.append("[[fr:Darmstadt]]"); + sb.append(LF); + sb.append("[[ko:다름슈타트]]"); + sb.append(LF); + sb.append("[[id:Darmstadt]]"); + sb.append(LF); + sb.append("[[it:Darmstadt]]"); + sb.append(LF); + sb.append("[[la:Darmstadium]]"); + sb.append(LF); + sb.append("[[hu:Darmstadt]]"); + sb.append(LF); + sb.append("[[nl:Darmstadt]]"); + sb.append(LF); + sb.append("[[ja:ダルムシュタット]]"); + sb.append(LF); + sb.append("[[no:Darmstadt]]"); + sb.append(LF); + sb.append("[[nds:Darmstadt]]"); + sb.append(LF); + sb.append("[[pl:Darmstadt]]"); + sb.append(LF); + sb.append("[[pt:Darmstadt]]"); + sb.append(LF); + sb.append("[[ro:Darmstadt]]"); + sb.append(LF); + sb.append("[[ru:Дармштадт]]"); + sb.append(LF); + sb.append("[[simple:Darmstadt]]"); + sb.append(LF); + sb.append("[[fi:Darmstadt]]"); + sb.append(LF); + sb.append("[[sv:Darmstadt]]"); + sb.append(LF); + sb.append("[[tr:Darmstadt]]"); + sb.append(LF); + sb.append("[[vo:Darmstadt]]"); + sb.append(LF); + sb.append("[[zh:达姆施塔特]]"); + sb.append(LF); - return sb.toString(); - } + return sb.toString(); + } } From d3587d6a82a8f289cb49c2ccdcae48d054c4d368 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Tue, 31 Oct 2023 14:27:54 +0100 Subject: [PATCH 12/14] #164 - Introduce checkstyle - Auto-format dkpro-jwpl-util --- .../jwpl/util/revisions/RevisionUtils.java | 178 +- .../jwpl/util/templates/RevisionPair.java | 295 +- .../dkpro/jwpl/util/templates/TextPair.java | 531 ++-- .../util/templates/WikipediaTemplateInfo.java | 2824 +++++++++-------- .../generator/GeneratorConstants.java | 9 +- 
.../generator/simple/GeneratorMode.java | 17 +- .../generator/simple/TemplateFilter.java | 184 +- .../simple/TemplateInfoGeneratorStarter.java | 361 ++- .../WikipediaTemplateInfoDumpWriter.java | 340 +- .../WikipediaTemplateInfoGenerator.java | 564 ++-- .../util/templates/parser/ParseUtils.java | 232 +- .../parser/SectionExtractionTest.java | 55 +- .../templates/parser/SectionExtractor.java | 449 +-- 13 files changed, 3244 insertions(+), 2795 deletions(-) diff --git a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/revisions/RevisionUtils.java b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/revisions/RevisionUtils.java index 08b92cb5..66b3a06c 100644 --- a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/revisions/RevisionUtils.java +++ b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/revisions/RevisionUtils.java @@ -32,99 +32,113 @@ /** * Provides several revision-related utilities that should not be part of the RevisionMachine - * package because of dependencies to the JWPL API. RevisionMachine should stay independent - * of the RevisionMachine + * package because of dependencies to the JWPL API. RevisionMachine should stay independent of the + * RevisionMachine * * @deprecated To be removed in future releases. */ @Deprecated(since = "2.0.0", forRemoval = true) -public class RevisionUtils { - private RevisionApi revApi; - private Wikipedia wiki; - - public RevisionUtils(DatabaseConfiguration conf) throws WikiApiException { - wiki = new Wikipedia(conf); - revApi = new RevisionApi(conf); - } - - public RevisionUtils(Wikipedia wiki, RevisionApi revApi) throws WikiApiException { - this.revApi = revApi; - this.wiki = wiki; - } - - /** - * For a given article revision, the method returns the revision of the article discussion - * page which was current at the time the revision was created. 
- * - * @param revisionId revision of the article for which the talk page revision should be retrieved - * @return the revision of the talk page that was current at the creation time of the given article revision - * @throws WikiApiException if any error occurred accessing the Wiki db - * @throws WikiPageNotFoundException if no discussion page was available at the time of the given article revision - */ - public Revision getDiscussionRevisionForArticleRevision(int revisionId) throws WikiApiException, WikiPageNotFoundException { - //get article revision - Revision rev = revApi.getRevision(revisionId); - Timestamp revTime = rev.getTimeStamp(); - - //get corresponding discussion page - Page discussion = wiki.getDiscussionPage(rev.getArticleID()); - - /* - * find correct revision of discussion page - */ - List discussionTs = revApi.getRevisionTimestamps(discussion.getPageId()); - - // sort in reverse order - newest first - discussionTs.sort(Comparator.reverseOrder()); - - //find first timestamp equal to or before the article revision timestamp - for (Timestamp curDiscTime : discussionTs) { - if (curDiscTime == revTime || curDiscTime.before(revTime)) { - return revApi.getRevision(discussion.getPageId(), curDiscTime); - } +public class RevisionUtils +{ + private RevisionApi revApi; + private Wikipedia wiki; + + public RevisionUtils(DatabaseConfiguration conf) throws WikiApiException + { + wiki = new Wikipedia(conf); + revApi = new RevisionApi(conf); } - throw new WikiPageNotFoundException("Not discussion page was available at the time of the given article revision"); - } - - - /** - * For a given article revision, the method returns the revisions of the archived article discussion - * pages which were available at the time of the article revision - * - * @param revisionId revision of the article for which the talk page archive revisions should be retrieved - * @return the revisions of the talk page archives that were available at the time of the article revision - */ - public List getDiscussionArchiveRevisionsForArticleRevision(int revisionId) throws WikiApiException, WikiPageNotFoundException { - List result = new LinkedList<>(); + public RevisionUtils(Wikipedia wiki, RevisionApi revApi) throws WikiApiException + { + this.revApi = revApi; + this.wiki = wiki; + } - //get article revision - Revision rev = revApi.getRevision(revisionId); - Timestamp revTime = rev.getTimeStamp(); + /** + * For a given article revision, the method returns the revision of the article discussion page + * which was current at the time the revision was created. 
+ * + * @param revisionId + * revision of the article for which the talk page revision should be retrieved + * @return the revision of the talk page that was current at the creation time of the given + * article revision + * @throws WikiApiException + * if any error occurred accessing the Wiki db + * @throws WikiPageNotFoundException + * if no discussion page was available at the time of the given article revision + */ + public Revision getDiscussionRevisionForArticleRevision(int revisionId) + throws WikiApiException, WikiPageNotFoundException + { + // get article revision + Revision rev = revApi.getRevision(revisionId); + Timestamp revTime = rev.getTimeStamp(); + + // get corresponding discussion page + Page discussion = wiki.getDiscussionPage(rev.getArticleID()); + + /* + * find correct revision of discussion page + */ + List discussionTs = revApi.getRevisionTimestamps(discussion.getPageId()); + + // sort in reverse order - newest first + discussionTs.sort(Comparator.reverseOrder()); + + // find first timestamp equal to or before the article revision timestamp + for (Timestamp curDiscTime : discussionTs) { + if (curDiscTime == revTime || curDiscTime.before(revTime)) { + return revApi.getRevision(discussion.getPageId(), curDiscTime); + } + } - //get corresponding discussion archives - Iterable discArchives = wiki.getDiscussionArchives(rev.getArticleID()); + throw new WikiPageNotFoundException( + "Not discussion page was available at the time of the given article revision"); + } - /* - * for each discussion archive, find correct revision of discussion page + /** + * For a given article revision, the method returns the revisions of the archived article + * discussion pages which were available at the time of the article revision + * + * @param revisionId + * revision of the article for which the talk page archive revisions should be + * retrieved + * @return the revisions of the talk page archives that were available at the time of the + * article revision */ - for (Page discArchive : discArchives) { - //get revision timestamps for the current discussion archive - List discussionTs = revApi.getRevisionTimestamps(discArchive.getPageId()); - - // sort in reverse order - newest first - discussionTs.sort(Comparator.reverseOrder()); - - //find first timestamp equal to or before the article revision timestamp - for (Timestamp curDiscTime : discussionTs) { - if (curDiscTime == revTime || curDiscTime.before(revTime)) { - result.add(revApi.getRevision(discArchive.getPageId(), curDiscTime)); - break; + public List getDiscussionArchiveRevisionsForArticleRevision(int revisionId) + throws WikiApiException, WikiPageNotFoundException + { + List result = new LinkedList<>(); + + // get article revision + Revision rev = revApi.getRevision(revisionId); + Timestamp revTime = rev.getTimeStamp(); + + // get corresponding discussion archives + Iterable discArchives = wiki.getDiscussionArchives(rev.getArticleID()); + + /* + * for each discussion archive, find correct revision of discussion page + */ + for (Page discArchive : discArchives) { + // get revision timestamps for the current discussion archive + List discussionTs = revApi.getRevisionTimestamps(discArchive.getPageId()); + + // sort in reverse order - newest first + discussionTs.sort(Comparator.reverseOrder()); + + // find first timestamp equal to or before the article revision timestamp + for (Timestamp curDiscTime : discussionTs) { + if (curDiscTime == revTime || curDiscTime.before(revTime)) { + result.add(revApi.getRevision(discArchive.getPageId(), 
curDiscTime)); + break; + } + } } - } - } - return result; - } + return result; + } } diff --git a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/RevisionPair.java b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/RevisionPair.java index b892b03a..70d3cec3 100644 --- a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/RevisionPair.java +++ b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/RevisionPair.java @@ -27,154 +27,183 @@ import org.dkpro.jwpl.util.templates.parser.SectionExtractor.ExtractedSection; /** - * Represents a pair of (adjacent) revisions. In the second pair part (=after) a - * template has been added or removed (depending on the mode). + * Represents a pair of (adjacent) revisions. In the second pair part (=after) a template has been + * added or removed (depending on the mode). */ -public class RevisionPair implements Serializable { +public class RevisionPair + implements Serializable +{ - private static final long serialVersionUID = -1958556838478438963L; - private final Revision before; - private final Revision after; - private final String template; - private final RevisionPairType revPairType; + private static final long serialVersionUID = -1958556838478438963L; + private final Revision before; + private final Revision after; + private final String template; + private final RevisionPairType revPairType; - public RevisionPair(Revision before, Revision after, String template, - RevisionPairType revPairType) { - this.before = before; - this.after = after; - this.template = template; - this.revPairType = revPairType; - } + public RevisionPair(Revision before, Revision after, String template, + RevisionPairType revPairType) + { + this.before = before; + this.after = after; + this.template = template; + this.revPairType = revPairType; + } - /** - * @return revision before the template change - */ - public Revision getBeforeRevision() { - return before; - } + /** + * @return revision before the template change + */ + public Revision getBeforeRevision() + { + return before; + } - /** - * @return revision after the template change - */ - public Revision getAfterRevision() { - return after; - } + /** + * @return revision after the template change + */ + public Revision getAfterRevision() + { + return after; + } - /** - * @return the template that has been added or removed - */ - public String getTemplate() { - return template; - } + /** + * @return the template that has been added or removed + */ + public String getTemplate() + { + return template; + } - /** - * @return the type of template change - */ - public RevisionPairType getType() { - return revPairType; - } + /** + * @return the type of template change + */ + public RevisionPairType getType() + { + return revPairType; + } - /** - * Returns the text "around the given template" and returns the corresponding - * text in the other pair part of the RevisionPair. - *

- * Currently, this is done section-based. On TextPairPart contains a section - * with a template and the other contains the corresponding section - * after the template has been deleted (in deleteTemplate mode) or before - * it has been added (in addTemplate mode). - *

- * Note that this only makes sense for inline- or section-templates. - *

- * The section-matching is currently done simply by matching section titles. - * If the title has changed, no match will be found. - * - * @param markTemplates sets whether to add an inline marker for the template - * @return a pair of strings corresponding to the before-revision and - * after-revision - */ - public List getInlineTextPairs(boolean markTemplates) { - List pairList = new ArrayList<>(); + /** + * Returns the text "around the given template" and returns the corresponding text in the other + * pair part of the RevisionPair. + *

+ * Currently, this is done section-based. One TextPair part contains a section with a template and + * the other contains the corresponding section after the template has been deleted (in + * deleteTemplate mode) or before it has been added (in addTemplate mode). + *

+ * Note that this only makes sense for inline- or section-templates. + *

+ * The section-matching is currently done simply by matching section titles. If the title has + * changed, no match will be found. + * + * @param markTemplates + * sets whether to add an inline marker for the template + * @return a pair of strings corresponding to the before-revision and after-revision + */ + public List getInlineTextPairs(boolean markTemplates) + { + List pairList = new ArrayList<>(); - try { - //extract sections - List beforeSections; - List afterSections; - if (markTemplates) { - //add inline marker for the template - beforeSections = ParseUtils.getSections(before.getRevisionText(), before.getRevisionID() + "", before.getRevisionID(), Arrays.asList(new String[]{template})); - afterSections = ParseUtils.getSections(after.getRevisionText(), after.getRevisionID() + "", after.getRevisionID(), Arrays.asList(new String[]{template})); - } else { - //no inline markers - beforeSections = ParseUtils.getSections(before.getRevisionText(), before.getRevisionID() + "", before.getRevisionID()); - afterSections = ParseUtils.getSections(after.getRevisionText(), after.getRevisionID() + "", after.getRevisionID()); - } - for (ExtractedSection tplSect : revPairType == RevisionPairType.deleteTemplate ? beforeSections : afterSections) { - // in DELETE-mode, the "before" revision contain the templates - // in ADD-mode, the "after" revision contains the templates - if (containsIgnoreCase(tplSect.getTemplates(), template)) { - // the current sect contains the template we're looking for - // now find the corresponding tpl in the other revisions - for (ExtractedSection nonTplSect : revPairType == RevisionPairType.deleteTemplate ? afterSections : beforeSections) { - // TODO how do we match the sections? - // currently only by title - we could do fuzzy matching - // of the section body - if (tplSect.getTitle() != null && nonTplSect.getTitle() != null && tplSect.getTitle().equalsIgnoreCase(nonTplSect.getTitle())) { - if (revPairType == RevisionPairType.deleteTemplate) { - pairList.add(new TextPair(tplSect.getBody(), nonTplSect.getBody())); - } else { - pairList.add(new TextPair(nonTplSect.getBody(), tplSect.getBody())); - } + try { + // extract sections + List beforeSections; + List afterSections; + if (markTemplates) { + // add inline marker for the template + beforeSections = ParseUtils.getSections(before.getRevisionText(), + before.getRevisionID() + "", before.getRevisionID(), + Arrays.asList(new String[] { template })); + afterSections = ParseUtils.getSections(after.getRevisionText(), + after.getRevisionID() + "", after.getRevisionID(), + Arrays.asList(new String[] { template })); + } + else { + // no inline markers + beforeSections = ParseUtils.getSections(before.getRevisionText(), + before.getRevisionID() + "", before.getRevisionID()); + afterSections = ParseUtils.getSections(after.getRevisionText(), + after.getRevisionID() + "", after.getRevisionID()); + } + for (ExtractedSection tplSect : revPairType == RevisionPairType.deleteTemplate + ? beforeSections + : afterSections) { + // in DELETE-mode, the "before" revision contain the templates + // in ADD-mode, the "after" revision contains the templates + if (containsIgnoreCase(tplSect.getTemplates(), template)) { + // the current sect contains the template we're looking for + // now find the corresponding tpl in the other revisions + for (ExtractedSection nonTplSect : revPairType == RevisionPairType.deleteTemplate + ? afterSections + : beforeSections) { + // TODO how do we match the sections? 
+ // currently only by title - we could do fuzzy matching + // of the section body + if (tplSect.getTitle() != null && nonTplSect.getTitle() != null + && tplSect.getTitle().equalsIgnoreCase(nonTplSect.getTitle())) { + if (revPairType == RevisionPairType.deleteTemplate) { + pairList.add(new TextPair(tplSect.getBody(), nonTplSect.getBody())); + } + else { + pairList.add(new TextPair(nonTplSect.getBody(), tplSect.getBody())); + } + } + } + } } - } } - } - } catch (Exception ex) { - //This happens if a (SWEBLE-)compiler exception occurs.S - //Sometimes, malformed xml items seem to cause class cast exceptions - //in the parser, which is not wrapped in a Compiler exception. - //Therefore, we should catch all exceptions here and return the - //TextPairs identified so far (if any) - System.err.println(ex.getMessage()); - //TODO use logger!! + catch (Exception ex) { + // This happens if a (SWEBLE-)compiler exception occurs.S + // Sometimes, malformed xml items seem to cause class cast exceptions + // in the parser, which is not wrapped in a Compiler exception. + // Therefore, we should catch all exceptions here and return the + // TextPairs identified so far (if any) + System.err.println(ex.getMessage()); + // TODO use logger!! + } + return pairList; } - return pairList; - } - /** - * Checks if a list of string contains a String while ignoring case - * - * @param stringlist a list of string - * @param match the string to look for - * @return true, if the list contains the string, false else - */ - private boolean containsIgnoreCase(List stringlist, String match) { - for (String s : stringlist) { - if (s.equalsIgnoreCase(match)) { - return true; - } + /** + * Checks if a list of string contains a String while ignoring case + * + * @param stringlist + * a list of string + * @param match + * the string to look for + * @return true, if the list contains the string, false else + */ + private boolean containsIgnoreCase(List stringlist, String match) + { + for (String s : stringlist) { + if (s.equalsIgnoreCase(match)) { + return true; + } + } + return false; } - return false; - } - - public enum RevisionPairType { - deleteTemplate, addTemplate - } + public enum RevisionPairType + { + deleteTemplate, addTemplate + } - @Override - public boolean equals(Object anObject) { - if (!(anObject instanceof RevisionPair)) { - return false; - } else { - RevisionPair otherPair = (RevisionPair) anObject; - if (this.getBeforeRevision().getRevisionID() == otherPair.getBeforeRevision().getRevisionID() && - this.getAfterRevision().getRevisionID() == otherPair.getAfterRevision().getRevisionID() && - this.getTemplate().equals(otherPair.getTemplate()) && - this.getType() == otherPair.getType()) { - return true; - } else { - return false; - } + @Override + public boolean equals(Object anObject) + { + if (!(anObject instanceof RevisionPair)) { + return false; + } + else { + RevisionPair otherPair = (RevisionPair) anObject; + if (this.getBeforeRevision().getRevisionID() == otherPair.getBeforeRevision() + .getRevisionID() + && this.getAfterRevision().getRevisionID() == otherPair.getAfterRevision() + .getRevisionID() + && this.getTemplate().equals(otherPair.getTemplate()) + && this.getType() == otherPair.getType()) { + return true; + } + else { + return false; + } + } } - } } diff --git a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/TextPair.java b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/TextPair.java index ea162c76..8e676e2e 100644 --- 
a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/TextPair.java +++ b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/TextPair.java @@ -36,268 +36,289 @@ /** * Represents a pair of Strings. Usually corresponding to a RevisionPair. */ -public class TextPair { - private String beforeText; - private String afterText; - /** - * Holds arbitrary String-MetaData - */ - private Map metaData; - - public TextPair(String before, String after) { - this.beforeText = normalize(before); - this.afterText = normalize(after); - setMetaData(new HashMap<>()); - } - - public String getBeforeText() { - return beforeText; - } - - public List getBeforeLines() { - return sentenceSplit(beforeText); - } - - public void setBeforeText(String beforeText) { - this.beforeText = normalize(beforeText); - } - - public String getAfterText() { - return afterText; - } - - public List getAfterLines() { - return sentenceSplit(afterText); - } - - public void setAfterText(String afterText) { - this.afterText = normalize(afterText); - } - - public Map getMetaData() { - return metaData; - } - - public void setMetaData(Map metaData) { - this.metaData = metaData; - } - - public void addMetaData(String key, String value) { - metaData.put(key, value); - } - - public String getMetaDataValue(String key) { - return metaData.get(key); - } - - - /** - * Returns the patch object that contains all diffs between - * the beforeText and the afterText - * - * @return Patch object with all diffs - */ - public Patch getPatch() { - return DiffUtils.diff(sentenceSplit(beforeText), sentenceSplit(afterText)); - } - - public List getDiffRows(boolean markChangesInline) { - DiffRowGenerator generator = new DiffRowGenerator.Builder() - .showInlineDiffs(markChangesInline) - .columnWidth(Integer.MAX_VALUE) // do not wrap - .build(); - - return generator.generateDiffRows(sentenceSplit(beforeText), sentenceSplit(afterText)); - } - - public String getInlineDiffString() { - StringBuilder diffString = new StringBuilder(); - for (DiffRow row : getDiffRows(true)) { - diffString.append(row.toString()); - diffString.append(System.getProperty("line.separator")); +public class TextPair +{ + private String beforeText; + private String afterText; + /** + * Holds arbitrary String-MetaData + */ + private Map metaData; + + public TextPair(String before, String after) + { + this.beforeText = normalize(before); + this.afterText = normalize(after); + setMetaData(new HashMap<>()); } - return diffString.toString(); - } - - - /** - * Returns the deltas between beforeText and afterText as a line separated String - * using delta.toString() - * For more detailed diffs, use getPatch() or getUnifiedDiffStrings() - * - * @return diffs as line-separated String using delta.toString() - */ - public String getSimpleDiffString() { - StringBuilder deltas = new StringBuilder(); - for (Delta delta : getPatch().getDeltas()) { - deltas.append(delta.toString()); - deltas.append(System.getProperty("line.separator")); + + public String getBeforeText() + { + return beforeText; + } + + public List getBeforeLines() + { + return sentenceSplit(beforeText); + } + + public void setBeforeText(String beforeText) + { + this.beforeText = normalize(beforeText); + } + + public String getAfterText() + { + return afterText; + } + + public List getAfterLines() + { + return sentenceSplit(afterText); + } + + public void setAfterText(String afterText) + { + this.afterText = normalize(afterText); + } + + public Map getMetaData() + { + return metaData; + } + + public void setMetaData(Map 
metaData) + { + this.metaData = metaData; + } + + public void addMetaData(String key, String value) + { + metaData.put(key, value); + } + + public String getMetaDataValue(String key) + { + return metaData.get(key); + } + + /** + * Returns the patch object that contains all diffs between the beforeText and the afterText + * + * @return Patch object with all diffs + */ + public Patch getPatch() + { + return DiffUtils.diff(sentenceSplit(beforeText), sentenceSplit(afterText)); + } + + public List getDiffRows(boolean markChangesInline) + { + DiffRowGenerator generator = new DiffRowGenerator.Builder() + .showInlineDiffs(markChangesInline).columnWidth(Integer.MAX_VALUE) // do not wrap + .build(); + + return generator.generateDiffRows(sentenceSplit(beforeText), sentenceSplit(afterText)); + } + + public String getInlineDiffString() + { + StringBuilder diffString = new StringBuilder(); + for (DiffRow row : getDiffRows(true)) { + diffString.append(row.toString()); + diffString.append(System.getProperty("line.separator")); + } + return diffString.toString(); + } + + /** + * Returns the deltas between beforeText and afterText as a line separated String using + * delta.toString() For more detailed diffs, use getPatch() or getUnifiedDiffStrings() + * + * @return diffs as line-separated String using delta.toString() + */ + public String getSimpleDiffString() + { + StringBuilder deltas = new StringBuilder(); + for (Delta delta : getPatch().getDeltas()) { + deltas.append(delta.toString()); + deltas.append(System.getProperty("line.separator")); + } + return deltas.toString(); + } + + /** + * Returns the deltas between beforeText and afterText as a line separated String using + * delta.toString() For more detailed diffs, use getPatch() or getUnifiedDiffStrings() + * + * @param difftype + * defines the type of diffs to include in the String + * @return diffs as line-separated String using delta.toString() + */ + public String getSimpleDiffString(TYPE difftype) + { + StringBuilder deltas = new StringBuilder(); + for (Delta delta : getPatch().getDeltas()) { + if (delta.getType() == difftype) { + deltas.append(delta); + deltas.append(System.getProperty("line.separator")); + } + } + return deltas.toString(); } - return deltas.toString(); - } - - /** - * Returns the deltas between beforeText and afterText as a line separated String - * using delta.toString() - * For more detailed diffs, use getPatch() or getUnifiedDiffStrings() - * - * @param difftype defines the type of diffs to include in the String - * @return diffs as line-separated String using delta.toString() - */ - public String getSimpleDiffString(TYPE difftype) { - StringBuilder deltas = new StringBuilder(); - for (Delta delta : getPatch().getDeltas()) { - if (delta.getType() == difftype) { - deltas.append(delta); - deltas.append(System.getProperty("line.separator")); - } + + /** + * Returns the deltas between beforeText and afterText as a line separated String. 
For more + * detailed diffs, use getPatch() or getUnifiedDiffStrings() + * + * @return diffs as line-separated String + */ + public String getLongDiffString() + { + StringBuilder deltas = new StringBuilder(); + for (Delta delta : getPatch().getDeltas()) { + deltas.append("DeltaType: " + delta.getType().toString()); + deltas.append(System.getProperty("line.separator")); + deltas.append("Original (Non-Neutral):"); + deltas.append(System.getProperty("line.separator")); + deltas.append(delta.getOriginal()); + deltas.append(System.getProperty("line.separator")); + deltas.append(System.getProperty("line.separator")); + deltas.append("Revised (Neutral):"); + deltas.append(System.getProperty("line.separator")); + deltas.append(delta.getRevised()); + deltas.append(System.getProperty("line.separator")); + } + return deltas.toString(); + } + + /** + * Returns the deltas between beforeText and afterText as a line separated String. For more + * detailed diffs, use getPatch() or getUnifiedDiffStrings() + * + * @param diffType + * defines the type of diffs to include in the String + * @return diffs as line-separated String + */ + public String getLongDiffString(TYPE diffType) + { + StringBuilder deltas = new StringBuilder(); + for (Delta delta : getPatch().getDeltas()) { + if (delta.getType() == diffType) { + deltas.append("Original (Non-Neutral):"); + deltas.append(System.getProperty("line.separator")); + deltas.append(delta.getOriginal()); + deltas.append(System.getProperty("line.separator")); + deltas.append(System.getProperty("line.separator")); + deltas.append("Revised (Neutral):"); + deltas.append(System.getProperty("line.separator")); + deltas.append(delta.getRevised()); + deltas.append(System.getProperty("line.separator")); + deltas.append("*********************************************"); + deltas.append(System.getProperty("line.separator")); + } + } + return deltas.toString(); + } + + /** + * Returns the unified diff between "Before" and "After" containing one sentence per String. + * contextSize defines a window of lines/sentences around each change to display + * + * @param contextSize + * numer of lines/sentences around a change to display + * @return diffs as line-separated String + */ + public List getUnifiedDiffStrings(int contextSize) + { + return DiffUtils.generateUnifiedDiff("Before", "After", sentenceSplit(beforeText), + getPatch(), contextSize); } - return deltas.toString(); - } - - /** - * Returns the deltas between beforeText and afterText as a line separated String. 
- * For more detailed diffs, use getPatch() or getUnifiedDiffStrings() - * - * @return diffs as line-separated String - */ - public String getLongDiffString() { - StringBuilder deltas = new StringBuilder(); - for (Delta delta : getPatch().getDeltas()) { - deltas.append("DeltaType: " + delta.getType().toString()); - deltas.append(System.getProperty("line.separator")); - deltas.append("Original (Non-Neutral):"); - deltas.append(System.getProperty("line.separator")); - deltas.append(delta.getOriginal()); - deltas.append(System.getProperty("line.separator")); - deltas.append(System.getProperty("line.separator")); - deltas.append("Revised (Neutral):"); - deltas.append(System.getProperty("line.separator")); - deltas.append(delta.getRevised()); - deltas.append(System.getProperty("line.separator")); + + /** + * Returns the unified diff between "Before" and "After" as a single line-separated String + * + * @param contextSize + * numer of characters around a change to display + * @return diffs as line-separated String + */ + public String getUnifiedDiffString(int contextSize) + { + return listToString(getUnifiedDiffStrings(contextSize)); } - return deltas.toString(); - } - - /** - * Returns the deltas between beforeText and afterText as a line separated String. - * For more detailed diffs, use getPatch() or getUnifiedDiffStrings() - * - * @param diffType defines the type of diffs to include in the String - * @return diffs as line-separated String - */ - public String getLongDiffString(TYPE diffType) { - StringBuilder deltas = new StringBuilder(); - for (Delta delta : getPatch().getDeltas()) { - if (delta.getType() == diffType) { - deltas.append("Original (Non-Neutral):"); - deltas.append(System.getProperty("line.separator")); - deltas.append(delta.getOriginal()); - deltas.append(System.getProperty("line.separator")); - deltas.append(System.getProperty("line.separator")); - deltas.append("Revised (Neutral):"); - deltas.append(System.getProperty("line.separator")); - deltas.append(delta.getRevised()); - deltas.append(System.getProperty("line.separator")); - deltas.append("*********************************************"); - deltas.append(System.getProperty("line.separator")); - } + + /** + * Splits a String into sentences using the BreakIterator with US locale + * + * @param str + * a String with (multiple) sentences + * @return a list of Strings - one sentences per String + */ + private List sentenceSplit(String str) + { + BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US); + iterator.setText(str); + int start = iterator.first(); + List sentences = new ArrayList<>(); + for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator + .next()) { + sentences.add(str.substring(start, end).trim()); + } + return sentences; } - return deltas.toString(); - } - - - /** - * Returns the unified diff between "Before" and "After" - * containing one sentence per String. 
- * contextSize defines a window of lines/sentences around each change - * to display - * - * @param contextSize numer of lines/sentences around a change to display - * @return diffs as line-separated String - */ - public List getUnifiedDiffStrings(int contextSize) { - return DiffUtils.generateUnifiedDiff("Before", "After", sentenceSplit(beforeText), getPatch(), contextSize); - } - - /** - * Returns the unified diff between "Before" and "After" as a single - * line-separated String - * - * @param contextSize numer of characters around a change to display - * @return diffs as line-separated String - */ - public String getUnifiedDiffString(int contextSize) { - return listToString(getUnifiedDiffStrings(contextSize)); - } - - - /** - * Splits a String into sentences using the BreakIterator with - * US locale - * - * @param str a String with (multiple) sentences - * @return a list of Strings - one sentences per String - */ - private List sentenceSplit(String str) { - BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US); - iterator.setText(str); - int start = iterator.first(); - List sentences = new ArrayList<>(); - for (int end = iterator.next(); - end != BreakIterator.DONE; - start = end, end = iterator.next()) { - sentences.add(str.substring(start, end).trim()); + + /** + * Concatenates a list of Strings to one line-separated String + * + * @param stringList + * a list of Strings + * @return a single line-separated String containing all Strings from the list + */ + private String listToString(List stringList) + { + StringBuilder concat = new StringBuilder(); + for (String str : stringList) { + concat.append(str); + concat.append(System.getProperty("line.separator")); + } + return concat.toString(); } - return sentences; - } - - /** - * Concatenates a list of Strings to one line-separated String - * - * @param stringList a list of Strings - * @return a single line-separated String containing all Strings from the list - */ - private String listToString(List stringList) { - StringBuilder concat = new StringBuilder(); - for (String str : stringList) { - concat.append(str); - concat.append(System.getProperty("line.separator")); + + /** + * Normalizes the Strings in the TextPair. This mainly deals with whitespace-issues. Other + * normalizations can be included. + * + * @param str + * @return + */ + private String normalize(String str) + { + str = StringUtils.trimToEmpty(str); + str = StringUtils.normalizeSpace(str); + + // remove whitespace before punctuation. not using \p{Punct}, + // because it includes to many special characters. + str = str.replaceAll("\\s+(?=[.!,\\?;:])", ""); + + return str; } - return concat.toString(); - } - - /** - * Normalizes the Strings in the TextPair. - * This mainly deals with whitespace-issues. - * Other normalizations can be included. - * - * @param str - * @return - */ - private String normalize(String str) { - str = StringUtils.trimToEmpty(str); - str = StringUtils.normalizeSpace(str); - - // remove whitespace before punctuation. not using \p{Punct}, - // because it includes to many special characters. 
- str = str.replaceAll("\\s+(?=[.!,\\?;:])", ""); - - return str; - } - - @Override - public boolean equals(Object anObject) { - if (!(anObject instanceof TextPair)) { - return false; - } else { - TextPair otherPair = (TextPair) anObject; - if (this.getBeforeText().equals(otherPair.getBeforeText()) && this.getAfterText().equals(otherPair.getAfterText())) { - return true; - } else { - return false; - } + + @Override + public boolean equals(Object anObject) + { + if (!(anObject instanceof TextPair)) { + return false; + } + else { + TextPair otherPair = (TextPair) anObject; + if (this.getBeforeText().equals(otherPair.getBeforeText()) + && this.getAfterText().equals(otherPair.getAfterText())) { + return true; + } + else { + return false; + } + } } - } } \ No newline at end of file diff --git a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/WikipediaTemplateInfo.java b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/WikipediaTemplateInfo.java index e48e2ea9..fdcf53a1 100644 --- a/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/WikipediaTemplateInfo.java +++ b/dkpro-jwpl-util/src/main/java/org/dkpro/jwpl/util/templates/WikipediaTemplateInfo.java @@ -52,1446 +52,1694 @@ import org.dkpro.jwpl.util.templates.generator.GeneratorConstants; /** - * This class gives access to the additional information created by - * the TemplateInfoGenerator. + * This class gives access to the additional information created by the TemplateInfoGenerator. */ -public class WikipediaTemplateInfo { +public class WikipediaTemplateInfo +{ - private final Wikipedia wiki; - private RevisionApi revApi = null; - private MediaWikiParser parser = null; + private final Wikipedia wiki; + private RevisionApi revApi = null; + private MediaWikiParser parser = null; - private Connection connection; + private Connection connection; - /** - * - */ - public WikipediaTemplateInfo(Wikipedia pWiki) throws SQLException, WikiApiException { - this.wiki = pWiki; - this.connection = getConnection(wiki); - - if (!tableExists(GeneratorConstants.TABLE_TPLID_TPLNAME)) { - System.err.println("No Template Database could be found. You can only use methods that work without a template index"); - } - } - - /** - * Returns the number of all pages that contain a template the name - * of which starts with any of the the given Strings. - * - * @param templateFragments a list Strings containing the beginnings of the desired templates - * @param whitelist whether to return pages containing these templates (true) or return pages - * NOT containing these templates (false) - * @return the number of pages that contain any template starting with templateFragment - * @throws WikiApiException If there was any error retrieving the page object (most likely if the template templates are corrupted) - */ - private Integer countFragmentFilteredPages(List templateFragments, boolean whitelist) throws WikiApiException { - try { - int count = 0; - PreparedStatement statement = null; - ResultSet result = null; - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString.append("SELECT distinct(count(*)) FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + " as tpl, " - + GeneratorConstants.TABLE_TPLID_PAGEID + " AS p WHERE tpl.templateId = p.templateId " + (whitelist ? 
"AND" : "AND NOT") + " ("); - for (@SuppressWarnings("unused") String fragment : templateFragments) { - if (subconditions.length() != 0) { - subconditions.append("OR "); - } - subconditions.append("tpl.templateName LIKE ?"); - } - sqlString.append(subconditions); - sqlString.append(")"); - - statement = connection.prepareStatement(sqlString.toString()); + /** + * + */ + public WikipediaTemplateInfo(Wikipedia pWiki) throws SQLException, WikiApiException + { + this.wiki = pWiki; + this.connection = getConnection(wiki); - int curIdx = 1; - for (String fragment : templateFragments) { - fragment = fragment.toLowerCase(); - fragment = fragment.trim(); - fragment = fragment.replaceAll(" ", "_"); - statement.setString(curIdx++, fragment + "%"); + if (!tableExists(GeneratorConstants.TABLE_TPLID_TPLNAME)) { + System.err.println( + "No Template Database could be found. You can only use methods that work without a template index"); } + } - result = execute(statement); + /** + * Returns the number of all pages that contain a template the name of which starts with any of + * the the given Strings. + * + * @param templateFragments + * a list Strings containing the beginnings of the desired templates + * @param whitelist + * whether to return pages containing these templates (true) or return pages NOT + * containing these templates (false) + * @return the number of pages that contain any template starting with templateFragment + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the template + * templates are corrupted) + */ + private Integer countFragmentFilteredPages(List templateFragments, boolean whitelist) + throws WikiApiException + { + try { + int count = 0; + PreparedStatement statement = null; + ResultSet result = null; - if (result == null) { - return 0; - } + try { + StringBuffer sqlString = new StringBuffer(); + StringBuffer subconditions = new StringBuffer(); + sqlString.append( + "SELECT distinct(count(*)) FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + + " as tpl, " + GeneratorConstants.TABLE_TPLID_PAGEID + + " AS p WHERE tpl.templateId = p.templateId " + + (whitelist ? "AND" : "AND NOT") + " ("); + for (@SuppressWarnings("unused") + String fragment : templateFragments) { + if (subconditions.length() != 0) { + subconditions.append("OR "); + } + subconditions.append("tpl.templateName LIKE ?"); + } + sqlString.append(subconditions); + sqlString.append(")"); + + statement = connection.prepareStatement(sqlString.toString()); + + int curIdx = 1; + for (String fragment : templateFragments) { + fragment = fragment.toLowerCase(); + fragment = fragment.trim(); + fragment = fragment.replaceAll(" ", "_"); + statement.setString(curIdx++, fragment + "%"); + } + + result = execute(statement); + + if (result == null) { + return 0; + } + + if (result.next()) { + count = result.getInt(1); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - if (result.next()) { - count = result.getInt(1); - } - } finally { - if (statement != null) { - statement.close(); + return count; } - if (result != null) { - result.close(); + catch (Exception e) { + throw new WikiApiException(e); } - } + } - return count; - } catch (Exception e) { - throw new WikiApiException(e); + /** + * Returns the number of all pages that contain a template the name of which starts with any of + * the the given Strings. 
+ * + * @param templateFragments + * a list Strings containing the beginnings of the desired templates + * @return the number of pages that contain any template starting with templateFragment + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the template + * templates are corrupted) + */ + public Integer countPagesContainingTemplateFragments(List templateFragments) + throws WikiApiException + { + return countFragmentFilteredPages(templateFragments, true); } - } - - /** - * Returns the number of all pages that contain a template the name - * of which starts with any of the the given Strings. - * - * @param templateFragments a list Strings containing the beginnings of the desired templates - * @return the number of pages that contain any template starting with templateFragment - * @throws WikiApiException If there was any error retrieving the page object (most likely if the template templates are corrupted) - */ - public Integer countPagesContainingTemplateFragments(List templateFragments) throws WikiApiException { - return countFragmentFilteredPages(templateFragments, true); - } - - /** - * Returns the number of all pages that contain a template the name - * of which starts with any of the the given Strings. - * - * @param templateFragments a list Strings containing the beginnings of the desired templates - * @return the number of pages that contain any template starting with templateFragment - * @throws WikiApiException If there was any error retrieving the page object (most likely if the template templates are corrupted) - */ - public Integer countPagesNotContainingTemplateFragments(List templateFragments) throws WikiApiException { - return countFragmentFilteredPages(templateFragments, false); - } - - - /** - * Returns the number of all pages that contain a template the name of which - * equals the given String. - * - * @param templateNames a list of String containing the beginnings of the templates that have to be matched - * @param whitelist whether to return pages containing these templates (true) or return pages NOT containing these templates (false) - * @return the number of pages that contain a template starting with - * any templateFragment - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - private Integer countFilteredPages(List templateNames, boolean whitelist) - throws WikiApiException { - - try { - int count = 0; - PreparedStatement statement = null; - ResultSet result = null; - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString - .append("SELECT distinct(count(*)) FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + " as tpl, " - + GeneratorConstants.TABLE_TPLID_PAGEID + " AS p WHERE tpl.templateId = p.templateId " + (whitelist ? "AND" : "AND NOT") + " ("); - - for (@SuppressWarnings("unused") String name : templateNames) { - if (subconditions.length() != 0) { - subconditions.append("OR "); - } - subconditions.append("tpl.templateName = ?"); - } - sqlString.append(subconditions); - sqlString.append(")"); - statement = connection.prepareStatement(sqlString.toString()); + /** + * Returns the number of all pages that contain a template the name of which starts with any of + * the the given Strings. 
+ * + * @param templateFragments + * a list Strings containing the beginnings of the desired templates + * @return the number of pages that contain any template starting with templateFragment + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the template + * templates are corrupted) + */ + public Integer countPagesNotContainingTemplateFragments(List templateFragments) + throws WikiApiException + { + return countFragmentFilteredPages(templateFragments, false); + } - int curIdx = 1; - for (String name : templateNames) { - name = name.toLowerCase().trim(); - name = name.replaceAll(" ", "_"); - statement.setString(curIdx++, name); - } + /** + * Returns the number of all pages that contain a template the name of which equals the given + * String. + * + * @param templateNames + * a list of String containing the beginnings of the templates that have to be + * matched + * @param whitelist + * whether to return pages containing these templates (true) or return pages NOT + * containing these templates (false) + * @return the number of pages that contain a template starting with any templateFragment + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + private Integer countFilteredPages(List templateNames, boolean whitelist) + throws WikiApiException + { - result = execute(statement); + try { + int count = 0; + PreparedStatement statement = null; + ResultSet result = null; - if (result == null) { - return 0; - } + try { + StringBuffer sqlString = new StringBuffer(); + StringBuffer subconditions = new StringBuffer(); + sqlString.append( + "SELECT distinct(count(*)) FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + + " as tpl, " + GeneratorConstants.TABLE_TPLID_PAGEID + + " AS p WHERE tpl.templateId = p.templateId " + + (whitelist ? "AND" : "AND NOT") + " ("); + + for (@SuppressWarnings("unused") + String name : templateNames) { + if (subconditions.length() != 0) { + subconditions.append("OR "); + } + subconditions.append("tpl.templateName = ?"); + } + sqlString.append(subconditions); + sqlString.append(")"); + + statement = connection.prepareStatement(sqlString.toString()); + + int curIdx = 1; + for (String name : templateNames) { + name = name.toLowerCase().trim(); + name = name.replaceAll(" ", "_"); + statement.setString(curIdx++, name); + } + + result = execute(statement); + + if (result == null) { + return 0; + } + + if (result.next()) { + count = result.getInt(1); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - if (result.next()) { - count = result.getInt(1); + return count; } - } finally { - if (statement != null) { - statement.close(); + catch (Exception e) { + throw new WikiApiException(e); } - if (result != null) { - result.close(); - } - } + } - return count; - } catch (Exception e) { - throw new WikiApiException(e); + /** + * Returns the number of all pages that contain a template the name of which equals the given + * String. 
+ * + * @param templateNames + * a list of String containing the beginnings of the templates that have to be + * matched + * @return the number of pages that contain a template starting with any templateFragment + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + public Integer countPagesContainingTemplateNames(List templateNames) + throws WikiApiException + { + return countFilteredPages(templateNames, true); } - } - - /** - * Returns the number of all pages that contain a template the name of which - * equals the given String. - * - * @param templateNames a list of String containing the beginnings of the templates that have to be matched - * @return the number of pages that contain a template starting with - * any templateFragment - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public Integer countPagesContainingTemplateNames(List templateNames) throws WikiApiException { - return countFilteredPages(templateNames, true); - } - - /** - * Returns the number of all pages that do not contain a template the name of which - * equals the given String. - * - * @param templateNames a list of String containing the beginnings of the templates that have to be matched - * @return the number of pages that do not contain a template starting with - * any templateFragment - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public Integer countPagesNotContainingTemplateNames(List templateNames) throws WikiApiException { - return countFilteredPages(templateNames, false); - } - - /** - * Return an iterable containing all pages that contain a template the name - * of which starts with any of the given Strings. - * - * @param templateFragments the beginning of the templates that have to be matched - * @param whitelist whether to return pages containing these templates (true) or return pages NOT containing these templates (false) - * @return An iterable with the page objects that contain templates - * beginning with any String in templateFragments - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - private Iterable getFragmentFilteredPages(List templateFragments, boolean whitelist) throws WikiApiException { - - try { - PreparedStatement statement = null; - ResultSet result = null; - List matchedPages = new LinkedList<>(); - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString.append("SELECT p.pageId FROM " + - GeneratorConstants.TABLE_TPLID_TPLNAME + " AS tpl, " - + GeneratorConstants.TABLE_TPLID_PAGEID - + " AS p WHERE tpl.templateId = p.templateId " + (whitelist ? "AND" : "AND NOT") + " ("); - - for (@SuppressWarnings("unused") String fragment : templateFragments) { - if (subconditions.length() != 0) { - subconditions.append("OR "); - } - subconditions.append("tpl.templateName LIKE ?"); - } - sqlString.append(subconditions); - sqlString.append(")"); - statement = connection.prepareStatement(sqlString.toString()); + /** + * Returns the number of all pages that do not contain a template the name of which equals the + * given String. 
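+ * <p>
+ * For illustration only ({@code tplInfo} refers to an instance of this class; the template name
+ * is an arbitrary example):
+ * <pre>{@code
+ * Integer untagged = tplInfo.countPagesNotContainingTemplateNames(Arrays.asList("stub"));
+ * }</pre>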
+ * + * @param templateNames + * a list of String containing the beginnings of the templates that have to be + * matched + * @return the number of pages that do not contain a template starting with any templateFragment + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + public Integer countPagesNotContainingTemplateNames(List templateNames) + throws WikiApiException + { + return countFilteredPages(templateNames, false); + } - int curIdx = 1; - for (String fragment : templateFragments) { - fragment = fragment.toLowerCase().trim(); - fragment = fragment.replaceAll(" ", "_"); - statement.setString(curIdx++, fragment + "%"); - } + /** + * Return an iterable containing all pages that contain a template the name of which starts with + * any of the given Strings. + * + * @param templateFragments + * the beginning of the templates that have to be matched + * @param whitelist + * whether to return pages containing these templates (true) or return pages NOT + * containing these templates (false) + * @return An iterable with the page objects that contain templates beginning with any String in + * templateFragments + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + private Iterable getFragmentFilteredPages(List templateFragments, + boolean whitelist) + throws WikiApiException + { - result = execute(statement); + try { + PreparedStatement statement = null; + ResultSet result = null; + List matchedPages = new LinkedList<>(); - if (result == null) { - throw new WikiPageNotFoundException("Nothing was found"); - } + try { + StringBuffer sqlString = new StringBuffer(); + StringBuffer subconditions = new StringBuffer(); + sqlString.append("SELECT p.pageId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + + " AS tpl, " + GeneratorConstants.TABLE_TPLID_PAGEID + + " AS p WHERE tpl.templateId = p.templateId " + + (whitelist ? 
"AND" : "AND NOT") + " ("); + + for (@SuppressWarnings("unused") + String fragment : templateFragments) { + if (subconditions.length() != 0) { + subconditions.append("OR "); + } + subconditions.append("tpl.templateName LIKE ?"); + } + sqlString.append(subconditions); + sqlString.append(")"); + + statement = connection.prepareStatement(sqlString.toString()); + + int curIdx = 1; + for (String fragment : templateFragments) { + fragment = fragment.toLowerCase().trim(); + fragment = fragment.replaceAll(" ", "_"); + statement.setString(curIdx++, fragment + "%"); + } + + result = execute(statement); + + if (result == null) { + throw new WikiPageNotFoundException("Nothing was found"); + } + + while (result.next()) { + int pageID = result.getInt(1); + matchedPages.add(wiki.getPage(pageID)); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - while (result.next()) { - int pageID = result.getInt(1); - matchedPages.add(wiki.getPage(pageID)); - } - } finally { - if (statement != null) { - statement.close(); + return matchedPages; } - if (result != null) { - result.close(); + catch (Exception e) { + throw new WikiApiException(e); } - } - - return matchedPages; - } catch (Exception e) { - throw new WikiApiException(e); } - } - public int checkTemplateId(String templateName) throws WikiApiException { - try { - PreparedStatement statement = null; - ResultSet result = null; + public int checkTemplateId(String templateName) throws WikiApiException + { + try { + PreparedStatement statement = null; + ResultSet result = null; - try { - StringBuffer sqlString = new StringBuffer(); + try { + StringBuffer sqlString = new StringBuffer(); + sqlString.append( + "SELECT tpl.templateId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + + " AS tpl WHERE tpl.templateName='" + + templateName.trim().replaceAll(" ", "_") + "'"); - sqlString.append("SELECT tpl.templateId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + " AS tpl WHERE tpl.templateName='" + templateName.trim().replaceAll(" ", "_") + "'"); + statement = connection.prepareStatement(sqlString.toString()); - statement = connection.prepareStatement(sqlString.toString()); + result = execute(statement); - result = execute(statement); + if (result == null) { + return -1; + } - if (result == null) { - return -1; - } + if (result.next()) { + return result.getInt(1); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } - if (result.next()) { - return result.getInt(1); - } - } finally { - if (statement != null) { - statement.close(); + } + + return -1; } - if (result != null) { - result.close(); + catch (Exception e) { + throw new WikiApiException(e); } + } - } + /** + * Return an iterable containing all pages that contain a template the name of which starts with + * any of the given Strings. 
+ * + * @param templateFragments + * the beginning of the templates that have to be matched + * @return An iterable with the page objects that contain templates beginning with any String in + * templateFragments + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + public Iterable getPagesContainingTemplateFragments(List templateFragments) + throws WikiApiException + { + return getFragmentFilteredPages(templateFragments, true); + } - return -1; - } catch (Exception e) { - throw new WikiApiException(e); + /** + * Return an iterable containing all pages that contain a template the name of which starts with + * any of the given Strings. + * + * @param templateFragments + * the beginning of the templates that have to be matched + * @return An iterable with the page objects that contain templates beginning with any String in + * templateFragments + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + public Iterable getPagesNotContainingTemplateFragments(List templateFragments) + throws WikiApiException + { + return getFragmentFilteredPages(templateFragments, false); } - } - - - /** - * Return an iterable containing all pages that contain a template the name - * of which starts with any of the given Strings. - * - * @param templateFragments the beginning of the templates that have to be matched - * @return An iterable with the page objects that contain templates - * beginning with any String in templateFragments - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public Iterable getPagesContainingTemplateFragments(List templateFragments) throws WikiApiException { - return getFragmentFilteredPages(templateFragments, true); - } - - /** - * Return an iterable containing all pages that contain a template the name - * of which starts with any of the given Strings. - * - * @param templateFragments the beginning of the templates that have to be matched - * @return An iterable with the page objects that contain templates - * beginning with any String in templateFragments - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public Iterable getPagesNotContainingTemplateFragments(List templateFragments) throws WikiApiException { - return getFragmentFilteredPages(templateFragments, false); - } - - - /** - * Return an iterable containing all pages that contain a template the name - * of which starts with any of the given Strings. 
- * - * @param templateNames the names of the template that we want to match - * @param whitelist whether to return pages containing these templates (true) or return pages - * NOT containing these templates (false) - * @return An iterable with the page objects that contain any of the the - * specified templates - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - private Iterable getFilteredPages(List templateNames, boolean whitelist) throws WikiApiException { - try { - PreparedStatement statement = null; - ResultSet result = null; - List matchedPages = new LinkedList<>(); - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString.append("SELECT p.pageId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + " AS tpl, " - + GeneratorConstants.TABLE_TPLID_PAGEID + " AS p WHERE tpl.templateId = p.templateId " + (whitelist ? "AND" : "AND NOT") + " ("); - - for (@SuppressWarnings("unused") String name : templateNames) { - if (subconditions.length() != 0) { - subconditions.append("OR "); - } - subconditions.append("tpl.templateName = ?"); - } - sqlString.append(subconditions); - sqlString.append(")"); - statement = connection.prepareStatement(sqlString.toString()); + /** + * Return an iterable containing all pages that contain a template the name of which starts with + * any of the given Strings. + * + * @param templateNames + * the names of the template that we want to match + * @param whitelist + * whether to return pages containing these templates (true) or return pages NOT + * containing these templates (false) + * @return An iterable with the page objects that contain any of the the specified templates + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + private Iterable getFilteredPages(List templateNames, boolean whitelist) + throws WikiApiException + { + try { + PreparedStatement statement = null; + ResultSet result = null; + List matchedPages = new LinkedList<>(); - int curIdx = 1; - for (String name : templateNames) { - name = name.toLowerCase().trim(); - name = name.replaceAll(" ", "_"); - statement.setString(curIdx++, name); + try { + StringBuffer sqlString = new StringBuffer(); + StringBuffer subconditions = new StringBuffer(); + sqlString.append("SELECT p.pageId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + + " AS tpl, " + GeneratorConstants.TABLE_TPLID_PAGEID + + " AS p WHERE tpl.templateId = p.templateId " + + (whitelist ? 
"AND" : "AND NOT") + " ("); + + for (@SuppressWarnings("unused") + String name : templateNames) { + if (subconditions.length() != 0) { + subconditions.append("OR "); + } + subconditions.append("tpl.templateName = ?"); + } + sqlString.append(subconditions); + sqlString.append(")"); + + statement = connection.prepareStatement(sqlString.toString()); + + int curIdx = 1; + for (String name : templateNames) { + name = name.toLowerCase().trim(); + name = name.replaceAll(" ", "_"); + statement.setString(curIdx++, name); + } + + result = execute(statement); + + if (result == null) { + throw new WikiPageNotFoundException("Nothing was found"); + } + + while (result.next()) { + int pageID = result.getInt(1); + matchedPages.add(wiki.getPage(pageID)); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } + + return matchedPages; + } + catch (Exception e) { + throw new WikiApiException(e); } + } - result = execute(statement); + /** + * Return an iterable containing all pages that contain a template the name of which equals any + * of the given Strings. + * + * @param templateNames + * the names of the template that we want to match + * @return An iterable with the page objects that contain any of the the specified templates + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + public Iterable getPagesContainingTemplateNames(List templateNames) + throws WikiApiException + { + return getFilteredPages(templateNames, true); + } - if (result == null) { - throw new WikiPageNotFoundException("Nothing was found"); - } + /** + * Return an iterable containing all pages that do NOT contain a template the name of which + * equals of the given Strings. + * + * @param templateNames + * the names of the template that we want to match + * @return An iterable with the page objects that do NOT contain any of the the specified + * templates + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + public Iterable getPagesNotContainingTemplateNames(List templateNames) + throws WikiApiException + { + return getFilteredPages(templateNames, false); + } - while (result.next()) { - int pageID = result.getInt(1); - matchedPages.add(wiki.getPage(pageID)); - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); + /** + * This method first creates a list of pages containing templates that equal any of the provided + * Strings. It then returns a list of revision ids of the revisions in which the respective + * templates first appeared. + * + * @param templateName + * the template names that have to be matched + * @return An list with the revision ids of the first appearance of the template + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + public List getRevisionsWithFirstTemplateAppearance(String templateName) + throws WikiApiException + { + /* + * Note: This method does not use any revision-template-index. Each revision has to be + * parsed until the first revision is found that does not contain a certain template. TODO + * also create version using revision-template index + */ + System.err + .println("Note: This function call demands parsing several revision for each page. 
" + + "A method using the revision-template index is currently under construction."); + + templateName = templateName.trim().replaceAll(" ", "_"); + + List revisionIds = new LinkedList<>(); + List pageIds = getPageIdsContainingTemplateNames( + Arrays.asList(new String[] { templateName })); + if (pageIds.size() == 0) { + return revisionIds; + } + if (revApi == null) { + revApi = new RevisionApi(wiki.getDatabaseConfiguration()); + } + if (parser == null) { + // TODO switch to SWEBLE + MediaWikiParserFactory pf = new MediaWikiParserFactory( + wiki.getDatabaseConfiguration().getLanguage()); + pf.setTemplateParserClass(ShowTemplateNamesAndParameters.class); + parser = pf.createParser(); + } + + for (int id : pageIds) { + // get timestamps of all revisions + List tsList = revApi.getRevisionTimestamps(id); + + // sort in reverse order - newest first + tsList.sort(Comparator.reverseOrder()); + + Revision prevRev = null; + tsloop: for (Timestamp ts : tsList) { + + Revision rev = revApi.getRevision(id, ts); + + // initialize previous revision + if (prevRev == null) { + prevRev = rev; + } + + // Parse templates and check if the revision contains the template + ParsedPage pp = parser.parse(rev.getRevisionText()); + boolean containsTpl = false; + tplLoop: for (Template tpl : pp.getTemplates()) { + if (tpl.getName().equalsIgnoreCase(templateName)) { + containsTpl = true; + break tplLoop; + } + } + + // if the revision does not contain the template, we have found + // what we were looking for. add id of previous revision + if (!containsTpl) { + revisionIds.add(prevRev.getRevisionID()); + break tsloop; + } + prevRev = rev; + } } - } - return matchedPages; - } catch (Exception e) { - throw new WikiApiException(e); + return revisionIds; } - } - - /** - * Return an iterable containing all pages that contain a template the name - * of which equals any of the given Strings. - * - * @param templateNames the names of the template that we want to match - * @return An iterable with the page objects that contain any of the the - * specified templates - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public Iterable getPagesContainingTemplateNames(List templateNames) throws WikiApiException { - return getFilteredPages(templateNames, true); - } - - /** - * Return an iterable containing all pages that do NOT contain a template - * the name of which equals of the given Strings. - * - * @param templateNames the names of the template that we want to match - * @return An iterable with the page objects that do NOT contain any of the - * the specified templates - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public Iterable getPagesNotContainingTemplateNames(List templateNames) throws WikiApiException { - return getFilteredPages(templateNames, false); - } - - - /** - * This method first creates a list of pages containing templates that equal - * any of the provided Strings. - * It then returns a list of revision ids of the revisions in which the - * respective templates first appeared. 
- * - * @param templateName the template names that have to be matched - * @return An list with the revision ids of the first appearance of the template - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getRevisionsWithFirstTemplateAppearance(String templateName) throws WikiApiException { - /* - * Note: - * This method does not use any revision-template-index. Each revision has to be parsed - * until the first revision is found that does not contain a certain template. - * TODO also create version using revision-template index + + ////////// + + /** + * Returns a list containing the ids of all pages that contain a template the name of which + * starts with any of the given Strings. + * + * @param templateFragments + * the beginning of the templates that have to be matched + * @param whitelist + * whether to return pages containing these templates (true) or return pages NOT + * containing these templates (false) + * @return An list with the ids of the pages that contain templates beginning with any String in + * templateFragments + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) */ - System.err.println("Note: This function call demands parsing several revision for each page. " + - "A method using the revision-template index is currently under construction."); + private List getFragmentFilteredPageIds(List templateFragments, + boolean whitelist) + throws WikiApiException + { + try { + PreparedStatement statement = null; + ResultSet result = null; + List matchedPages = new LinkedList<>(); - templateName = templateName.trim().replaceAll(" ", "_"); + try { + StringBuffer sqlString = new StringBuffer(); + StringBuffer subconditions = new StringBuffer(); + sqlString.append("SELECT p.pageId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + + " AS tpl, " + GeneratorConstants.TABLE_TPLID_PAGEID + + " AS p WHERE tpl.templateId = p.templateId " + + (whitelist ? 
"AND" : "AND NOT") + " ("); + for (@SuppressWarnings("unused") + String fragment : templateFragments) { + if (subconditions.length() != 0) { + subconditions.append("OR "); + } + subconditions.append("tpl.templateName LIKE ?"); + } + sqlString.append(subconditions); + sqlString.append(")"); + + statement = connection.prepareStatement(sqlString.toString()); + + int curIdx = 1; + for (String fragment : templateFragments) { + fragment = fragment.toLowerCase().trim(); + fragment = fragment.replaceAll(" ", "_"); + statement.setString(curIdx++, fragment + "%"); + } + + result = execute(statement); + + if (result == null) { + throw new WikiPageNotFoundException("Nothing was found"); + } + + while (result.next()) { + matchedPages.add(result.getInt(1)); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - List revisionIds = new LinkedList<>(); - List pageIds = getPageIdsContainingTemplateNames(Arrays.asList(new String[]{templateName})); - if (pageIds.size() == 0) { - return revisionIds; - } - if (revApi == null) { - revApi = new RevisionApi(wiki.getDatabaseConfiguration()); - } - if (parser == null) { - //TODO switch to SWEBLE - MediaWikiParserFactory pf = new MediaWikiParserFactory( - wiki.getDatabaseConfiguration().getLanguage()); - pf.setTemplateParserClass(ShowTemplateNamesAndParameters.class); - parser = pf.createParser(); + return matchedPages; + } + catch (Exception e) { + throw new WikiApiException(e); + } } - for (int id : pageIds) { - //get timestamps of all revisions - List tsList = revApi.getRevisionTimestamps(id); + /** + * Returns a list containing the ids of all pages that contain a template the name of which + * starts with any of the given Strings. + * + * @param templateFragments + * the beginning of the templates that have to be matched + * @return An list with the ids of the pages that contain templates beginning with any String in + * templateFragments + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the template + * templates are corrupted) + */ + public List getPageIdsContainingTemplateFragments(List templateFragments) + throws WikiApiException + { + return getFragmentFilteredPageIds(templateFragments, true); + } - // sort in reverse order - newest first - tsList.sort(Comparator.reverseOrder()); + /** + * Returns a list containing the ids of all pages that contain a template the name of which + * starts with any of the given Strings. + * + * @param templateFragments + * the beginning of the templates that have to be matched + * @return An list with the ids of the pages that do not contain templates beginning with any + * String in templateFragments + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the template + * templates are corrupted) + */ + public List getPageIdsNotContainingTemplateFragments(List templateFragments) + throws WikiApiException + { + return getFragmentFilteredPageIds(templateFragments, false); + } - Revision prevRev = null; - tsloop: - for (Timestamp ts : tsList) { + /////////////////// + + /** + * Returns a list containing the ids of all revisions that contain a template the name of which + * starts with any of the given Strings. 
+ * + * @param templateFragments + * the beginning of the templates that have to be matched + * @param whitelist + * whether to return pages containing these templates (true) or return pages NOT + * containing these templates (false) + * @return An list with the ids of the revisions that contain templates beginning with any + * String in templateFragments + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + private List getFragmentFilteredRevisionIds(List templateFragments, + boolean whitelist) + throws WikiApiException + { - Revision rev = revApi.getRevision(id, ts); + try { + PreparedStatement statement = null; + ResultSet result = null; + List matchedPages = new LinkedList<>(); - //initialize previous revision - if (prevRev == null) { - prevRev = rev; - } + try { + StringBuffer sqlString = new StringBuffer(); + StringBuffer subconditions = new StringBuffer(); + sqlString + .append("SELECT r.revisionId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + + " AS tpl, " + GeneratorConstants.TABLE_TPLID_REVISIONID + + " AS r WHERE tpl.templateId = r.templateId " + + (whitelist ? "AND" : "AND NOT") + " ("); + for (@SuppressWarnings("unused") + String fragment : templateFragments) { + if (subconditions.length() != 0) { + subconditions.append("OR "); + } + subconditions.append("tpl.templateName LIKE ?"); + } + sqlString.append(subconditions); + sqlString.append(")"); + + statement = connection.prepareStatement(sqlString.toString()); + + int curIdx = 1; + for (String fragment : templateFragments) { + fragment = fragment.toLowerCase().trim(); + fragment = fragment.replaceAll(" ", "_"); + statement.setString(curIdx++, fragment + "%"); + } + + result = execute(statement); + + if (result == null) { + throw new WikiPageNotFoundException("Nothing was found"); + } + + while (result.next()) { + matchedPages.add(result.getInt(1)); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - //Parse templates and check if the revision contains the template - ParsedPage pp = parser.parse(rev.getRevisionText()); - boolean containsTpl = false; - tplLoop: - for (Template tpl : pp.getTemplates()) { - if (tpl.getName().equalsIgnoreCase(templateName)) { - containsTpl = true; - break tplLoop; - } + return matchedPages; } - - //if the revision does not contain the template, we have found - //what we were looking for. add id of previous revision - if (!containsTpl) { - revisionIds.add(prevRev.getRevisionID()); - break tsloop; + catch (Exception e) { + throw new WikiApiException(e); } - prevRev = rev; - } } - return revisionIds; - } - - - ////////// - - - /** - * Returns a list containing the ids of all pages that contain a - * template the name of which starts with any of the given Strings. 
- * - * @param templateFragments the beginning of the templates that have to be matched - * @param whitelist whether to return pages containing these templates (true) or return pages NOT containing these templates (false) - * @return An list with the ids of the pages that contain templates - * beginning with any String in templateFragments - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - private List getFragmentFilteredPageIds(List templateFragments, boolean whitelist) - throws WikiApiException { - try { - PreparedStatement statement = null; - ResultSet result = null; - List matchedPages = new LinkedList<>(); - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString.append("SELECT p.pageId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + " AS tpl, " - + GeneratorConstants.TABLE_TPLID_PAGEID + " AS p WHERE tpl.templateId = p.templateId " + (whitelist ? "AND" : "AND NOT") + " ("); - for (@SuppressWarnings("unused") String fragment : templateFragments) { - if (subconditions.length() != 0) { - subconditions.append("OR "); - } - subconditions.append("tpl.templateName LIKE ?"); - } - sqlString.append(subconditions); - sqlString.append(")"); + /** + * Returns a list containing the ids of all revisions that contain a template the name of which + * starts with any of the given Strings. + * + * @param templateFragments + * the beginning of the templates that have to be matched + * @return An list with the ids of the revisions that contain templates beginning with any + * String in templateFragments + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the template + * templates are corrupted) + */ + public List getRevisionIdsContainingTemplateFragments(List templateFragments) + throws WikiApiException + { + return getFragmentFilteredRevisionIds(templateFragments, true); + } - statement = connection.prepareStatement(sqlString.toString()); + /** + * Returns a list containing the ids of all revisions that contain a template the name of which + * starts with any of the given Strings. + * + * @param templateFragments + * the beginning of the templates that have to be matched + * @return An list with the ids of the revisions that do not contain templates beginning with + * any String in templateFragments + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the template + * templates are corrupted) + */ + public List getRevisionIdsNotContainingTemplateFragments( + List templateFragments) + throws WikiApiException + { + return getFragmentFilteredRevisionIds(templateFragments, false); + } - int curIdx = 1; - for (String fragment : templateFragments) { - fragment = fragment.toLowerCase().trim(); - fragment = fragment.replaceAll(" ", "_"); - statement.setString(curIdx++, fragment + "%"); + /////////////////// + + /** + * Returns the ids of all pages that ever contained any of the given template names in the + * history of their existence. 
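+ * <p>
+ * Sketch of a typical call ({@code tplInfo} denotes an instance of this class and the template
+ * name is an arbitrary example). Note that this walks the revision data, so a revision database
+ * must be reachable via the wiki's database configuration:
+ * <pre>{@code
+ * List<Integer> pageIds =
+ *         tplInfo.getIdsOfPagesThatEverContainedTemplateNames(Arrays.asList("pov"));
+ * }</pre>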
+ * + * @param templateNames + * template names to look for + * @return list of page ids of the pages that once contained any of the given template names + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the template + * templates are corrupted) + */ + public List getIdsOfPagesThatEverContainedTemplateNames(List templateNames) + throws WikiApiException + { + if (revApi == null) { + revApi = new RevisionApi(wiki.getDatabaseConfiguration()); } + Set pageIdSet = new HashSet<>(); - result = execute(statement); - - if (result == null) { - throw new WikiPageNotFoundException("Nothing was found"); + // TODO instead of getting rev ids and then getting page ids, do one query and make the join + // in the db directly + List revsWithTemplate = getRevisionIdsContainingTemplateNames(templateNames); + for (int revId : revsWithTemplate) { + pageIdSet.add(revApi.getPageIdForRevisionId(revId)); } - while (result.next()) { - matchedPages.add(result.getInt(1)); - } - } finally { - if (statement != null) { - statement.close(); - } - if (result != null) { - result.close(); + return new LinkedList<>(pageIdSet); + } + + /** + * Returns the ids of all pages that ever contained any template that started with any of the + * given template fragments + * + * @param templateFragments + * template-fragments to look for + * @return list of page ids of the pages that once contained any template that started with any + * of the given template fragments + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the template + * templates are corrupted) + */ + public List getIdsOfPagesThatEverContainedTemplateFragments( + List templateFragments) + throws WikiApiException + { + if (revApi == null) { + revApi = new RevisionApi(wiki.getDatabaseConfiguration()); } - } + Set pageIdSet = new HashSet<>(); - return matchedPages; - } catch (Exception e) { - throw new WikiApiException(e); - } - } - - /** - * Returns a list containing the ids of all pages that contain a - * template the name of which starts with any of the given Strings. - * - * @param templateFragments the beginning of the templates that have to be matched - * @return An list with the ids of the pages that contain templates - * beginning with any String in templateFragments - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the template templates are corrupted) - */ - public List getPageIdsContainingTemplateFragments(List templateFragments) throws WikiApiException { - return getFragmentFilteredPageIds(templateFragments, true); - } - - /** - * Returns a list containing the ids of all pages that contain a - * template the name of which starts with any of the given Strings. - * - * @param templateFragments the beginning of the templates that have to be matched - * @return An list with the ids of the pages that do not contain templates - * beginning with any String in templateFragments - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the template templates are corrupted) - */ - public List getPageIdsNotContainingTemplateFragments(List templateFragments) throws WikiApiException { - return getFragmentFilteredPageIds(templateFragments, false); - } - - /////////////////// - - /** - * Returns a list containing the ids of all revisions that contain a - * template the name of which starts with any of the given Strings. 
- * - * @param templateFragments the beginning of the templates that have to be matched - * @param whitelist whether to return pages containing these templates (true) or return pages NOT containing these templates (false) - * @return An list with the ids of the revisions that contain templates - * beginning with any String in templateFragments - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - private List getFragmentFilteredRevisionIds(List templateFragments, boolean whitelist) throws WikiApiException { - - try { - PreparedStatement statement = null; - ResultSet result = null; - List matchedPages = new LinkedList<>(); - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString.append("SELECT r.revisionId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + " AS tpl, " - + GeneratorConstants.TABLE_TPLID_REVISIONID + " AS r WHERE tpl.templateId = r.templateId " + (whitelist ? "AND" : "AND NOT") + " ("); - for (@SuppressWarnings("unused") String fragment : templateFragments) { - if (subconditions.length() != 0) { - subconditions.append("OR "); - } - subconditions.append("tpl.templateName LIKE ?"); + // TODO instead of getting rev ids and then getting page ids, do one query and make the join + // in the db directly + List revsWithTemplate = getRevisionIdsContainingTemplateFragments( + templateFragments); + for (int revId : revsWithTemplate) { + pageIdSet.add(revApi.getPageIdForRevisionId(revId)); } - sqlString.append(subconditions); - sqlString.append(")"); - statement = connection.prepareStatement(sqlString.toString()); + List pageIds = new LinkedList<>(); + pageIds.addAll(pageIdSet); - int curIdx = 1; - for (String fragment : templateFragments) { - fragment = fragment.toLowerCase().trim(); - fragment = fragment.replaceAll(" ", "_"); - statement.setString(curIdx++, fragment + "%"); - } + return pageIds; + } - result = execute(statement); + /////////////////// + + /** + * Returns a list containing the ids of all pages that contain a template the name of which + * equals any of the given Strings. + * + * @param templateNames + * the names of the template that we want to match + * @param whitelist + * whether to return pages containing these templates (true) or return pages NOT + * containing these templates (false) + * @return A list with the ids of all pages that contain any of the the specified templates + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + private List getFilteredPageIds(List templateNames, boolean whitelist) + throws WikiApiException + { + try { + PreparedStatement statement = null; + ResultSet result = null; + List matchedPages = new LinkedList<>(); - if (result == null) { - throw new WikiPageNotFoundException("Nothing was found"); - } + try { + StringBuffer sqlString = new StringBuffer(); + StringBuffer subconditions = new StringBuffer(); + sqlString.append("SELECT p.pageId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + + " AS tpl, " + GeneratorConstants.TABLE_TPLID_PAGEID + + " AS p WHERE tpl.templateId = p.templateId " + + (whitelist ? 
"AND" : "AND NOT") + " ("); + + for (@SuppressWarnings("unused") + String name : templateNames) { + if (subconditions.length() != 0) { + subconditions.append("OR "); + } + subconditions.append("tpl.templateName = ?"); + } + sqlString.append(subconditions); + sqlString.append(")"); + + statement = connection.prepareStatement(sqlString.toString()); + + int curIdx = 1; + for (String name : templateNames) { + name = name.toLowerCase().trim(); + name = name.replaceAll(" ", "_"); + statement.setString(curIdx++, name); + } + + result = execute(statement); + + if (result == null) { + throw new WikiPageNotFoundException("Nothing was found"); + } + + while (result.next()) { + matchedPages.add(result.getInt(1)); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - while (result.next()) { - matchedPages.add(result.getInt(1)); - } - } finally { - if (statement != null) { - statement.close(); + return matchedPages; } - if (result != null) { - result.close(); + catch (Exception e) { + throw new WikiApiException(e); } - } - - return matchedPages; - } catch (Exception e) { - throw new WikiApiException(e); - } - } - - - /** - * Returns a list containing the ids of all revisions that contain a - * template the name of which starts with any of the given Strings. - * - * @param templateFragments the beginning of the templates that have to be matched - * @return An list with the ids of the revisions that contain templates - * beginning with any String in templateFragments - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the template templates are corrupted) - */ - public List getRevisionIdsContainingTemplateFragments(List templateFragments) throws WikiApiException { - return getFragmentFilteredRevisionIds(templateFragments, true); - } - - /** - * Returns a list containing the ids of all revisions that contain a - * template the name of which starts with any of the given Strings. - * - * @param templateFragments the beginning of the templates that have to be matched - * @return An list with the ids of the revisions that do not contain templates - * beginning with any String in templateFragments - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the template templates are corrupted) - */ - public List getRevisionIdsNotContainingTemplateFragments(List templateFragments) throws WikiApiException { - return getFragmentFilteredRevisionIds(templateFragments, false); - } - - - /////////////////// - - - /** - * Returns the ids of all pages that ever contained any of the given template names in the history of their existence. 
- * - * @param templateNames template names to look for - * @return list of page ids of the pages that once contained any of the given template names - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the template templates are corrupted) - */ - public List getIdsOfPagesThatEverContainedTemplateNames(List templateNames) throws WikiApiException { - if (revApi == null) { - revApi = new RevisionApi(wiki.getDatabaseConfiguration()); - } - Set pageIdSet = new HashSet<>(); - - //TODO instead of getting rev ids and then getting page ids, do one query and make the join in the db directly - List revsWithTemplate = getRevisionIdsContainingTemplateNames(templateNames); - for (int revId : revsWithTemplate) { - pageIdSet.add(revApi.getPageIdForRevisionId(revId)); } - return new LinkedList<>(pageIdSet); - } - - /** - * Returns the ids of all pages that ever contained any template that started with any of the given template fragments - * - * @param templateFragments template-fragments to look for - * @return list of page ids of the pages that once contained any template that started with any of the given template fragments - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the template templates are corrupted) - */ - public List getIdsOfPagesThatEverContainedTemplateFragments(List templateFragments) throws WikiApiException { - if (revApi == null) { - revApi = new RevisionApi(wiki.getDatabaseConfiguration()); + /** + * Returns a list containing the ids of all pages that contain a template the name of which + * equals any of the given Strings. + * + * @param templateNames + * the names of the template that we want to match + * @return A list with the ids of all pages that contain any of the the specified templates + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + public List getPageIdsContainingTemplateNames(List templateNames) + throws WikiApiException + { + return getFilteredPageIds(templateNames, true); } - Set pageIdSet = new HashSet<>(); - //TODO instead of getting rev ids and then getting page ids, do one query and make the join in the db directly - List revsWithTemplate = getRevisionIdsContainingTemplateFragments(templateFragments); - for (int revId : revsWithTemplate) { - pageIdSet.add(revApi.getPageIdForRevisionId(revId)); + /** + * Returns a list containing the ids of all pages that do not contain a template the name of + * which equals any of the given Strings. + * + * @param templateNames + * the names of the template that we want to match + * @return A list with the ids of all pages that do not contain any of the the specified + * templates + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + public List getPageIdsNotContainingTemplateNames(List templateNames) + throws WikiApiException + { + return getFilteredPageIds(templateNames, false); } - List pageIds = new LinkedList<>(); - pageIds.addAll(pageIdSet); - - return pageIds; - } - - /////////////////// - - - /** - * Returns a list containing the ids of all pages that contain a template - * the name of which equals any of the given Strings. 
- * - * @param templateNames the names of the template that we want to match - * @param whitelist whether to return pages containing these templates (true) or return pages - * NOT containing these templates (false) - * @return A list with the ids of all pages that contain any of the the - * specified templates - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - private List getFilteredPageIds(List templateNames, boolean whitelist) throws WikiApiException { - try { - PreparedStatement statement = null; - ResultSet result = null; - List matchedPages = new LinkedList<>(); - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString.append("SELECT p.pageId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + " AS tpl, " - + GeneratorConstants.TABLE_TPLID_PAGEID + " AS p WHERE tpl.templateId = p.templateId " + (whitelist ? "AND" : "AND NOT") + " ("); - - for (@SuppressWarnings("unused") String name : templateNames) { - if (subconditions.length() != 0) { - subconditions.append("OR "); - } - subconditions.append("tpl.templateName = ?"); - } - sqlString.append(subconditions); - sqlString.append(")"); + /** + * Returns a list containing the ids of all revisions that contain a template the name of which + * equals any of the given Strings. + * + * @param templateNames + * the names of the template that we want to match + * @param whitelist + * whether to return pages containing these templates (true) or return pages NOT + * containing these templates (false) + * @return A list with the ids of all revisions that contain any of the the specified templates + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + private List getFilteredRevisionIds(List templateNames, boolean whitelist) + throws WikiApiException + { + try { + PreparedStatement statement = null; + ResultSet result = null; + List matchedPages = new LinkedList<>(); - statement = connection.prepareStatement(sqlString.toString()); + try { + StringBuffer sqlString = new StringBuffer(); + StringBuffer subconditions = new StringBuffer(); + sqlString + .append("SELECT r.revisionId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + + " AS tpl, " + GeneratorConstants.TABLE_TPLID_REVISIONID + + " AS r WHERE tpl.templateId = r.templateId " + + (whitelist ? 
"AND" : "AND NOT") + " ("); + + for (@SuppressWarnings("unused") + String name : templateNames) { + if (subconditions.length() != 0) { + subconditions.append("OR "); + } + subconditions.append("tpl.templateName = ?"); + } + sqlString.append(subconditions); + sqlString.append(")"); + + statement = connection.prepareStatement(sqlString.toString()); + + int curIdx = 1; + for (String name : templateNames) { + name = name.toLowerCase().trim(); + name = name.replaceAll(" ", "_"); + statement.setString(curIdx++, name); + } + + result = execute(statement); + + if (result == null) { + throw new WikiPageNotFoundException("Nothing was found"); + } + + while (result.next()) { + matchedPages.add(result.getInt(1)); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - int curIdx = 1; - for (String name : templateNames) { - name = name.toLowerCase().trim(); - name = name.replaceAll(" ", "_"); - statement.setString(curIdx++, name); + return matchedPages; + } + catch (Exception e) { + throw new WikiApiException(e); } + } - result = execute(statement); + /** + * Returns a list containing the ids of all revisions that contain a template the name of which + * equals any of the given Strings. + * + * @param templateNames + * the names of the template that we want to match + * @return A list with the ids of all revisions that contain any of the specified templates + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + public List getRevisionIdsContainingTemplateNames(List templateNames) + throws WikiApiException + { + return getFilteredRevisionIds(templateNames, true); + } - if (result == null) { - throw new WikiPageNotFoundException("Nothing was found"); - } + /** + * Returns a list containing the ids of all revisions that do not contain a template the name of + * which equals any of the given Strings. + * + * @param templateNames + * the names of the template that we want to match + * @return A list with the ids of all revisions that do not contain any of the specified + * templates + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + public List getRevisionIdsNotContainingTemplateNames(List templateNames) + throws WikiApiException + { + return getFilteredRevisionIds(templateNames, false); + } - while (result.next()) { - matchedPages.add(result.getInt(1)); - } - } finally { - if (statement != null) { - statement.close(); + /** + * Returns the names of all templates contained in the specified page. + * + * @param page + * the page object for which the templates should be retrieved + * @return A List with the names of the templates contained in the specified page + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + public List getTemplateNamesFromPage(Page page) throws WikiApiException + { + return getTemplateNamesFromPage(page.getPageId()); + } + + /** + * Returns the names of all templates contained in the specified page. 
+ * + * @param pageTitle + * the title of the page for which the templates should be retrieved + * @return A List with the names of the templates contained in the specified page + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + public List getTemplateNamesFromPage(String pageTitle) throws WikiApiException + { + Page p; + try { + p = wiki.getPage(pageTitle); } - if (result != null) { - result.close(); + catch (WikiApiException e) { + return new ArrayList<>(); } - } - - return matchedPages; - } catch (Exception e) { - throw new WikiApiException(e); + return getTemplateNamesFromPage(p); } - } - - - /** - * Returns a list containing the ids of all pages that contain a template - * the name of which equals any of the given Strings. - * - * @param templateNames the names of the template that we want to match - * @return A list with the ids of all pages that contain any of the the - * specified templates - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getPageIdsContainingTemplateNames(List templateNames) throws WikiApiException { - return getFilteredPageIds(templateNames, true); - } - - /** - * Returns a list containing the ids of all pages that do not contain a template - * the name of which equals any of the given Strings. - * - * @param templateNames the names of the template that we want to match - * @return A list with the ids of all pages that do not contain any of the the - * specified templates - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getPageIdsNotContainingTemplateNames(List templateNames) throws WikiApiException { - return getFilteredPageIds(templateNames, false); - } - - - /** - * Returns a list containing the ids of all revisions that contain a template - * the name of which equals any of the given Strings. - * - * @param templateNames the names of the template that we want to match - * @param whitelist whether to return pages containing these templates (true) or return pages - * NOT containing these templates (false) - * @return A list with the ids of all revisions that contain any of the the - * specified templates - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - private List getFilteredRevisionIds(List templateNames, boolean whitelist) throws WikiApiException { - try { - PreparedStatement statement = null; - ResultSet result = null; - List matchedPages = new LinkedList<>(); - - try { - StringBuffer sqlString = new StringBuffer(); - StringBuffer subconditions = new StringBuffer(); - sqlString.append("SELECT r.revisionId FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + " AS tpl, " - + GeneratorConstants.TABLE_TPLID_REVISIONID + " AS r WHERE tpl.templateId = r.templateId " + (whitelist ? "AND" : "AND NOT") + " ("); - - for (@SuppressWarnings("unused") String name : templateNames) { - if (subconditions.length() != 0) { - subconditions.append("OR "); - } - subconditions.append("tpl.templateName = ?"); + + /** + * Returns the names of all templates contained in the specified page. 
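+ * <p>
+ * A brief usage sketch ({@code tplInfo} and {@code pageId} are illustrative names); the returned
+ * template names are lower-cased:
+ * <pre>{@code
+ * List<String> names = tplInfo.getTemplateNamesFromPage(pageId);
+ * }</pre>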
+ * + * @param pageId + * the id of the Wiki page + * @return A List with the names of the templates contained in the specified page + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + public List getTemplateNamesFromPage(int pageId) throws WikiApiException + { + if (pageId < 1) { + throw new WikiApiException("Page ID must be > 0"); } - sqlString.append(subconditions); - sqlString.append(")"); + try { + PreparedStatement statement = null; + ResultSet result = null; + List templateNames = new LinkedList<>(); - statement = connection.prepareStatement(sqlString.toString()); + try { + statement = connection.prepareStatement( + "SELECT tpl.templateName FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + + " AS tpl, " + GeneratorConstants.TABLE_TPLID_PAGEID + + " AS p WHERE tpl.templateId = p.templateId AND p.pageId = ?"); + statement.setInt(1, pageId); - int curIdx = 1; - for (String name : templateNames) { - name = name.toLowerCase().trim(); - name = name.replaceAll(" ", "_"); - statement.setString(curIdx++, name); - } + result = execute(statement); - result = execute(statement); + if (result == null) { + return templateNames; + } - if (result == null) { - throw new WikiPageNotFoundException("Nothing was found"); - } + while (result.next()) { + templateNames.add(result.getString(1).toLowerCase()); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - while (result.next()) { - matchedPages.add(result.getInt(1)); + return templateNames; } - } finally { - if (statement != null) { - statement.close(); + catch (Exception e) { + throw new WikiApiException(e); } - if (result != null) { - result.close(); + } + + /** + * Returns the names of all templates contained in the specified revision. + * + * @param revid + * the revision id + * @return A List with the names of the templates contained in the specified revision + * @throws WikiApiException + * If there was any error retrieving the page object (most likely if the templates + * are corrupted) + */ + public List getTemplateNamesFromRevision(int revid) throws WikiApiException + { + if (revid < 1) { + throw new WikiApiException("Revision ID must be > 0"); } - } + try { + PreparedStatement statement = null; + ResultSet result = null; + List templateNames = new LinkedList<>(); - return matchedPages; - } catch (Exception e) { - throw new WikiApiException(e); - } - } - - - /** - * Returns a list containing the ids of all revisions that contain a template - * the name of which equals any of the given Strings. - * - * @param templateNames the names of the template that we want to match - * @return A list with the ids of all revisions that contain any of the specified templates - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getRevisionIdsContainingTemplateNames(List templateNames) throws WikiApiException { - return getFilteredRevisionIds(templateNames, true); - } - - /** - * Returns a list containing the ids of all revisions that do not contain a template - * the name of which equals any of the given Strings. 
- * - * @param templateNames the names of the template that we want to match - * @return A list with the ids of all revisions that do not contain any of the specified templates - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getRevisionIdsNotContainingTemplateNames(List templateNames) throws WikiApiException { - return getFilteredRevisionIds(templateNames, false); - } - - - /** - * Returns the names of all templates contained in the specified page. - * - * @param page the page object for which the templates should be retrieved - * @return A List with the names of the templates contained in the specified page - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getTemplateNamesFromPage(Page page) throws WikiApiException { - return getTemplateNamesFromPage(page.getPageId()); - } - - /** - * Returns the names of all templates contained in the specified page. - * - * @param pageTitle the title of the page for which the templates should be - * retrieved - * @return A List with the names of the templates contained in the specified page - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getTemplateNamesFromPage(String pageTitle) throws WikiApiException { - Page p; - try { - p = wiki.getPage(pageTitle); - } catch (WikiApiException e) { - return new ArrayList<>(); - } - return getTemplateNamesFromPage(p); - } - - - /** - * Returns the names of all templates contained in the specified page. - * - * @param pageId the id of the Wiki page - * @return A List with the names of the templates contained in the specified page - * @throws WikiApiException If there was any error retrieving the page object (most - * likely if the templates are corrupted) - */ - public List getTemplateNamesFromPage(int pageId) throws WikiApiException { - if (pageId < 1) { - throw new WikiApiException("Page ID must be > 0"); - } - try { - PreparedStatement statement = null; - ResultSet result = null; - List templateNames = new LinkedList<>(); + try { + statement = connection.prepareStatement( + "SELECT tpl.templateName FROM " + GeneratorConstants.TABLE_TPLID_TPLNAME + + " AS tpl, " + GeneratorConstants.TABLE_TPLID_REVISIONID + + " AS p WHERE tpl.templateId = p.templateId AND p.revisionId = ?"); + statement.setInt(1, revid); - try { - statement = connection.prepareStatement("SELECT tpl.templateName FROM " - + GeneratorConstants.TABLE_TPLID_TPLNAME + " AS tpl, " + GeneratorConstants.TABLE_TPLID_PAGEID - + " AS p WHERE tpl.templateId = p.templateId AND p.pageId = ?"); - statement.setInt(1, pageId); + result = execute(statement); - result = execute(statement); + if (result == null) { + return templateNames; + } - if (result == null) { - return templateNames; - } + while (result.next()) { + templateNames.add(result.getString(1).toLowerCase()); + } + } + finally { + if (statement != null) { + statement.close(); + } + if (result != null) { + result.close(); + } + } - while (result.next()) { - templateNames.add(result.getString(1).toLowerCase()); + return templateNames; } - } finally { - if (statement != null) { - statement.close(); + catch (Exception e) { + throw new WikiApiException(e); } - if (result != null) { - result.close(); - } - } - - return templateNames; - } catch (Exception e) { - throw new WikiApiException(e); } - } - - - /** - * Returns the 
-    /**
-     * Returns the names of all templates contained in the specified revision.
-     *
-     * @param revid the revision id
-     * @return A List with the names of the templates contained in the specified
-     *         revision
-     * @throws WikiApiException If there was any error retrieving the page object (most
-     *             likely if the templates are corrupted)
-     */
-    public List<String> getTemplateNamesFromRevision(int revid) throws WikiApiException {
-        if (revid < 1) {
-            throw new WikiApiException("Revision ID must be > 0");
+
+    /**
+     * Determines whether a given revision contains a given template name
+     *
+     * @param revId
+     * @param templateName
+     *            a template name
+     * @return {@code true} if the revision contains {@code templateName}, {@code false} otherwise.
+     * @throws WikiApiException
+     */
+    public boolean revisionContainsTemplateName(int revId, String templateName)
+        throws WikiApiException
+    {
+        return revisionContainsTemplateNames(revId, Arrays.asList(new String[] { templateName }));
         }
-        try {
-            PreparedStatement statement = null;
-            ResultSet result = null;
-            List<String> templateNames = new LinkedList<>();
-
-            try {
-                statement = connection.prepareStatement("SELECT tpl.templateName FROM "
-                        + GeneratorConstants.TABLE_TPLID_TPLNAME + " AS tpl, "
-                        + GeneratorConstants.TABLE_TPLID_REVISIONID
-                        + " AS p WHERE tpl.templateId = p.templateId AND p.revisionId = ?");
-                statement.setInt(1, revid);
-
-                result = execute(statement);
-
-                if (result == null) {
-                    return templateNames;
-                }
-                while (result.next()) {
-                    templateNames.add(result.getString(1).toLowerCase());
-                }
-            } finally {
-                if (statement != null) {
-                    statement.close();
-                }
-                if (result != null) {
-                    result.close();
+    /**
+     * Determines whether a given revision contains a given template name
+     *
+     * @param revId
+     * @param templateNames
+     *            a list of template names
+     * @return {@code true} if the revision contains one element in {@code templateNames},
+     *         {@code false} otherwise.
+     * @throws WikiApiException
+     */
+    public boolean revisionContainsTemplateNames(int revId, List<String> templateNames)
+        throws WikiApiException
+    {
+        List<String> tplList = getTemplateNamesFromRevision(revId);
+        for (String tpl : tplList) {
+            for (String templateName : templateNames) {
+                if (tpl.equalsIgnoreCase(templateName)) {
+                    return true;
+                }
+            }
         }
-            }
-
-            return templateNames;
-        } catch (Exception e) {
-            throw new WikiApiException(e);
+        return false;
     }
-    }
-
-
-    /**
-     * Determines whether a given revision contains a given template name
-     *
-     * @param revId
-     * @param templateName a template name
-     * @return {@code true} if the revision contains {@code templateName}, {@code false} otherwise.
-     * @throws WikiApiException
-     */
-    public boolean revisionContainsTemplateName(int revId, String templateName) throws WikiApiException {
-        return revisionContainsTemplateNames(revId, Arrays.asList(new String[]{templateName}));
-    }
-
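The containment checks added above delegate to getTemplateNamesFromRevision(int) and compare case-insensitively, since that method lower-cases the names it reads from the template index. A brief sketch of how a caller might use them, reusing the tplInfo instance from the previous note; the revision id and template names are made-up placeholders and this is not part of the patch.

    // Sketch only, not part of this patch.
    int revId = 123456; // placeholder revision id
    boolean hasPov = tplInfo.revisionContainsTemplateName(revId, "POV");
    boolean flagged = tplInfo.revisionContainsTemplateNames(
            revId, Arrays.asList("POV", "Disputed", "Cleanup"));
    // Both checks work regardless of case because the stored template names are
    // lower-cased and the comparison uses equalsIgnoreCase().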
-    /**
-     * Determines whether a given revision contains a given template name
-     *
-     * @param revId
-     * @param templateNames a list of template names
-     * @return {@code true} if the revision contains one element in {@code templateNames}, {@code false} otherwise.
-     * @throws WikiApiException
-     */
-    public boolean revisionContainsTemplateNames(int revId, List<String> templateNames) throws WikiApiException {
-        List<String> tplList = getTemplateNamesFromRevision(revId);
-        for (String tpl : tplList) {
-            for (String templateName : templateNames) {
-                if (tpl.equalsIgnoreCase(templateName)) {
-                    return true;
+
+    /**
+     * Determines whether a given revision contains a template starting with the given fragment
+     *
+     * @param revId
+     * @param templateFragment
+     * @return {@code true} if the revision contains {@code templateFragment}, {@code false}
+     *         otherwise.
+     * @throws WikiApiException
+     */
+    public boolean revisionContainsTemplateFragment(int revId, String templateFragment)
+        throws WikiApiException
+    {
+        List<String> tplList = getTemplateNamesFromRevision(revId);
+        for (String tpl : tplList) {
+            if (tpl.toLowerCase().startsWith(templateFragment.toLowerCase())) {
+                return true;
+            }
         }
-            }
-        }
-        return false;
-    }
-
-    /**
-     * Determines whether a given revision contains a template starting with the given fragment
-     *
-     * @param revId
-     * @param templateFragment
-     * @return {@code true} if the revision contains {@code templateFragment}, {@code false} otherwise.
-     * @throws WikiApiException
-     */
-    public boolean revisionContainsTemplateFragment(int revId, String templateFragment) throws WikiApiException {
-        List<String> tplList = getTemplateNamesFromRevision(revId);
-        for (String tpl : tplList) {
-            if (tpl.toLowerCase().startsWith(templateFragment.toLowerCase())) {
-                return true;
-            }
-        }
-        return false;
-    }
-
-    /**
-     * Does the same as {@link #revisionContainsTemplateFragment(int, String)} without using a template index
-     *
-     * @param revId
-     * @param templateName
-     * @return {@code true} if the revision contains {@code templateName}, {@code false} otherwise.
-     * @throws WikiApiException
-     */
-    public boolean revisionContainsTemplateNameWithoutIndex(int revId, String templateName) throws WikiApiException {
-        if (revApi == null) {
-            revApi = new RevisionApi(wiki.getDatabaseConfiguration());
-        }
-        if (parser == null) {
-            //TODO switch to SWEBLE
-            MediaWikiParserFactory pf = new MediaWikiParserFactory(wiki.getDatabaseConfiguration().getLanguage());
-            pf.setTemplateParserClass(ShowTemplateNamesAndParameters.class);
-            parser = pf.createParser();
+        return false;
     }
-        List