diff --git a/bundle/pom.xml b/bundle/pom.xml index 5ea54793a4..8fffe60f11 100644 --- a/bundle/pom.xml +++ b/bundle/pom.xml @@ -58,6 +58,7 @@ com.day.cq.wcm.workflow.process;version="[6.0,8)", com.day.cq.mailer;version="[5.9,7)", com.adobe.cq.sightly;version="[2.5,4)", + javax.annotation;resolution:=optional, * guava @@ -283,7 +284,16 @@ 3.16 jar - + + org.apache.tika + tika-core + 1.14 + + + org.apache.tika + tika-parsers + 1.14 + org.mockito diff --git a/bundle/src/main/java/com/adobe/acs/commons/mcp/impl/processes/BrokenLinksReport.java b/bundle/src/main/java/com/adobe/acs/commons/mcp/impl/processes/BrokenLinksReport.java index 00f8938782..cd2a0a8f2d 100644 --- a/bundle/src/main/java/com/adobe/acs/commons/mcp/impl/processes/BrokenLinksReport.java +++ b/bundle/src/main/java/com/adobe/acs/commons/mcp/impl/processes/BrokenLinksReport.java @@ -1,18 +1,37 @@ +/* + * Copyright 2017 Adobe. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ package com.adobe.acs.commons.mcp.impl.processes; import com.adobe.acs.commons.fam.ActionManager; import com.adobe.acs.commons.mcp.ProcessDefinition; import com.adobe.acs.commons.mcp.ProcessInstance; +import com.adobe.acs.commons.mcp.form.CheckboxComponent; import com.adobe.acs.commons.mcp.form.FormField; import com.adobe.acs.commons.mcp.form.PathfieldComponent; import com.adobe.acs.commons.mcp.model.GenericReport; import com.adobe.acs.commons.util.visitors.TreeFilteringResourceVisitor; -import org.apache.sling.api.resource.LoginException; -import org.apache.sling.api.resource.PersistenceException; -import org.apache.sling.api.resource.ResourceResolver; -import org.apache.sling.api.resource.ResourceUtil; +import org.apache.sling.api.resource.*; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.html.HtmlParser; +import org.apache.tika.sax.Link; +import org.apache.tika.sax.LinkContentHandler; import javax.jcr.RepositoryException; +import java.io.ByteArrayInputStream; import java.io.Serializable; import java.util.*; import java.util.regex.Pattern; @@ -20,6 +39,8 @@ import java.util.stream.Stream; /** + * Broken Links Checker MCP task + * * @author Yegor Kozlov */ public class BrokenLinksReport extends ProcessDefinition implements Serializable { @@ -44,12 +65,29 @@ public class BrokenLinksReport extends ProcessDefinition implements Serializable options = {"default=cq:allowedTemplates"}) private String excludeProperties; + @FormField( + name = "Deep check in html", + description = "If checked, links will be extracted from html field", + component = CheckboxComponent.class, + options = {"checked"} + ) + private boolean deepCheck = false; + + @FormField(name = "Fields containing html", + description = "Properties containing html to extract links", + required = false, + options = {"default=text"}) + private String htmlFields; + transient private Set excludeList; + transient private Set deepCheckList; 
transient private Pattern regex; @Override public void init() throws RepositoryException { excludeList = Arrays.stream(excludeProperties.split(",")).map(String::trim).collect(Collectors.toSet()); + deepCheckList = deepCheck ? Arrays.stream(htmlFields.split(",")).map(String::trim).collect(Collectors.toSet()) + : new HashSet<>(); regex = Pattern.compile(propertyRegex); } @@ -64,7 +102,7 @@ enum REPORT { @Override public void buildProcess(ProcessInstance instance, ResourceResolver rr) throws LoginException, RepositoryException { report.setName(instance.getName()); - instance.defineAction("Collect Broken References", rr, this::collectBrokenLinks); + instance.defineAction("Collect Broken References", rr, this::buildReport); instance.getInfo().setDescription(sourcePath); } @@ -77,43 +115,97 @@ public void storeReport(ProcessInstance instance, ResourceResolver rr) throws Re } - public void collectBrokenLinks(ActionManager manager) { + public void buildReport(ActionManager manager) { TreeFilteringResourceVisitor visitor = new TreeFilteringResourceVisitor(); visitor.setBreadthFirstMode(); visitor.setTraversalFilter(null); visitor.setResourceVisitor((resource, depth) -> { - ResourceResolver resolver = resource.getResourceResolver(); - resource.getValueMap().entrySet().stream() - .filter(entry -> !excludeList.contains(entry.getKey())) - .filter(entry -> entry.getValue() instanceof String || entry.getValue() instanceof String[]) - .forEach(entry -> { - - List paths = collectPaths(entry.getValue()) - .filter(path -> ResourceUtil.isNonExistingResource(resolver.resolve(path))) - .collect(Collectors.toList()); - if (!paths.isEmpty()) { - String propertyPath = resource.getPath() + "/" + entry.getKey(); - reportData.put(propertyPath, new EnumMap<>(REPORT.class)); - reportData.get(propertyPath).put(REPORT.reference, paths.stream().collect(Collectors.joining(","))); - } - - }); + Map> brokenRefs = collectBrokenReferences(resource, regex, excludeList, deepCheckList); + for(Map.Entry> 
ref : brokenRefs.entrySet()){ + String propertyPath = ref.getKey(); + List refs = ref.getValue(); + reportData.put(propertyPath, new EnumMap<>(REPORT.class)); + reportData.get(propertyPath).put(REPORT.reference, refs.stream().collect(Collectors.joining(","))); + + } }); manager.deferredWithResolver(rr -> visitor.accept(rr.getResource(sourcePath))); } - Stream collectPaths(Object p) { + /** + * Collect references from a JCR property. + * A property can be one of: + * + * A string containing a reference, e.g., fileReference=/content/dam/image.png. + * An array of strings, e.g., fileReference=[/content/dam/image1.png, /content/dam/image2.png] + * An html fragment containing links, e.g., + * + * <p> + * <a href="/content/site/page.html">hello</a> + * <img src="/content/dam/image1.png">hello</a> + * </p> + * + * + * + * + * @param property an entry from a ValueMap + * @param htmlFields list of properties containing html + * @return stream containing extracted references + */ + static Stream collectPaths(Map.Entry property, Set htmlFields) { + Object p = property.getValue(); + Stream stream; - if (p.getClass().isArray()) { + if (p.getClass() == String[].class) { stream = Arrays.stream((String[]) p); + } else if (p.getClass() == String.class){ + stream = Stream.of((String) p); } else { - stream = Stream.of(p.toString()); + stream = Stream.empty(); + } + if (htmlFields.contains(property.getKey())) { + stream = stream.flatMap(val -> { + try { + // parse html and extract links via underlying tagsoup library + LinkContentHandler linkHandler = new LinkContentHandler(); + HtmlParser parser = new HtmlParser(); + parser.parse(new ByteArrayInputStream(val.getBytes("utf-8")), linkHandler, new Metadata(), new ParseContext()); + return linkHandler.getLinks().stream().map(Link::getUri); + } catch (Exception e) { + return Stream.empty(); + } + }); } - return stream.filter(val -> regex.matcher(val).matches()); + return stream; } - // access from unit tsts - Map> getReportData(){ + /** + * 
Collect broken references from properties of the given resource + * + * @param resource the resource to check + * @param regex regex to detect properties containing references. Set from @FormField + * @param skipList properties to ignore. Set from @FormField + * @param htmlFields fields containing html. + * @return broken references keyed by property. The value is a List because a property can contain multiple links, + * e.g. if it is multivalued or it is html containing multiple links. + */ + static Map> collectBrokenReferences(Resource resource, Pattern regex, Set skipList, Set htmlFields) { + + return resource.getValueMap().entrySet().stream() + .filter(entry -> !skipList.contains(entry.getKey())) + .collect(Collectors.toMap( + entry -> resource.getPath() + "/" + entry.getKey(), + entry -> { + List brokenPaths = collectPaths(entry, htmlFields) + .filter(href -> regex.matcher(href).matches()) + .filter(path -> ResourceUtil.isNonExistingResource(resource.getResourceResolver().resolve(path))) + .collect(Collectors.toList()); + return brokenPaths; + })).entrySet().stream().filter(e -> !e.getValue().isEmpty()) + .collect(Collectors.toMap(e -> e.getKey(), e -> e.getValue())); + } + // access from unit tests + Map> getReportData() { return reportData; } } diff --git a/bundle/src/test/java/com/adobe/acs/commons/mcp/impl/TestGenericReportExcelServlet.java b/bundle/src/test/java/com/adobe/acs/commons/mcp/impl/TestGenericReportExcelServlet.java index 63a6cbbea3..0ee6300e2a 100644 --- a/bundle/src/test/java/com/adobe/acs/commons/mcp/impl/TestGenericReportExcelServlet.java +++ b/bundle/src/test/java/com/adobe/acs/commons/mcp/impl/TestGenericReportExcelServlet.java @@ -66,6 +66,7 @@ public void testReport() throws Exception { slingContext.addModelsForClasses(GenericReport.class); GenericReportExcelServlet servlet = new GenericReportExcelServlet(); + servlet.doGet(request, response); assertEquals("application/vnd.ms-excel", response.getContentType()); diff --git 
a/bundle/src/test/java/com/adobe/acs/commons/mcp/impl/processes/BrokenLinksTest.java b/bundle/src/test/java/com/adobe/acs/commons/mcp/impl/processes/BrokenLinksTest.java index 38f71e6f46..d579311ef9 100644 --- a/bundle/src/test/java/com/adobe/acs/commons/mcp/impl/processes/BrokenLinksTest.java +++ b/bundle/src/test/java/com/adobe/acs/commons/mcp/impl/processes/BrokenLinksTest.java @@ -21,13 +21,12 @@ import com.adobe.acs.commons.mcp.ControlledProcessManager; import com.adobe.acs.commons.mcp.impl.AbstractResourceImpl; import com.adobe.acs.commons.mcp.impl.ProcessInstanceImpl; -import org.apache.sling.api.resource.LoginException; -import org.apache.sling.api.resource.NonExistingResource; -import org.apache.sling.api.resource.ResourceMetadata; -import org.apache.sling.api.resource.ResourceResolver; +import org.apache.sling.api.resource.*; import org.apache.sling.testing.mock.sling.ResourceResolverType; import org.apache.sling.testing.mock.sling.junit.SlingContext; import static com.adobe.acs.commons.mcp.impl.processes.BrokenLinksReport.REPORT; +import static com.adobe.acs.commons.mcp.impl.processes.BrokenLinksReport.collectBrokenReferences; +import static com.adobe.acs.commons.mcp.impl.processes.BrokenLinksReport.collectPaths; import org.junit.Before; import org.junit.Rule; import org.junit.Test; @@ -36,9 +35,9 @@ import javax.jcr.Session; import javax.jcr.security.AccessControlManager; import javax.jcr.security.Privilege; -import java.util.EnumMap; -import java.util.HashMap; -import java.util.Map; +import java.util.*; +import java.util.regex.Pattern; +import java.util.stream.Collectors; import static com.adobe.acs.commons.fam.impl.ActionManagerTest.*; import static org.junit.Assert.assertEquals; @@ -55,7 +54,7 @@ public class BrokenLinksTest { @Rule public final SlingContext slingContext = new SlingContext(ResourceResolverType.RESOURCERESOLVER_MOCK); - BrokenLinksReport tool; + private BrokenLinksReport tool; @Before public void setup() { @@ -151,4 +150,63 @@ private 
ControlledProcessManager getControlledProcessManager() throws LoginExcep when(cpm.getServiceResourceResolver()).thenReturn(getMockResolver()); return cpm; } + + @Test + public void testCollectPaths(){ + Set htmlFields = new HashSet<>(); + htmlFields.add("text"); + + assertEquals(Arrays.asList("/ref1"), collectPaths(property("fileReference", "/ref1"), htmlFields).collect(Collectors.toList())); + assertEquals(Arrays.asList("/ref1", "/ref2"), collectPaths(property("fileReference", new String[]{"/ref1", "/ref2"}), htmlFields).collect(Collectors.toList())); + assertEquals(Arrays.asList("/ref1"), collectPaths(property("text", "hello"), htmlFields).collect(Collectors.toList())); + } + + private Map.Entry property(String key, Object value){ + return new Map.Entry() { + @Override + public String getKey() { + return key; + } + + @Override + public Object getValue() { + return value; + } + + @Override + public Object setValue(Object value) { + return null; + } + }; + } + + @Test + public void testCollectBrokenReferences(){ + Pattern ptrn = Pattern.compile("/content/.+"); + Set skipList = new HashSet<>(Arrays.asList("skip1", "skip2")); + Set htmlFields = new HashSet<>(Arrays.asList("text")); + slingContext.build() + .resource("/test1", + "p1", "/content/ref1", + "p2", "/content/ref2", + "p3", new String[]{"/content/ref1"}, + "p4", new String[]{"/content/ref1", "/content/ref2"}, + "skip1", "/content/ref2") + .resource("/test2", + "text", "hellohello", + "skip2", "hellohello") + .resource("/content/ref1") + .commit(); + + Map> refs1 = collectBrokenReferences(slingContext.resourceResolver().getResource("/test1"), ptrn, skipList, htmlFields); + assertEquals(2, refs1.size()); + assertEquals(Arrays.asList("/content/ref2"), refs1.get("/test1/p2")); + assertEquals(Arrays.asList("/content/ref2"), refs1.get("/test1/p4")); + + Map> refs2 = collectBrokenReferences(slingContext.resourceResolver().getResource("/test2"), ptrn, skipList, htmlFields); + assertEquals(1, refs2.size()); + 
assertEquals(Arrays.asList("/content/ref2", "/content/ref3"), refs2.get("/test2/text")); + } + + }
+ * <p> + * <a href="/content/site/page.html">hello</a> + * <img src="/content/dam/image1.png">hello</a> + * </p> + *
hello
hellohello