Skip to content

Commit

Permalink
Issue #2876 - RobotsServlet Improvement (#2882)
Browse files Browse the repository at this point in the history
* Improvement for #2876

Two new OSGi config properties have been introduced:
- allow.page.property.names and
- disallow.page.property.names
to generate ALLOW and DISALLOW rules for single pages (path/page.html)

These can be combined with existing properties
- allow.property.names and
- disallow.property.names
to ALLOW single pages (path/page.html) while DISALLOWing nested paths (path/page/), or vice versa
  • Loading branch information
pahupe authored Aug 22, 2022
1 parent b08b098 commit 42ad0dc
Show file tree
Hide file tree
Showing 6 changed files with 164 additions and 35 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,10 @@ The format is based on [Keep a Changelog](http://keepachangelog.com)
<!-- Keep this up to date! After a release, change the tag name to the latest release -->
[unreleased changes details]: https://github.com/Adobe-Consulting-Services/acs-aem-commons/compare/acs-aem-commons-5.0.14...HEAD

### Added

- #2876 - RobotsServlet: Added configuration options to ALLOW / DISALLOW single pages (.html)

### Changed

- #2874 - Make Marketo Forms Easy to configure
Expand Down
113 changes: 84 additions & 29 deletions bundle/src/main/java/com/adobe/acs/commons/wcm/impl/RobotsServlet.java
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ private void writeFromOsgiConfig(SlingHttpServletRequest request, SlingHttpServl
rules.getSitemaps().stream().map(sitemap -> buildSitemapDirective(sitemap, request, pageManager, request.getResourceResolver())).forEach(writer::println);
if (!rules.getSitemapProperties().isEmpty()) {
addRuleForPageHavingBooleanProperty(page, rules.getSitemapProperties(), writer,
(currentPage) -> buildSitemapDirective(currentPage.getPath(), request, pageManager, request.getResourceResolver()));
(currentPage) -> buildSitemapDirective(currentPage.getPath(), request, pageManager, request.getResourceResolver()), false);
}
} else {
response.sendError(HttpServletResponse.SC_BAD_REQUEST);
Expand All @@ -143,18 +143,28 @@ private void writeGroup(RobotsRuleGroup group, ResourceResolver resourceResolver
PageManager pageManager = page.getPageManager();
group.getUserAgents().stream().map(this::buildUserAgentsDirective).forEach(writer::println);

group.getAllowed().stream().map(allowed -> buildAllowedOrDisallowedDirective(true, allowed, pageManager, resourceResolver)).forEach(writer::println);
group.getAllowed().stream().map(allowed -> buildAllowedOrDisallowedDirective(true, allowed, pageManager, resourceResolver, false)).forEach(writer::println);

List<String> allowProperties = group.getAllowProperties();
if (!allowProperties.isEmpty()) {
addRuleForPageHavingBooleanProperty(page, allowProperties, writer, (currentPage) -> buildAllowedOrDisallowedDirective(true, currentPage.getPath(), pageManager, resourceResolver));
List<String> allowPageProperties = group.getAllowPageProperties();
if (!allowPageProperties.isEmpty()) {
addRuleForPageHavingBooleanProperty(page, allowPageProperties, writer, (currentPage) -> buildAllowedOrDisallowedDirective(true, currentPage.getPath(), pageManager, resourceResolver, true), false);
}

group.getDisallowed().stream().map(disallowed -> buildAllowedOrDisallowedDirective(false, disallowed, page.getPageManager(), resourceResolver)).forEach(writer::println);
List<String> allowPathProperties = group.getAllowPathProperties();
if (!allowPathProperties.isEmpty()) {
addRuleForPageHavingBooleanProperty(page, allowPathProperties, writer, (currentPage) -> buildAllowedOrDisallowedDirective(true, currentPage.getPath(), pageManager, resourceResolver, false), true);
}

group.getDisallowed().stream().map(disallowed -> buildAllowedOrDisallowedDirective(false, disallowed, page.getPageManager(), resourceResolver, false)).forEach(writer::println);

List<String> disallowProperties = group.getDisallowProperties();
if (!disallowProperties.isEmpty()) {
addRuleForPageHavingBooleanProperty(page, disallowProperties, writer, (currentPage) -> buildAllowedOrDisallowedDirective(false, currentPage.getPath(), pageManager, resourceResolver));
List<String> disallowPageProperties = group.getDisallowPageProperties();
if (!disallowPageProperties.isEmpty()) {
addRuleForPageHavingBooleanProperty(page, disallowPageProperties, writer, (currentPage) -> buildAllowedOrDisallowedDirective(false, currentPage.getPath(), pageManager, resourceResolver, true), false);
}

List<String> disallowPathProperties = group.getDisallowPathProperties();
if (!disallowPathProperties.isEmpty()) {
addRuleForPageHavingBooleanProperty(page, disallowPathProperties, writer, (currentPage) -> buildAllowedOrDisallowedDirective(false, currentPage.getPath(), pageManager, resourceResolver, false), true);
}

if (printGroupingComments) {
Expand All @@ -164,13 +174,14 @@ private void writeGroup(RobotsRuleGroup group, ResourceResolver resourceResolver

/**
* Recursively calls {@code ruleBuilderFunc} for the current page and all its children where one of the given propNames is having the boolean value {@code true}
* @param page the page from which to start
* @param resourceResolver the resourceResolver to use
* @param page the page from which to start
* @param propNames the list of property names on the page to evaluate
* @param writer contains the generated output
* @param ruleBuilderFunc generates the rule which is added to {@code writer}
* @param doNotRecurseIfRuleAdded set to false if generating a rule for path X should NOT prevent the recursion
* into subpaths of X.
*/
private void addRuleForPageHavingBooleanProperty(Page page, List<String> propNames, PrintWriter writer, Function<Page, String> ruleBuilderFunc) {
private void addRuleForPageHavingBooleanProperty(Page page, List<String> propNames, PrintWriter writer, Function<Page, String> ruleBuilderFunc, boolean doNotRecurseIfRuleAdded) {
ValueMap pageProps = page.getProperties();
boolean added = false;
for (String prop : propNames) {
Expand All @@ -183,10 +194,14 @@ private void addRuleForPageHavingBooleanProperty(Page page, List<String> propNam
}
}

if (!added) {
//since the rules are added to (dis)allow a page and it's children, we only need to recurse if no rule is added for the current page.
// Don't recurse into nested paths (stop here) if a rule has been added for this path AND this type of rule
// prevents recursing.
// Applies to path-based rules: There is no point in declaring ALLOW <subpath> if ALLOW <path> is declared.
// Same goes for DISALLOW <subpath>
// But for single page rules (ALLOW <page>), the path tree must be walked
if (!(added && doNotRecurseIfRuleAdded)) {
Iterator<Page> pageIterator = page.listChildren(new PageFilter(false, true), false);
pageIterator.forEachRemaining(child -> addRuleForPageHavingBooleanProperty(child, propNames, writer, ruleBuilderFunc));
pageIterator.forEachRemaining(child -> addRuleForPageHavingBooleanProperty(child, propNames, writer, ruleBuilderFunc, doNotRecurseIfRuleAdded));
}

}
Expand Down Expand Up @@ -263,10 +278,28 @@ private String getSitemapUrl(SlingHttpServletRequest request, Page page, Resourc
return sitemapRule;
}

private String buildAllowedOrDisallowedDirective(boolean isAllowed, String allowedOrDisallowedRule, PageManager pageManager, ResourceResolver resourceResolver) {
/**
* Generate an ALLOWED or DISALLOWED rule for either the a folder/path or for a single page
* @param isAllowed
* @param allowedOrDisallowedRule
* @param pageManager
* @param resourceResolver
* @param generatePagePath false: generate rule for the path (default case before this flag was introduced).
* true: generate a rule for a single page
* @return
*/
private String buildAllowedOrDisallowedDirective(boolean isAllowed, String allowedOrDisallowedRule, PageManager pageManager, ResourceResolver resourceResolver, boolean generatePagePath) {
Page page = pageManager.getContainingPage(allowedOrDisallowedRule);
if (page != null) {
allowedOrDisallowedRule = resourceResolver.map(page.getPath()) + "/";
// folder path
if (!generatePagePath) {
allowedOrDisallowedRule = resourceResolver.map(page.getPath()) + "/";
}
// single page
else {
allowedOrDisallowedRule = resourceResolver.map(page.getPath()) + ".html";
}

}
return (isAllowed ? ALLOW : DISALLOW) + allowedOrDisallowedRule;
}
Expand All @@ -289,15 +322,21 @@ private String buildAllowedOrDisallowedDirective(boolean isAllowed, String allow
@AttributeDefinition(name = "Allow Directives", description = "A set of Allow directives to add to the robots file. Each directive is optionally pre-fixed with a ruleGroupName. Syntax: [<ruleGroupName>:]<allowed path>. If the specified path is a valid cq page, resourceResolver.map() will be called prior to adding the rule.")
String[] allow_directives() default {};

@AttributeDefinition(name = "Allow Property Names", description = "A list of boolean page properties which enable generation of an allow directive for that page. Any directives added through this method are in addition to those specified in the allow.directives property. Each property name is optionally pre-fixed with a ruleGroupName. Syntax: [<ruleGroupName>:]<propertyName>")
@AttributeDefinition(name = "Allow Path Property Names", description = "A list of boolean page properties which enable generation of an allow directive for that path. Any directives added through this method are in addition to those specified in the allow.directives property. Each property name is optionally pre-fixed with a ruleGroupName. Syntax: [<ruleGroupName>:]<propertyName>")
String[] allow_property_names() default {};

@AttributeDefinition(name = "Allow Page Property Names", description = "A list of boolean page properties which enable generation of an allow directive for that page. Any directives added through this method are in addition to those specified in the allow.directives property. Each property name is optionally pre-fixed with a ruleGroupName. Syntax: [<ruleGroupName>:]<propertyName>")
String[] allow_page_property_names() default {};

@AttributeDefinition(name = "Disallow Directives", description = "A set of Disallow directives to add to the robots file. Each directive is optionally pre-fixed with a ruleGroupName. Syntax: [<ruleGroupName>:]<disallowed path>. If the specified path is a valid cq page, resourceResolver.map() will be called prior to adding the rule.")
String[] disallow_directives() default {};

@AttributeDefinition(name = "Disallow Property Names", description = "A list of boolean page properties wich enable generation of a disallow directive for that page. Any directives added through this method are in addition to those specified in the disallowed.directives property. Each property name is optionally pre-fixed with a ruleGroupName. Syntax: [<ruleGroupName>:]<propertyName>")
@AttributeDefinition(name = "Disallow Path Property Names", description = "A list of boolean page properties wich enable generation of a disallow directive for that path. Any directives added through this method are in addition to those specified in the disallowed.directives property. Each property name is optionally pre-fixed with a ruleGroupName. Syntax: [<ruleGroupName>:]<propertyName>")
String[] disallow_property_names() default {};

@AttributeDefinition(name = "Disallow Page Property Names", description = "A list of boolean page properties wich enable generation of a disallow directive for that page. Any directives added through this method are in addition to those specified in the disallowed.directives property. Each property name is optionally pre-fixed with a ruleGroupName. Syntax: [<ruleGroupName>:]<propertyName>")
String[] disallow_page_property_names() default {};

@AttributeDefinition(name = "Sitemap Directives", description = "A set of Sitemap directives to add to the robots file. If the specified path is a valid AEM page path (either absolute or relative to the current page), externalizer is called with the specified Externalizer Domain to generate an absolute url to that page's .sitemap.xml, which will resolve to the ACS Commons Site Map Servlet.")
String[] sitemap_directives() default {};

Expand Down Expand Up @@ -333,11 +372,17 @@ public RobotsRuleSet(RobotsServletConfig config) {
String[] disallowed = config.disallow_directives();
processConfig(disallowed, group -> group.disallowed);

String[] disallowProps = config.disallow_property_names();
processConfig(disallowProps, group -> group.disallowProperties);
String[] disallowPathProps = config.disallow_property_names();
processConfig(disallowPathProps, group -> group.disallowPathProperties);

String[] disallowPageProps = config.disallow_page_property_names();
processConfig(disallowPageProps, group -> group.disallowPageProperties);

String[] allowPathProps = config.allow_property_names();
processConfig(allowPathProps, group -> group.allowPathProperties);

String[] allowProps = config.allow_property_names();
processConfig(allowProps, group -> group.allowProperties);
String[] allowPageProps = config.allow_page_property_names();
processConfig(allowPageProps, group -> group.allowPageProperties);
}

private void processConfig(String[] configs, Function<RobotsRuleGroup, List<String>> configListGetFunc) {
Expand Down Expand Up @@ -392,8 +437,10 @@ private class RobotsRuleGroup {
private List<String> userAgents = new ArrayList<>();
private List<String> allowed = new ArrayList<>();
private List<String> disallowed = new ArrayList<>();
private List<String> allowProperties = new ArrayList<>();
private List<String> disallowProperties = new ArrayList<>();
private List<String> allowPathProperties = new ArrayList<>();
private List<String> allowPageProperties = new ArrayList<>();
private List<String> disallowPathProperties = new ArrayList<>();
private List<String> disallowPageProperties = new ArrayList<>();

public String getGroupName() {
return groupName;
Expand All @@ -411,12 +458,20 @@ public List<String> getDisallowed() {
return Collections.unmodifiableList(disallowed);
}

public List<String> getAllowProperties() {
return Collections.unmodifiableList(allowProperties);
public List<String> getAllowPathProperties() {
return Collections.unmodifiableList(allowPathProperties);
}

public List<String> getAllowPageProperties() {
return Collections.unmodifiableList(allowPageProperties);
}

public List<String> getDisallowPathProperties() {
return Collections.unmodifiableList(disallowPathProperties);
}

public List<String> getDisallowProperties() {
return Collections.unmodifiableList(disallowProperties);
public List<String> getDisallowPageProperties() {
return Collections.unmodifiableList(disallowPageProperties);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,16 @@
package com.adobe.acs.commons.wcm.impl;

import com.day.cq.commons.Externalizer;
import com.day.cq.wcm.api.PageManager;
import com.day.cq.wcm.api.PageManagerFactory;

import io.wcm.testing.mock.aem.MockExternalizer;
import io.wcm.testing.mock.aem.MockPageManagerFactory;
import io.wcm.testing.mock.aem.junit.AemContext;
import org.apache.commons.io.IOUtils;
import org.apache.sling.api.resource.ResourceResolver;
import org.apache.sling.testing.mock.sling.ResourceResolverType;
import org.apache.sling.testing.mock.sling.servlet.MockSlingHttpServletRequest;
import org.apache.sling.testing.mock.sling.servlet.MockSlingHttpServletResponse;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.mockito.Mock;
import org.mockito.Mockito;
import org.mockito.junit.MockitoJUnit;
import org.mockito.junit.MockitoRule;

Expand Down Expand Up @@ -122,6 +116,58 @@ public void testWriteFromPageProperties() throws ServletException, IOException {
assertResponse("RobotsServlet_testWriteFromPageProperties.txt", response);
}

/**
* Disallow visiting the page, but allow visiting nested paths
*/
@Test
public void testAllowFolderDisallowPage() throws ServletException, IOException {
Map<String, Object> props = new HashMap<>();
props.put("sling.servlet.resourceTypes", "geometrixx/components/structure/page");
props.put("user.agent.directives", new String[]{
"*"
});
props.put("disallow.page.property.names", new String[]{
"disallowPage",
});
props.put("allow.property.names", new String[]{
"allowFolder"
});
props.put("sitemap.property.names", new String[]{
"isSiteMap"
});

RobotsServlet robotsServlet = context.registerInjectActivateService(new RobotsServlet(), props);
robotsServlet.doGet(request, response);
assertEquals("servlet returned an error", 200, response.getStatus());
assertResponse("RobotsServlet_testAllowFolderAndDisallowPage.txt", response);
}

/**
* Allow visiting the page, but disallow visiting nested paths
*/
@Test
public void testDisallowFolderAndAllowPage() throws ServletException, IOException {
Map<String, Object> props = new HashMap<>();
props.put("sling.servlet.resourceTypes", "geometrixx/components/structure/page");
props.put("user.agent.directives", new String[]{
"*"
});
props.put("disallow.property.names", new String[]{
"disallowFolder",
});
props.put("allow.page.property.names", new String[]{
"allowPage"
});
props.put("sitemap.property.names", new String[]{
"isSiteMap"
});

RobotsServlet robotsServlet = context.registerInjectActivateService(new RobotsServlet(), props);
robotsServlet.doGet(request, response);
assertEquals("servlet returned an error", 200, response.getStatus());
assertResponse("RobotsServlet_testDisallowFolderAndAllowPage.txt", response);
}

@Test
public void testWriteFromOsgiConfigSimple() throws ServletException, IOException {
Map<String, Object> props = new HashMap<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,22 @@
"jcr:primaryType": "cq:PageContent",
"denyGoogle": true
}
},
"allow-page-but-disallow-folder": {
"jcr:primaryType": "cq:Page",
"jcr:content": {
"jcr:primaryType": "cq:PageContent",
"allowPage": true,
"disallowFolder": true
}
},
"allow-folder-but-disallow-page": {
"jcr:primaryType": "cq:Page",
"jcr:content": {
"jcr:primaryType": "cq:PageContent",
"allowFolder": true,
"disallowPage": true
}
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
User-agent: *
Allow: /content/geometrixx/en/allow-folder-but-disallow-page/
Disallow: /content/geometrixx/en/allow-folder-but-disallow-page.html
Sitemap: https://www.geometrixx.com/content/geometrixx/en.sitemap.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
User-agent: *
Allow: /content/geometrixx/en/allow-page-but-disallow-folder.html
Disallow: /content/geometrixx/en/allow-page-but-disallow-folder/
Sitemap: https://www.geometrixx.com/content/geometrixx/en.sitemap.xml

0 comments on commit 42ad0dc

Please sign in to comment.