diff --git a/beans/src/main/java/fr/pilato/elasticsearch/crawler/fs/beans/Attributes.java b/beans/src/main/java/fr/pilato/elasticsearch/crawler/fs/beans/Attributes.java index c19037128..106b23710 100644 --- a/beans/src/main/java/fr/pilato/elasticsearch/crawler/fs/beans/Attributes.java +++ b/beans/src/main/java/fr/pilato/elasticsearch/crawler/fs/beans/Attributes.java @@ -30,10 +30,12 @@ public class Attributes { static public final class FIELD_NAMES { public static final String OWNER = "owner"; public static final String GROUP = "group"; + public static final String PERMISSIONS = "permissions"; } private String owner; private String group; + private int permissions; public String getOwner() { return owner; @@ -50,4 +52,12 @@ public String getGroup() { public void setGroup(String group) { this.group = group; } + + public int getPermissions() { + return permissions; + } + + public void setPermissions(int permissions) { + this.permissions = permissions; + } } diff --git a/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java b/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java index a2cd3a37e..4b53cd282 100644 --- a/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java +++ b/core/src/main/java/fr/pilato/elasticsearch/crawler/fs/FsParser.java @@ -238,27 +238,27 @@ private void addFilesRecursively(FileAbstractor path, String filepath, LocalD if (children != null) { for (FileAbstractModel child : children) { - String filename = child.name; + String filename = child.getName(); // https://github.com/dadoonet/fscrawler/issues/1 : Filter documents boolean isIndexable = isIndexable(filename, fsSettings.getFs().getIncludes(), fsSettings.getFs().getExcludes()); // It can happen that we a dir "foo" which does not match the include name like "*.txt" // We need to go in it unless it has been explicitly excluded by the user - if (child.directory && !isExcluded(filename, fsSettings.getFs().getExcludes())) { + if (child.isDirectory() && 
!isExcluded(filename, fsSettings.getFs().getExcludes())) { isIndexable = true; } logger.debug("[{}] can be indexed: [{}]", filename, isIndexable); if (isIndexable) { - if (child.file) { + if (child.isFile()) { logger.debug(" - file: {}", filename); fsFiles.add(filename); - if (child.lastModifiedDate.isAfter(lastScanDate) || - (child.creationDate != null && child.creationDate.isAfter(lastScanDate))) { + if (child.getLastModifiedDate().isAfter(lastScanDate) || + (child.getCreationDate() != null && child.getCreationDate().isAfter(lastScanDate))) { try { indexFile(child, stats, filepath, - fsSettings.getFs().isIndexContent() || fsSettings.getFs().isStoreSource() ? path.getInputStream(child) : null, child.size); + fsSettings.getFs().isIndexContent() || fsSettings.getFs().isStoreSource() ? path.getInputStream(child) : null, child.getSize()); stats.addFile(); } catch (java.io.FileNotFoundException e) { if (fsSettings.getFs().isContinueOnError()) { @@ -269,18 +269,18 @@ private void addFilesRecursively(FileAbstractor path, String filepath, LocalD } } else { logger.debug(" - not modified: creation date {} , file date {}, last scan date {}", - child.creationDate, child.lastModifiedDate, lastScanDate); + child.getCreationDate(), child.getLastModifiedDate(), lastScanDate); } - } else if (child.directory) { + } else if (child.isDirectory()) { logger.debug(" - folder: {}", filename); if (fsSettings.getFs().isIndexFolders()) { - fsFolders.add(child.fullpath); - indexDirectory(child.fullpath); + fsFolders.add(child.getFullpath()); + indexDirectory(child.getFullpath()); } - addFilesRecursively(path, child.fullpath, lastScanDate); + addFilesRecursively(path, child.getFullpath(), lastScanDate); } else { logger.debug(" - other: {}", filename); - logger.debug("Not a file nor a dir. Skipping {}", child.fullpath); + logger.debug("Not a file nor a dir. 
Skipping {}", child.getFullpath()); } } else { logger.debug(" - ignored file/dir: {}", filename); @@ -409,10 +409,10 @@ private Collection getFolderDirectory(String path) throws Exception { */ private void indexFile(FileAbstractModel fileAbstractModel, ScanStatistic stats, String dirname, InputStream inputStream, long filesize) throws Exception { - final String filename = fileAbstractModel.name; - final LocalDateTime lastmodified = fileAbstractModel.lastModifiedDate; - final String extension = fileAbstractModel.extension; - final long size = fileAbstractModel.size; + final String filename = fileAbstractModel.getName(); + final LocalDateTime lastmodified = fileAbstractModel.getLastModifiedDate(); + final String extension = fileAbstractModel.getExtension(); + final long size = fileAbstractModel.getSize(); logger.debug("fetching content from [{}],[{}]", dirname, filename); @@ -447,8 +447,11 @@ private void indexFile(FileAbstractModel fileAbstractModel, ScanStatistic stats, // Attributes if (fsSettings.getFs().isAttributesSupport()) { doc.setAttributes(new Attributes()); - doc.getAttributes().setOwner(fileAbstractModel.owner); - doc.getAttributes().setGroup(fileAbstractModel.group); + doc.getAttributes().setOwner(fileAbstractModel.getOwner()); + doc.getAttributes().setGroup(fileAbstractModel.getGroup()); + if (fileAbstractModel.getPermissions() >= 0) { + doc.getAttributes().setPermissions(fileAbstractModel.getPermissions()); + } } // Attributes diff --git a/crawler/crawler-abstract/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/FileAbstractModel.java b/crawler/crawler-abstract/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/FileAbstractModel.java index fbe0b605e..82d4194f3 100644 --- a/crawler/crawler-abstract/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/FileAbstractModel.java +++ b/crawler/crawler-abstract/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/FileAbstractModel.java @@ -42,17 +42,82 @@ import java.time.LocalDateTime; 
public class FileAbstractModel { - public String name; - public boolean file; - public boolean directory; - public LocalDateTime lastModifiedDate; - public LocalDateTime creationDate; - public String path; - public String fullpath; - public long size; - public String owner; - public String group; - public String extension; + private String name; + private boolean file; + private boolean directory; + private LocalDateTime lastModifiedDate; + private LocalDateTime creationDate; + private String path; + private String fullpath; + private long size; + private String owner; + private String group; + private int permissions; + private String extension; + + public FileAbstractModel(String name, boolean file, LocalDateTime lastModifiedDate, LocalDateTime creationDate, + String extension, String path, String fullpath, long size, String owner, String group, int permissions) { + this.name = name; + this.file = file; + this.directory = !file; + this.lastModifiedDate = lastModifiedDate; + this.creationDate = creationDate; + this.path = path; + this.fullpath = fullpath; + this.size = size; + this.owner = owner; + this.group = group; + this.permissions = permissions; + this.extension = extension; + } + + public String getName() { + return name; + } + + public boolean isFile() { + return file; + } + + public boolean isDirectory() { + return directory; + } + + public LocalDateTime getLastModifiedDate() { + return lastModifiedDate; + } + + public LocalDateTime getCreationDate() { + return creationDate; + } + + public String getPath() { + return path; + } + + public String getFullpath() { + return fullpath; + } + + public long getSize() { + return size; + } + + public String getOwner() { + return owner; + } + + public String getGroup() { + return group; + } + + public int getPermissions() { + return permissions; + } + + public String getExtension() { + return extension; + } @Override public String toString() { @@ -64,6 +129,7 @@ public String toString() { ", path='" + path + '\'' + 
", owner='" + owner + '\'' + ", group='" + group + '\'' + + ", permissions=" + permissions + ", extension='" + extension + '\'' + ", fullpath='" + fullpath + '\'' + ", size=" + size + diff --git a/crawler/crawler-fs/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/fs/FileAbstractorFile.java b/crawler/crawler-fs/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/fs/FileAbstractorFile.java index 2352885d0..93c036e1b 100644 --- a/crawler/crawler-fs/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/fs/FileAbstractorFile.java +++ b/crawler/crawler-fs/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/fs/FileAbstractorFile.java @@ -35,6 +35,7 @@ import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.getCreationTime; import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.getFileExtension; +import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.getFilePermissions; import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.getGroupName; import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.getOwnerName; @@ -45,25 +46,23 @@ public FileAbstractorFile(FsSettings fsSettings) { @Override public FileAbstractModel toFileAbstractModel(String path, File file) { - FileAbstractModel model = new FileAbstractModel(); - model.name = file.getName(); - model.file = file.isFile(); - model.directory = !model.file; - model.lastModifiedDate = LocalDateTime.ofInstant(Instant.ofEpochMilli(file.lastModified()), ZoneId.systemDefault()); - model.creationDate = getCreationTime(file); - model.extension = getFileExtension(file); - model.path = path; - model.fullpath = file.getAbsolutePath(); - model.size = file.length(); - model.owner = getOwnerName(file); - model.group = getGroupName(file); - - return model; + return new FileAbstractModel( + file.getName(), + file.isFile(), + LocalDateTime.ofInstant(Instant.ofEpochMilli(file.lastModified()), ZoneId.systemDefault()), + 
getCreationTime(file), + getFileExtension(file), + path, + file.getAbsolutePath(), + file.length(), + getOwnerName(file), + getGroupName(file), + getFilePermissions(file)); } @Override public InputStream getInputStream(FileAbstractModel file) throws Exception { - return new FileInputStream(new File(file.fullpath)); + return new FileInputStream(new File(file.getFullpath())); } @Override @@ -95,12 +94,12 @@ public boolean exists(String dir) { } @Override - public void open() throws Exception { + public void open() { // Do nothing because we don't open resources in the File implementation. } @Override - public void close() throws Exception { + public void close() { // Do nothing because we don't open resources in the File implementation. } } diff --git a/crawler/crawler-ssh/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/ssh/FileAbstractorSSH.java b/crawler/crawler-ssh/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/ssh/FileAbstractorSSH.java index e25171cd6..424f985f9 100644 --- a/crawler/crawler-ssh/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/ssh/FileAbstractorSSH.java +++ b/crawler/crawler-ssh/src/main/java/fr/pilato/elasticsearch/crawler/fs/crawler/ssh/FileAbstractorSSH.java @@ -27,6 +27,7 @@ import fr.pilato.elasticsearch.crawler.fs.crawler.FileAbstractor; import fr.pilato.elasticsearch.crawler.fs.settings.FsSettings; import fr.pilato.elasticsearch.crawler.fs.settings.Server; +import org.apache.commons.io.FilenameUtils; import java.io.InputStream; import java.time.Instant; @@ -47,23 +48,25 @@ public FileAbstractorSSH(FsSettings fsSettings) { @Override public FileAbstractModel toFileAbstractModel(String path, ChannelSftp.LsEntry file) { - FileAbstractModel model = new FileAbstractModel(); - model.name = file.getFilename(); - model.directory = file.getAttrs().isDir(); - model.file = !model.directory; - // We are using here the local TimeZone as a reference. 
If the remote system is under another TZ, this might cause issues - model.lastModifiedDate = LocalDateTime.ofInstant(Instant.ofEpochMilli(file.getAttrs().getMTime()*1000L), ZoneId.systemDefault()); - model.path = path; - model.fullpath = model.path.concat("/").concat(model.name); - model.size = file.getAttrs().getSize(); - model.owner = Integer.toString(file.getAttrs().getUId()); - model.group = Integer.toString(file.getAttrs().getGId()); - return model; + return new FileAbstractModel( + file.getFilename(), + !file.getAttrs().isDir(), // the constructor expects the "is file" flag and derives "is directory" as its negation + // We are using here the local TimeZone as a reference. If the remote system is under another TZ, this might cause issues + LocalDateTime.ofInstant(Instant.ofEpochMilli(file.getAttrs().getMTime()*1000L), ZoneId.systemDefault()), + // We don't have the creation date + null, + FilenameUtils.getExtension(file.getFilename()), + path, + path.concat("/").concat(file.getFilename()), + file.getAttrs().getSize(), + Integer.toString(file.getAttrs().getUId()), + Integer.toString(file.getAttrs().getGId()), + file.getAttrs().getPermissions()); } @Override public InputStream getInputStream(FileAbstractModel file) throws Exception { - return sftp.get(file.fullpath); + return sftp.get(file.getFullpath()); } @SuppressWarnings("unchecked") diff --git a/docs/source/admin/fs/elasticsearch.rst b/docs/source/admin/fs/elasticsearch.rst index 6190ac30d..1986ea7d9 100644 --- a/docs/source/admin/fs/elasticsearch.rst +++ b/docs/source/admin/fs/elasticsearch.rst @@ -560,129 +560,123 @@ It will prompt you for the password. 
Enter the certificate password like ``chang Generated fields ^^^^^^^^^^^^^^^^ -FSCrawler creates the following fields : - -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| Field | Description | Example | Javadoc | -+========================+======================+==============================================+=====================================================================+ -| ``content`` | Extracted content | ``"This is my text!"`` | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``attachment`` | BASE64 encoded | BASE64 Encoded document | | -| | binary file | | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.author`` | Author if any in | ``"David Pilato"`` | `CREATOR `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.title`` | Title if any in | ``"My document title"`` | `TITLE `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.date`` | Last modified date | ``"2013-04-04T15:21:35"`` | `MODIFIED `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.keywords`` | Keywords if any in | ``["fs","elasticsearch"]`` | `KEYWORDS `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| 
``meta.language`` | Language (can be | ``"fr"`` | `LANGUAGE `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.format`` | Format of the media | ``"application/pdf; version=1.6"`` | `FORMAT `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.identifier`` | URL/DOI/ISBN for | ``"FOOBAR"`` | `IDENTIFIER `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.contributor`` | Contributor | ``"foo bar"`` | `CONTRIBUTOR `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.coverage`` | Coverage | ``"FOOBAR"`` | `COVERAGE `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.modifier`` | Last author | ``"David Pilato"`` | `MODIFIER `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.creator_tool`` | Tool used to create | ``"HTML2PDF- TCPDF"`` | `CREATOR_TOOL `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.publisher`` | Publisher: person, | ``"elastic"`` | `PUBLISHER `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.relation`` | Related 
resource | ``"FOOBAR"`` | `RELATION `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.rights`` | Information about | ``"CC-BY-ND"`` | `RIGHTS `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.source`` | Source for the | ``"FOOBAR"`` | `SOURCE `__ | -| | (derivated) | | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.type`` | Nature or genre of | ``"Image"`` | `TYPE `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.description`` | An account of the | ``"This is a description"`` | `DESCRIPTION `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.created`` | Date of creation | ``"2013-04-04T15:21:35"`` | `CREATED `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.print_date`` | When was the doc | ``"2013-04-04T15:21:35"`` | `PRINT_DATE `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.metadata_date`` | Last modification of | ``"2013-04-04T15:21:35"`` | `METADATA_DATE `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| 
``meta.latitude`` | The WGS84 Latitude | ``"N 48° 51' 45.81''"`` | `LATITUDE `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.longitude`` | The WGS84 Longitude | ``"E 2° 17'15.331''"`` | `LONGITUDE `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.altitude`` | The WGS84 Altitude | ``""`` | `ALTITUDE `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.rating`` | A user-assigned | ``0`` | `RATING `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.comments`` | Comments | ``"Comments"`` | `COMMENTS `__ | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``meta.raw`` | An object with all | ``"meta.raw.channels": "2"`` | | -| | raw metadata | | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``file.content_type`` | Content Type | ``"application/vnd.oasis.opendocument.text"``| | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``file.last_modified`` | Last modification | ``1386855978000`` | | -| | date | | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| 
``file.indexing_date`` | Indexing date | ``"2013-12-12T13:50:58.758Z"`` | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``file.filesize`` | File size in bytes | ``1256362`` | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``file.indexed_chars`` | Extracted chars if | ``100000`` | | -| | ``fs.indexed_chars`` | | | -| | > 0 | | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``file.filename`` | Original file name | ``"mydocument.pdf"`` | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``file.extension`` | Original file name | ``"pdf"`` | | -| | extension (from 2.2) | | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``file.url`` | Original file url | ``"file://tmp/otherdir/mydocument.pdf"`` | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``file.checksum`` | Checksum if | ``"c32eafae2587bef4b3b32f73743c3c61"`` | | -| | ``fs.checksum`` set | | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``path.virtual`` | Relative path from | ``"/otherdir/mydocument.pdf"`` | | 
-+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``path.root`` | MD5 encoded parent | ``"112aed83738239dbfe4485f024cd4ce1"`` | | -| | path (internal use) | | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``path.real`` | Real path name | ``"/tmp/otherdir/mydocument.pdf"`` | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``attributes.owner`` | Owner name | ``"david"`` | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ -| ``attributes.group`` | Group name | ``"staff"`` | | -+------------------------+----------------------+----------------------------------------------+---------------------------------------------------------------------+ +FSCrawler may create the following fields depending on configuration and available data: + ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| Field | Description | Example | Javadoc | ++============================+========================================+==============================================+=====================================================================+ +| ``content`` | Extracted content | ``"This is my text!"`` | | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``attachment`` | BASE64 encoded binary file | BASE64 Encoded document | | +| | | | | 
++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.author`` | Author if any in document metadata | ``"David Pilato"`` | `CREATOR `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.title`` | Title if any in document metadata | ``"My document title"`` | `TITLE `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.date`` | Last modified date | ``"2013-04-04T15:21:35"`` | `MODIFIED `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.keywords`` | Keywords if any in document metadata | ``["fs","elasticsearch"]`` | `KEYWORDS `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.language`` | Language (can be detected) | ``"fr"`` | `LANGUAGE `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.format`` | Format of the media | ``"application/pdf; version=1.6"`` | `FORMAT `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.identifier`` | URL/DOI/ISBN for example | ``"FOOBAR"`` | `IDENTIFIER `__ | 
++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.contributor`` | Contributor | ``"foo bar"`` | `CONTRIBUTOR `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.coverage`` | Coverage | ``"FOOBAR"`` | `COVERAGE `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.modifier`` | Last author | ``"David Pilato"`` | `MODIFIER `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.creator_tool`` | Tool used to create the resource | ``"HTML2PDF- TCPDF"`` | `CREATOR_TOOL `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.publisher`` | Publisher: person or organisation | ``"elastic"`` | `PUBLISHER `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.relation`` | Related resource | ``"FOOBAR"`` | `RELATION `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.rights`` | Information about rights | ``"CC-BY-ND"`` | `RIGHTS `__ | 
++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.source`` | Source for the current document | ``"FOOBAR"`` | `SOURCE `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.type`` | Nature or genre of the content | ``"Image"`` | `TYPE `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.description`` | An account of the content | ``"This is a description"`` | `DESCRIPTION `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.created`` | Date of creation | ``"2013-04-04T15:21:35"`` | `CREATED `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.print_date`` | When was the doc last printed? 
| ``"2013-04-04T15:21:35"`` | `PRINT_DATE `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.metadata_date`` | Last modification of metadata | ``"2013-04-04T15:21:35"`` | `METADATA_DATE `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.latitude`` | The WGS84 Latitude of the Point | ``"N 48° 51' 45.81''"`` | `LATITUDE `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.longitude`` | The WGS84 Longitude of the Point | ``"E 2° 17'15.331''"`` | `LONGITUDE `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.altitude`` | The WGS84 Altitude of the Point | ``""`` | `ALTITUDE `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.rating`` | A user-assigned rating -1, [0..5] | ``0`` | `RATING `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.comments`` | Comments | ``"Comments"`` | `COMMENTS `__ | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``meta.raw`` | An object with all raw metadata | ``"meta.raw.channels": "2"`` | | 
++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``file.content_type`` | Content Type | ``"application/vnd.oasis.opendocument.text"``| | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``file.last_modified`` | Last modification date | ``1386855978000`` | | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``file.indexing_date`` | Indexing date | ``"2013-12-12T13:50:58.758Z"`` | | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``file.filesize`` | File size in bytes | ``1256362`` | | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``file.indexed_chars`` | Extracted chars | ``100000`` | | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``file.filename`` | Original file name | ``"mydocument.pdf"`` | | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``file.extension`` | Original file name extension | ``"pdf"`` | | 
++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``file.url`` | Original file url | ``"file://tmp/otherdir/mydocument.pdf"`` | | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``file.checksum`` | Checksum | ``"c32eafae2587bef4b3b32f73743c3c61"`` | | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``path.virtual`` | Relative path from the crawled root | ``"/otherdir/mydocument.pdf"`` | | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``path.root`` | MD5 encoded parent path (internal use) | ``"112aed83738239dbfe4485f024cd4ce1"`` | | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``path.real`` | Real path name | ``"/tmp/otherdir/mydocument.pdf"`` | | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``attributes.owner`` | Owner name | ``"david"`` | | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``attributes.group`` | Group name | ``"staff"`` | | 
++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ +| ``attributes.permissions`` | Permissions | ``764`` | | ++----------------------------+----------------------------------------+----------------------------------------------+---------------------------------------------------------------------+ For more information about meta data, please read the `TikaCoreProperties `__. @@ -690,7 +684,7 @@ Here is a typical JSON document generated by the crawler: .. code:: json - { + { "file":{ "filename":"test.odt", "extension":"odt", @@ -709,7 +703,8 @@ Here is a typical JSON document generated by the crawler: }, "attributes": { "owner": "david", - "group": "staff" + "group": "staff", + "permissions": 764 }, "meta":{ "author":"David Pilato", @@ -723,7 +718,7 @@ Here is a typical JSON document generated by the crawler: "language":"fr" }, "content":"Bonjour David\n\n\n" - } + } .. _search-examples: diff --git a/docs/source/admin/fs/local-fs.rst b/docs/source/admin/fs/local-fs.rst index 934b1c479..494d4c69b 100644 --- a/docs/source/admin/fs/local-fs.rst +++ b/docs/source/admin/fs/local-fs.rst @@ -354,8 +354,8 @@ You can force to use the ``_id`` to be the filename using Adding file attributes ^^^^^^^^^^^^^^^^^^^^^^ -If you want to add file attributes such as ``attributes.owner`` and -``attributes.group``, you can set ``attributes_support`` to ``true``. +If you want to add file attributes such as ``attributes.owner``, ``attributes.group`` +and ``attributes.permissions``, you can set ``attributes_support`` to ``true``. .. code:: json @@ -366,6 +366,11 @@ If you want to add file attributes such as ``attributes.owner`` and } } +.. note:: + + On Windows systems, ``attributes.group`` and ``attributes.permissions`` are + not generated. 
+ Disabling raw metadata ^^^^^^^^^^^^^^^^^^^^^^ diff --git a/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java b/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java index 6a1cb2a7d..d0b54cb14 100644 --- a/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java +++ b/framework/src/main/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtil.java @@ -45,6 +45,7 @@ import java.nio.file.attribute.FileOwnerAttributeView; import java.nio.file.attribute.PosixFileAttributeView; import java.nio.file.attribute.PosixFileAttributes; +import java.nio.file.attribute.PosixFilePermission; import java.time.LocalDateTime; import java.time.ZoneId; import java.time.format.DateTimeFormatter; @@ -54,6 +55,7 @@ import java.util.List; import java.util.Map; import java.util.Properties; +import java.util.Set; import java.util.TimeZone; public class FsCrawlerUtil { @@ -265,6 +267,47 @@ public static String getGroupName(final File file) { } } + /** + * Determines file permissions as a decimal number whose digits are the octal owner, group and others values (for example 764 for rwxrw-r--). 
+ * + * @param file the file to read the POSIX permissions of + * @return the permissions, or -1 on Windows or when they can not be read + */ + public static int getFilePermissions(final File file) { + if (OsValidator.WINDOWS) { + logger.trace("Determining 'permissions' is skipped for file [{}] on [{}]", file, OsValidator.OS); + return -1; + } + try { + final Path path = Paths.get(file.getAbsolutePath()); + PosixFileAttributes attrs = Files.getFileAttributeView(path, PosixFileAttributeView.class).readAttributes(); + Set<PosixFilePermission> permissions = attrs.permissions(); + int user = toOctalPermission( + permissions.contains(PosixFilePermission.OWNER_READ), + permissions.contains(PosixFilePermission.OWNER_WRITE), + permissions.contains(PosixFilePermission.OWNER_EXECUTE)); + int group = toOctalPermission( + permissions.contains(PosixFilePermission.GROUP_READ), + permissions.contains(PosixFilePermission.GROUP_WRITE), + permissions.contains(PosixFilePermission.GROUP_EXECUTE)); + int others = toOctalPermission( + permissions.contains(PosixFilePermission.OTHERS_READ), + permissions.contains(PosixFilePermission.OTHERS_WRITE), + permissions.contains(PosixFilePermission.OTHERS_EXECUTE)); + + return user * 100 + group * 10 + others; + } + catch(Exception e) { + logger.warn("Failed to determine 'permissions' of {}: {}", file, e.getMessage()); + return -1; + } + } + + /** Converts read/write/execute flags into one octal digit (read=4, write=2, execute=1). */ + private static int toOctalPermission(boolean read, boolean write, boolean execute) { + return (read ? 4 : 0) + (write ? 2 : 0) + (execute ? 
1 : 0); + } + private static final String CLASSPATH_RESOURCES_ROOT = "/fr/pilato/elasticsearch/crawler/fs/_default/"; public static final String[] MAPPING_RESOURCES = { "2/_settings.json", "2/_settings_folder.json", diff --git a/framework/src/test/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtilTest.java b/framework/src/test/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtilTest.java new file mode 100644 index 000000000..051939679 --- /dev/null +++ b/framework/src/test/java/fr/pilato/elasticsearch/crawler/fs/framework/FsCrawlerUtilTest.java @@ -0,0 +1,76 @@ +/* + * Licensed to Elasticsearch under one or more contributor + * license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright + * ownership. Elasticsearch licenses this file to you under + * the Apache License, Version 2.0 (the "License"); you may + * not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +package fr.pilato.elasticsearch.crawler.fs.framework; + +import fr.pilato.elasticsearch.crawler.fs.test.framework.AbstractFSCrawlerTestCase; +import org.junit.BeforeClass; +import org.junit.Test; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.attribute.FileAttribute; +import java.nio.file.attribute.PosixFilePermission; +import java.nio.file.attribute.PosixFilePermissions; +import java.util.Set; + +import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.getFilePermissions; +import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.getGroupName; +import static fr.pilato.elasticsearch.crawler.fs.framework.FsCrawlerUtil.getOwnerName; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.is; +import static org.hamcrest.Matchers.isEmptyOrNullString; +import static org.hamcrest.Matchers.not; +import static org.junit.Assume.assumeFalse; + +public class FsCrawlerUtilTest extends AbstractFSCrawlerTestCase { + /** Temporary file shared by all tests; created once, with rwx------ permissions where POSIX attributes are supported. */ + private static File file; + + @BeforeClass + public static void createTmpFile() throws IOException { + Path path = rootTmpDir.resolve("test-group.txt"); + Set<PosixFilePermission> permissions = PosixFilePermissions.fromString("rwx------"); + FileAttribute<Set<PosixFilePermission>> fileAttributes = PosixFilePermissions.asFileAttribute(permissions); + Path created = OsValidator.WINDOWS ? 
Files.createFile(path) : Files.createFile(path, fileAttributes); // POSIX initial attributes are rejected on Windows + file = created.toFile(); + } + + @Test + public void testOwnerName() { + String ownerName = getOwnerName(file); + assertThat(ownerName, not(isEmptyOrNullString())); + } + + @Test + public void testGroups() { + assumeFalse("This test can not run on Windows.", OsValidator.WINDOWS); + String groupName = getGroupName(file); + assertThat(groupName, not(isEmptyOrNullString())); + } + + @Test + public void testPermissions() { + assumeFalse("This test can not run on Windows.", OsValidator.WINDOWS); + int permissions = getFilePermissions(file); + assertThat(permissions, is(700)); 
+ } +} diff --git a/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestAttributesIT.java b/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestAttributesIT.java index 0ec60e763..ccc08cd33 100644 --- a/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestAttributesIT.java +++ b/integration-tests/src/test/java/fr/pilato/elasticsearch/crawler/fs/test/integration/FsCrawlerTestAttributesIT.java @@ -21,6 +21,7 @@ import fr.pilato.elasticsearch.crawler.fs.beans.Attributes; import fr.pilato.elasticsearch.crawler.fs.beans.Doc; +import fr.pilato.elasticsearch.crawler.fs.framework.OsValidator; import fr.pilato.elasticsearch.crawler.fs.settings.Fs; import org.elasticsearch.action.search.SearchRequest; import org.elasticsearch.action.search.SearchResponse; @@ -29,6 +30,9 @@ import static fr.pilato.elasticsearch.crawler.fs.framework.JsonUtil.extractFromPath; import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.greaterThanOrEqualTo; +import static org.hamcrest.Matchers.hasKey; +import static org.hamcrest.Matchers.not; import static org.hamcrest.Matchers.notNullValue; /** @@ -44,6 +48,17 @@ public void test_attributes() throws Exception { SearchResponse searchResponse = countTestHelper(new SearchRequest(getCrawlerName()), 1L, null); for (SearchHit hit : searchResponse.getHits().getHits()) { assertThat(extractFromPath(hit.getSourceAsMap(), Doc.FIELD_NAMES.ATTRIBUTES).get(Attributes.FIELD_NAMES.OWNER), notNullValue()); + if (OsValidator.WINDOWS) { + // We should not have values for group and permissions on Windows OS + assertThat(extractFromPath(hit.getSourceAsMap(), Doc.FIELD_NAMES.ATTRIBUTES), not(hasKey(Attributes.FIELD_NAMES.GROUP))); + assertThat(extractFromPath(hit.getSourceAsMap(), Doc.FIELD_NAMES.ATTRIBUTES), not(hasKey(Attributes.FIELD_NAMES.PERMISSIONS))); + } else { + // We test group and permissions 
only on non Windows OS + assertThat(extractFromPath(hit.getSourceAsMap(), Doc.FIELD_NAMES.ATTRIBUTES).get(Attributes.FIELD_NAMES.GROUP), notNullValue()); + Object permissions = extractFromPath(hit.getSourceAsMap(), Doc.FIELD_NAMES.ATTRIBUTES).get(Attributes.FIELD_NAMES.PERMISSIONS); + assertThat(permissions, notNullValue()); + assertThat((int) permissions, greaterThanOrEqualTo(400)); + } } } }