Skip to content

Commit

Permalink
Merge pull request teragrep#4 from StrongestNumber9/refactor_delimiters
Browse files Browse the repository at this point in the history
Refactor delimiters
  • Loading branch information
kortemik authored Oct 10, 2023
2 parents 85e9e1a + 6c6a4df commit 0ee2979
Show file tree
Hide file tree
Showing 6 changed files with 83 additions and 87 deletions.
7 changes: 7 additions & 0 deletions src/main/java/com/teragrep/blf_01/Delimiter.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Objects;

public class Delimiter {
Expand Down Expand Up @@ -80,4 +81,10 @@ public boolean equals(Object o) {
public int hashCode() {
// The hash is computed from delimiterBuffer alone; no other field participates.
// NOTE(review): ByteBuffer.hashCode() depends only on the buffer's remaining
// bytes, so repositioning delimiterBuffer changes this value.
final int bufferHash = Objects.hash(this.delimiterBuffer);
return bufferHash;
}

/**
 * Wraps this delimiter in a freshly created single-entry map.
 *
 * <p>The entry's key is this delimiter's {@code delimiterBuffer} and its value is
 * this instance. A new {@link HashMap} is built on every call.
 *
 * <p>NOTE(review): the key is the live {@code delimiterBuffer}, not a copy.
 * {@link ByteBuffer} equality and hash depend on the remaining bytes, so the
 * buffer should not be repositioned while it serves as a key.
 *
 * @return a new mutable map holding exactly {@code delimiterBuffer -> this}
 */
public HashMap<ByteBuffer, Delimiter> asMap() {
final HashMap<ByteBuffer, Delimiter> singleEntry = new HashMap<>();
singleEntry.put(this.delimiterBuffer, this);
return singleEntry;
}
}
5 changes: 3 additions & 2 deletions src/main/java/com/teragrep/blf_01/Delimiters.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,9 @@

package com.teragrep.blf_01;

import java.util.ArrayList;
import java.nio.ByteBuffer;
import java.util.HashMap;

/**
 * Source of the {@link Delimiter} instances a scanner works with.
 */
public interface Delimiters {
// List-based accessor: every delimiter as an element of an ArrayList.
ArrayList<Delimiter> getDelimiters();
// Map-based accessor: delimiters looked up by a ByteBuffer key.
HashMap<ByteBuffer, Delimiter> getDelimiters();
}
89 changes: 45 additions & 44 deletions src/main/java/com/teragrep/blf_01/MajorDelimiters.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,58 +46,59 @@

package com.teragrep.blf_01;

import java.util.*;
import java.nio.ByteBuffer;
import java.util.HashMap;

/**
 * {@link Delimiters} implementation holding a fixed set of "major" delimiters:
 * whitespace characters, punctuation and brackets, percent-encoded sequences
 * such as {@code %20} and {@code %2520}, and the two-character {@code --}.
 *
 * <p>The set is populated once in the constructor and handed out as-is by
 * {@link #getDelimiters()}.
 */
public class MajorDelimiters implements Delimiters {

private final ArrayList<Delimiter> delimiterSet;
private final HashMap<ByteBuffer, Delimiter> delimiterSet;
MajorDelimiters() {
this.delimiterSet = new ArrayList<>();
this.delimiterSet = new HashMap<>();

// List-based registration: each delimiter is appended as an element.
delimiterSet.add(new Delimiter("\t"));
delimiterSet.add(new Delimiter("\n"));
delimiterSet.add(new Delimiter("\r"));
delimiterSet.add(new Delimiter(" "));
delimiterSet.add(new Delimiter("!"));
delimiterSet.add(new Delimiter("\""));
delimiterSet.add(new Delimiter("%0A"));
delimiterSet.add(new Delimiter("%20"));
delimiterSet.add(new Delimiter("%21"));
delimiterSet.add(new Delimiter("%2520"));
delimiterSet.add(new Delimiter("%2526"));
delimiterSet.add(new Delimiter("%26"));
delimiterSet.add(new Delimiter("%28"));
delimiterSet.add(new Delimiter("%29"));
delimiterSet.add(new Delimiter("%2B"));
delimiterSet.add(new Delimiter("%2C"));
delimiterSet.add(new Delimiter("%3A"));
delimiterSet.add(new Delimiter("%3B"));
delimiterSet.add(new Delimiter("%3D"));
delimiterSet.add(new Delimiter("%5B"));
delimiterSet.add(new Delimiter("%5D"));
delimiterSet.add(new Delimiter("%7C"));
delimiterSet.add(new Delimiter("&"));
delimiterSet.add(new Delimiter("'"));
delimiterSet.add(new Delimiter("|"));
delimiterSet.add(new Delimiter("("));
delimiterSet.add(new Delimiter(")"));
delimiterSet.add(new Delimiter("*"));
delimiterSet.add(new Delimiter("+"));
delimiterSet.add(new Delimiter(","));
delimiterSet.add(new Delimiter("--"));
delimiterSet.add(new Delimiter(";"));
delimiterSet.add(new Delimiter("<"));
delimiterSet.add(new Delimiter(">"));
delimiterSet.add(new Delimiter("?"));
delimiterSet.add(new Delimiter("["));
delimiterSet.add(new Delimiter("]"));
delimiterSet.add(new Delimiter("{"));
delimiterSet.add(new Delimiter("\\"));
delimiterSet.add(new Delimiter("}"));
// Map-based registration: each delimiter contributes the single-entry map
// returned by its asMap(), which is merged in with putAll.
delimiterSet.putAll(new Delimiter("\t").asMap());
delimiterSet.putAll(new Delimiter("\n").asMap());
delimiterSet.putAll(new Delimiter("\r").asMap());
delimiterSet.putAll(new Delimiter(" ").asMap());
delimiterSet.putAll(new Delimiter("!").asMap());
delimiterSet.putAll(new Delimiter("\"").asMap());
delimiterSet.putAll(new Delimiter("%0A").asMap());
delimiterSet.putAll(new Delimiter("%20").asMap());
delimiterSet.putAll(new Delimiter("%21").asMap());
delimiterSet.putAll(new Delimiter("%2520").asMap());
delimiterSet.putAll(new Delimiter("%2526").asMap());
delimiterSet.putAll(new Delimiter("%26").asMap());
delimiterSet.putAll(new Delimiter("%28").asMap());
delimiterSet.putAll(new Delimiter("%29").asMap());
delimiterSet.putAll(new Delimiter("%2B").asMap());
delimiterSet.putAll(new Delimiter("%2C").asMap());
delimiterSet.putAll(new Delimiter("%3A").asMap());
delimiterSet.putAll(new Delimiter("%3B").asMap());
delimiterSet.putAll(new Delimiter("%3D").asMap());
delimiterSet.putAll(new Delimiter("%5B").asMap());
delimiterSet.putAll(new Delimiter("%5D").asMap());
delimiterSet.putAll(new Delimiter("%7C").asMap());
delimiterSet.putAll(new Delimiter("&").asMap());
delimiterSet.putAll(new Delimiter("'").asMap());
delimiterSet.putAll(new Delimiter("|").asMap());
delimiterSet.putAll(new Delimiter("(").asMap());
delimiterSet.putAll(new Delimiter(")").asMap());
delimiterSet.putAll(new Delimiter("*").asMap());
delimiterSet.putAll(new Delimiter("+").asMap());
delimiterSet.putAll(new Delimiter(",").asMap());
delimiterSet.putAll(new Delimiter("--").asMap());
delimiterSet.putAll(new Delimiter(";").asMap());
delimiterSet.putAll(new Delimiter("<").asMap());
delimiterSet.putAll(new Delimiter(">").asMap());
delimiterSet.putAll(new Delimiter("?").asMap());
delimiterSet.putAll(new Delimiter("[").asMap());
delimiterSet.putAll(new Delimiter("]").asMap());
delimiterSet.putAll(new Delimiter("{").asMap());
delimiterSet.putAll(new Delimiter("\\").asMap());
delimiterSet.putAll(new Delimiter("}").asMap());
}

// Returns the internal collection itself, not a copy; callers share it.
@Override
public ArrayList<Delimiter> getDelimiters() {
public HashMap<ByteBuffer, Delimiter> getDelimiters() {
return delimiterSet;
}
}
31 changes: 16 additions & 15 deletions src/main/java/com/teragrep/blf_01/MinorDelimiters.java
Original file line number Diff line number Diff line change
Expand Up @@ -46,31 +46,32 @@

package com.teragrep.blf_01;

import java.util.ArrayList;
import java.nio.ByteBuffer;
import java.util.HashMap;

/**
 * {@link Delimiters} implementation holding a fixed set of "minor" delimiters,
 * all single characters: {@code # $ % - . / : = @ \ _}.
 *
 * <p>The set is populated once in the constructor and handed out as-is by
 * {@link #getDelimiters()}.
 */
public class MinorDelimiters implements Delimiters {

private final ArrayList<Delimiter> delimiterSet;
private final HashMap<ByteBuffer, Delimiter> delimiterSet;

MinorDelimiters() {
this.delimiterSet = new ArrayList<>();
this.delimiterSet = new HashMap<>();

// List-based registration: each delimiter is appended as an element.
delimiterSet.add(new Delimiter("#"));
delimiterSet.add(new Delimiter("$"));
delimiterSet.add(new Delimiter("%"));
delimiterSet.add(new Delimiter("-"));
delimiterSet.add(new Delimiter("."));
delimiterSet.add(new Delimiter("/"));
delimiterSet.add(new Delimiter(":"));
delimiterSet.add(new Delimiter("="));
delimiterSet.add(new Delimiter("@"));
delimiterSet.add(new Delimiter("\\"));
delimiterSet.add(new Delimiter("_"));
// Map-based registration: each delimiter contributes the single-entry map
// returned by its asMap(), which is merged in with putAll.
delimiterSet.putAll(new Delimiter("#").asMap());
delimiterSet.putAll(new Delimiter("$").asMap());
delimiterSet.putAll(new Delimiter("%").asMap());
delimiterSet.putAll(new Delimiter("-").asMap());
delimiterSet.putAll(new Delimiter(".").asMap());
delimiterSet.putAll(new Delimiter("/").asMap());
delimiterSet.putAll(new Delimiter(":").asMap());
delimiterSet.putAll(new Delimiter("=").asMap());
delimiterSet.putAll(new Delimiter("@").asMap());
delimiterSet.putAll(new Delimiter("\\").asMap());
delimiterSet.putAll(new Delimiter("_").asMap());

}

// Returns the internal collection itself, not a copy; callers share it.
@Override
public ArrayList<Delimiter> getDelimiters() {
public HashMap<ByteBuffer, Delimiter> getDelimiters() {
return delimiterSet;
}
}
28 changes: 7 additions & 21 deletions src/main/java/com/teragrep/blf_01/TokenScan.java
Original file line number Diff line number Diff line change
Expand Up @@ -54,19 +54,21 @@ public class TokenScan {
public final Delimiters delimiters;

private final ByteBuffer windowBuffer;
private final Delimiter stubDelimiter;

/**
 * Creates a scanner bound to the given delimiters.
 *
 * <p>Sizes the internal window buffer to the largest
 * {@code delimiterBuffer.capacity()} found among the delimiters and prepares a
 * single reusable delimiter instance for the "no match" case.
 *
 * @param delimiters the delimiters this scanner will look for
 */
TokenScan(Delimiters delimiters) {
this.delimiters = delimiters;

// create windowBuffer with size of longest delimiter

// NOTE(review): if delimiters is empty, size stays Integer.MIN_VALUE and
// ByteBuffer.allocateDirect rejects a negative capacity with
// IllegalArgumentException — confirm an empty set is never passed.
int size = Integer.MIN_VALUE;
for (Delimiter delimiter : delimiters.getDelimiters()) {
if (delimiter.delimiterBuffer.capacity() > size) {
for(Delimiter delimiter : delimiters.getDelimiters().values()) {
if(delimiter.delimiterBuffer.capacity() > size) {
size = delimiter.delimiterBuffer.capacity();
}
}

this.windowBuffer = ByteBuffer.allocateDirect(size);
// Presumably the no-arg Delimiter is the stub (isStub == true) that match()
// uses as its lookup default — verify against the Delimiter constructor.
this.stubDelimiter = new Delimiter();
}

public ArrayList<Token> findBy(Stream stream) {
Expand Down Expand Up @@ -168,34 +170,18 @@ private ByteBuffer extendBuffer(ByteBuffer byteBuffer, int size) {
// -----

/**
 * Finds the delimiter equal to {@code matchBuffer}, retrying with shorter
 * buffers when there is no exact match.
 *
 * <p>When the lookup yields a stub and {@code matchBuffer.limit()} is greater
 * than 1, a slice of {@code matchBuffer} with its limit reduced by one is
 * matched recursively.
 *
 * @param delimiters  the delimiters to search
 * @param matchBuffer the bytes to look up
 * @return the delimiter found, or a stub delimiter when nothing matched
 */
Delimiter match(Delimiters delimiters, ByteBuffer matchBuffer) {
Delimiter rv = getOrCreateDelimiter(delimiters, matchBuffer);

Delimiter rv = delimiters.getDelimiters().getOrDefault(matchBuffer, this.stubDelimiter);
if (rv.isStub) {
// search for a smaller delimiter
// TODO get delimiter by size
if (matchBuffer.limit() > 1) {
// NOTE(review): the new limit is derived from matchBuffer.limit(), not from
// the slice's own limit; the two agree only when matchBuffer.position() is 0
// — confirm callers guarantee that.
ByteBuffer sliceBuffer = matchBuffer.slice();
ByteBuffer subMatchBuffer = (ByteBuffer) sliceBuffer.limit(matchBuffer.limit() - 1);
Delimiter sub = match(delimiters, subMatchBuffer);
if (!sub.isStub) {
if (sub.delimiterBuffer.capacity() > rv.delimiterBuffer.capacity()) {
rv = sub;
}
}
rv = match(delimiters, subMatchBuffer);
}
}

return rv;
}

/**
 * Scans the delimiters in iteration order and returns the first one whose
 * {@code delimiterBuffer} equals {@code matchBuffer}.
 *
 * @param delimiters  the delimiters to search
 * @param matchBuffer the bytes to compare against each delimiter's buffer
 * @return the first matching delimiter, or a newly constructed {@code Delimiter}
 *         from the no-arg constructor when none matches
 */
Delimiter getOrCreateDelimiter(Delimiters delimiters, ByteBuffer matchBuffer) {
for (Delimiter candidate : delimiters.getDelimiters()) {
final boolean sameBytes = matchBuffer.equals(candidate.delimiterBuffer);
if (!sameBytes) {
continue;
}
return candidate;
}
// Nothing matched: hand back a fresh instance rather than null.
return new Delimiter();
}
}


10 changes: 5 additions & 5 deletions src/test/java/com/teragrep/blf_01/PerformanceTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -69,13 +69,13 @@ public void testAll() throws FileNotFoundException {
ArrayList<Token> majorTokens = majorTokenScan.findBy(stream);

ArrayList<Token> allTokens = new ArrayList<>(majorTokens);

Delimiters minorDelimiters = new MinorDelimiters();
for (Token token : majorTokens) {
ByteArrayInputStream tokenBais = new ByteArrayInputStream(token.bytes);

Stream tokenStream = new Stream(tokenBais);

TokenScan minorTokenScan = new TokenScan(new MinorDelimiters());
TokenScan minorTokenScan = new TokenScan(minorDelimiters);

ArrayList<Token> minorTokens = minorTokenScan.findBy(tokenStream);

Expand All @@ -101,13 +101,13 @@ public void testAllBig() throws FileNotFoundException {
ArrayList<Token> majorTokens = majorTokenScan.findBy(stream);

ArrayList<Token> allTokens = new ArrayList<>(majorTokens);

Delimiters minorDelimiters = new MinorDelimiters();
for (Token token : majorTokens) {
ByteArrayInputStream tokenBais = new ByteArrayInputStream(token.bytes);

Stream tokenStream = new Stream(tokenBais);

TokenScan minorTokenScan = new TokenScan(new MinorDelimiters());
TokenScan minorTokenScan = new TokenScan(minorDelimiters);

ArrayList<Token> minorTokens = minorTokenScan.findBy(tokenStream);

Expand Down Expand Up @@ -149,7 +149,7 @@ public void testSmall() {
allTokens.addAll(tokenized);
}
Instant end = Instant.now();
float duration = (float) ChronoUnit.MILLIS.between(start, end)/1000;
float duration = (float) ChronoUnit.MICROS.between(start, end)/1_000_000;
System.out.println("Time taken: " + duration + " seconds");
System.out.println("Tokens: " + allTokens.size() + " (" + allTokens.size()/duration + "/s)");
}
Expand Down

0 comments on commit 0ee2979

Please sign in to comment.