From 408a3015dc578f2598c14645b942f04c9042d7ce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Pacheco=20Neves?= Date: Thu, 8 Apr 2021 15:48:20 +0100 Subject: [PATCH] =?UTF-8?q?Add=20`String`=20multi-replace=20via=20`Scanner?= =?UTF-8?q?`=20=E2=8F=A9=20(#227)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sometimes we need to replace multiple characters in a given `String`, but both `replacingOccurrences()` and `replacingCharacters` operate on a single `String`/`Character`, requiring multiple passes and thus becoming very inefficient. One such example is to replace all line breaking characters in a string into their non line breaking version, which requires 6 substitution "passes" (space, hyphen, em dash, en dash, question mark and closing brace). By using a `Scanner` as a matching mechanism, we can implement multi-replace in a single pass on the string, greatly improving efficiency. ## Changes - Add new `String.replacingOccurrencesOfCharacters(in:skippingCharactersIn:)` extension to allow replacing multiple characters in a string in a single pass. - Add new `String.nonLineBreaking(replacingNewlinesWith:)` extension to convert a string into a non line breaking version and allow tweaking the newline replacement behavior. - Create new `Character.newlines` helper to contain all `Character`s in `CharacterSet.newlines`. 
--- Alicerce.xcodeproj/project.pbxproj | 4 + Sources/Extensions/Foundation/Character.swift | 14 ++ Sources/Extensions/Foundation/String.swift | 97 ++++++++++++ .../Foundation/StringTestCase.swift | 149 +++++++++++++++++- 4 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 Sources/Extensions/Foundation/Character.swift diff --git a/Alicerce.xcodeproj/project.pbxproj b/Alicerce.xcodeproj/project.pbxproj index 9feb1e77..f46d3320 100644 --- a/Alicerce.xcodeproj/project.pbxproj +++ b/Alicerce.xcodeproj/project.pbxproj @@ -161,6 +161,7 @@ 0A77982920FCCD24008E269A /* RetryTestCase.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0A77982820FCCD24008E269A /* RetryTestCase.swift */; }; 0A77982F20FFF29D008E269A /* Retry.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0A77982E20FFF29D008E269A /* Retry.swift */; }; 0A79686120812130005738AF /* LockTestCase.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0ACEB2992080F0E5000D95AD /* LockTestCase.swift */; }; + 0A7ACC852527467B00AA2213 /* Character.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0A7ACC842527467B00AA2213 /* Character.swift */; }; 0A7B504D20B632FA005A08E7 /* *.alicerce.mindera.com.pem in Resources */ = {isa = PBXBuildFile; fileRef = 0A7B504C20B632FA005A08E7 /* *.alicerce.mindera.com.pem */; }; 0A7B505020B6D346005A08E7 /* SecCertificate+PublicKey.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0A7B504E20B6D2C4005A08E7 /* SecCertificate+PublicKey.swift */; }; 0A7B505220B6D769005A08E7 /* SecCertificate+PublicKeyTestCase.swift in Sources */ = {isa = PBXBuildFile; fileRef = 0A7B505120B6D769005A08E7 /* SecCertificate+PublicKeyTestCase.swift */; }; @@ -491,6 +492,7 @@ 0A76A004209F854C00D46B63 /* Route+TrieNode_IsEmptyAndDescriptionTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = "Route+TrieNode_IsEmptyAndDescriptionTests.swift"; sourceTree = ""; }; 0A77982820FCCD24008E269A /* RetryTestCase.swift */ = {isa = PBXFileReference; lastKnownFileType = 
sourcecode.swift; path = RetryTestCase.swift; sourceTree = ""; }; 0A77982E20FFF29D008E269A /* Retry.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Retry.swift; sourceTree = ""; }; + 0A7ACC842527467B00AA2213 /* Character.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = Character.swift; sourceTree = ""; }; 0A7B504C20B632FA005A08E7 /* *.alicerce.mindera.com.pem */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; path = "*.alicerce.mindera.com.pem"; sourceTree = ""; }; 0A7B504E20B6D2C4005A08E7 /* SecCertificate+PublicKey.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = "SecCertificate+PublicKey.swift"; sourceTree = ""; }; 0A7B505120B6D769005A08E7 /* SecCertificate+PublicKeyTestCase.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = "SecCertificate+PublicKeyTestCase.swift"; sourceTree = ""; }; @@ -806,6 +808,7 @@ 0A3C2C8E1EA7E18500EFB7D4 /* String.swift */, 0A3C2C8F1EA7E18500EFB7D4 /* Thread.swift */, 1B4D4CB61F05016B00FA4260 /* URLRequest.swift */, + 0A7ACC842527467B00AA2213 /* Character.swift */, ); path = Foundation; sourceTree = ""; @@ -2011,6 +2014,7 @@ 0A3C2DB71EA7E5DD00EFB7D4 /* CollectionReusableView.swift in Sources */, 9D4E3AA1239A6557007F3050 /* CollectionReusableViewSizer.swift in Sources */, 4838FE3123A94CE0007311F0 /* Array+ConstrainableProxy.swift in Sources */, + 0A7ACC852527467B00AA2213 /* Character.swift in Sources */, 0A266F201ED374F5009CD0D7 /* AssertDumpsEqual.swift in Sources */, 0ACEB2922080E6D4000D95AD /* Atomic.swift in Sources */, 0A83885E1EB1F6B000C1E835 /* NSPersistentStoreCoordinator+CoreDataStack.swift in Sources */, diff --git a/Sources/Extensions/Foundation/Character.swift b/Sources/Extensions/Foundation/Character.swift new file mode 100644 index 00000000..e24e0388 --- /dev/null +++ b/Sources/Extensions/Foundation/Character.swift @@ -0,0 +1,14 @@ +import Foundation + +extension Character { + 
+ static let lineSeparator: Character = "\u{2028}" + static let nonBreakingSpace: Character = "\u{00a0}" + static let nonBreakingHyphen: Character = "\u{2011}" + static let wordJoiner: Character = "\u{2060}" + static let emDash: Character = "\u{2014}" // — (U+2014 EM DASH) + static let enDash: Character = "\u{2013}" // – (U+2013 EN DASH) + + // from `CharacterSet.newlines` + static let newlines: [Character] = ["\u{A}", "\u{B}", "\u{C}", "\u{D}", "\u{85}", "\u{2028}", "\u{2029}"] +} diff --git a/Sources/Extensions/Foundation/String.swift b/Sources/Extensions/Foundation/String.swift index 3656e32c..33d435fc 100644 --- a/Sources/Extensions/Foundation/String.swift +++ b/Sources/Extensions/Foundation/String.swift @@ -59,3 +59,100 @@ public extension String { dump(x, to: &self) } } + +extension String { + + /// Replaces occurrences of multiple `Character`s with corresponding `String` values using the given mapping, while + /// skipping (filtering out) an optional set of characters from the output. Being backed by a `Scanner`, a single + /// pass is made over the receiver. + /// + /// - Parameters: + /// - replacementMap: A dictionary containing the replacement mapping `Character` -> `String`. + /// - charactersToBeSkipped: An optional set of characters to skip (i.e. filter out from the input). + /// - Returns: A modified version of the receiver with the replacement mapping applied. + public func replacingOccurrencesOfCharacters( + in replacementMap: [Character: String], + skippingCharactersIn charactersToBeSkipped: CharacterSet? = nil + ) -> String { + + guard !replacementMap.isEmpty else { return self } + + let matchSet = CharacterSet(charactersIn: replacementMap.keys.reduce(into: "") { $0 += String($1) }) + .union(charactersToBeSkipped ?? CharacterSet()) + + var final = "" + + let scanner = Scanner(string: self) + scanner.charactersToBeSkipped = charactersToBeSkipped + + while !scanner.isAtEnd { + + // copy everything until finding a character to be replaced or skipped + var collector: NSString? 
= "" + if scanner.scanUpToCharacters(from: matchSet, into: &collector), let collector = collector { + final.append(collector as String) + } + + // exit early if we're already at the end + guard !scanner.isAtEnd else { break } + + // find and replace matching character if needed + replacementMap + .first { match, _ in scanner.scanString(String(match), into: nil) } + .flatMap { _, replacement in final.append(replacement) } + } + + return final + } +} + +extension String { + + public static let nonBreakingSpace = String(Character.nonBreakingSpace) + public static let nonBreakingHyphen = String(Character.nonBreakingHyphen) + public static let wordJoiner = String(Character.wordJoiner) + public static let emDash = String(Character.emDash) + public static let enDash = String(Character.enDash) + + /// Returns a non line breaking version of `self`. Line breaking characters occurrences are replaced with + /// corresponding non line breaking variants when existent. Otherwise, word joiner characters are attached to them + /// to make them non line breaking. Existing newlines can be replaced by any given string, via the optional + /// `newlineCharacterReplacement` parameter (defaults to `nil`, which preserves newlines). 
+ /// + /// The character mapping is: + /// - space (" ") -> non breaking space (`U+00A0`) + /// - hyphen ("-") -> non breaking hyphen (`U+2011`) + /// - em dash ("—") -> word joiner (`U+2060`) + em dash + word joiner (`U+2060`) + /// - en dash ("–") -> word joiner (`U+2060`) + en dash + word joiner (`U+2060`) + /// - question mark ("?") -> question mark + word joiner (`U+2060`) + /// - closing brace ("}") -> closing brace + word joiner (`U+2060`) + /// + /// The `newlineCharacterReplacement` acts upon the characters specified in `CharacterSet.newlines` + /// (`U+000A ~ U+000D`, `U+0085`, `U+2028`, and `U+2029`), some example values are: + /// - `nil` -> newlines are preserved + /// - `""` -> newlines are stripped + /// - `String.nonBreakingSpace` -> output a single line + /// + /// - Parameter newlineCharacterReplacement: The replacement string to use for newline characters (defaults to + /// `nil`). + /// - Returns: A modified version of the receiver without line breaking characters. + public func nonLineBreaking(replacingNewlinesWith newlineCharacterReplacement: String? = nil) -> String { + + let newlineReplacementMap = newlineCharacterReplacement + .flatMap { replacement in Dictionary(uniqueKeysWithValues: Character.newlines.map { ($0, replacement) }) } + ?? [:] + + return replacingOccurrencesOfCharacters( + in: [ + " ": String.nonBreakingSpace, + "-": String.nonBreakingHyphen, + .emDash: String([.wordJoiner, .emDash, .wordJoiner]), + .enDash: String([.wordJoiner, .enDash, .wordJoiner]), + "?": "?" 
+ .wordJoiner, + "}": "}" + .wordJoiner + ] + .merging(newlineReplacementMap) { $1 }, + skippingCharactersIn: nil + ) + } +} diff --git a/Tests/AlicerceTests/Extensions/Foundation/StringTestCase.swift b/Tests/AlicerceTests/Extensions/Foundation/StringTestCase.swift index 072ad41d..9f62c0f6 100644 --- a/Tests/AlicerceTests/Extensions/Foundation/StringTestCase.swift +++ b/Tests/AlicerceTests/Extensions/Foundation/StringTestCase.swift @@ -66,5 +66,152 @@ class StringTestCase: XCTestCase { XCTAssertEqual(intDump, dumpString) } - + + // replacingOccurrencesOfCharacters(in:skippingCharactersIn:) + + func testReplacingOccurrencesOfCharacters_WithEmptyMap_ShouldReturnSelf() { + + let text = "The quick brown fox jumps over the lazy dog" + + XCTAssertEqual(text.replacingOccurrencesOfCharacters(in: [:], skippingCharactersIn: nil), text) + } + + func testReplacingOccurrencesOfCharacters_WithMatchingCharactersInSingleEntryMapAndNilSkippingCharacterSet_ShouldReplaceOccurrences() { + + let original = "The quick brown fox jumps over the lazy dog" + let expected = "The_quick_brown_fox_jumps_over_the_lazy_dog" + + XCTAssertEqual( + original.replacingOccurrencesOfCharacters(in: [.init(" "): "_"], skippingCharactersIn: nil), + expected + ) + } + + func testReplacingOccurrencesOfCharacters_WithMatchingCharactersInMultiEntryMapAndNilSkippingCharacterSet_ShouldReplaceOccurrences() { + + let original = "0123456789ABCDEF" + let expected = "0123456789abcdef" + + XCTAssertEqual( + original.replacingOccurrencesOfCharacters( + in: [ + .init("A"): "a", + .init("B"): "b", + .init("C"): "c", + .init("D"): "d", + .init("E"): "e", + .init("F"): "f", + ], + skippingCharactersIn: nil + ), + expected + ) + } + + func testReplacingOccurrencesOfCharacters_WithMatchingCharactersInMapAndMatchingCharactersInSkippingCharacterSet_ShouldReplaceOccurrencesAndSkip() { + + let original = "0123456789ABCDEF_0A0B0C0D0E0F0" + let expected = "abcdef_abcdef" + + XCTAssertEqual( + 
original.replacingOccurrencesOfCharacters( + in: [ + .init("A"): "a", + .init("B"): "b", + .init("C"): "c", + .init("D"): "d", + .init("E"): "e", + .init("F"): "f", + ], + skippingCharactersIn: .decimalDigits + ), + expected + ) + } + + // nonLineBreaking() + + func testNonLineBreaking_WithNoLineBreakingCharactersInString_ShouldReturnSelf() { + + let original = "0123456789ABCDEF" + + XCTAssertEqual(original.nonLineBreaking(), original) + } + + func testNonLineBreaking_WithLineBreakingCharactersInString_ShouldReturnANonLineBreakingVersion() { + + let original = "The quick-brown\(String.emDash)fox\(String.enDash)jumps?over{the}lazy dog" + let expected = + """ + The\(String.nonBreakingSpace)quick\(String.nonBreakingHyphen)brown\ + \(String([.wordJoiner, .emDash, .wordJoiner]))fox\ + \(String([.wordJoiner, .enDash, .wordJoiner]))jumps\ + ?\(String.wordJoiner)over{the}\(String.wordJoiner)lazy\(String.nonBreakingSpace)dog + """ + + XCTAssertEqual(original.nonLineBreaking(), expected) + } + + func testNonLineBreaking_WithLineBreakingCharactersAndNewlinesInStringAndNilNewlineReplacement_ShouldReturnANonLineBreakingVersionAndPreserveNewlines() { + + let original = + """ + \nThe quick-brown\u{85}\(String.emDash)fox\n\(String.enDash)jumps?\u{2028}\u{2029}over{the}lazy dog\n\ + \u{A}.\u{B},\u{C};\u{D} + """ + + let expected = + """ + \nThe\(String.nonBreakingSpace)quick\(String.nonBreakingHyphen)brown\u{85}\ + \(String([.wordJoiner, .emDash, .wordJoiner]))fox\n\ + \(String([.wordJoiner, .enDash, .wordJoiner]))jumps\ + ?\(String.wordJoiner)\u{2028}\u{2029}over\ + {the}\(String.wordJoiner)lazy\(String.nonBreakingSpace)dog\n\ + \u{A}.\u{B},\u{C};\u{D} + """ + + XCTAssertEqual(original.nonLineBreaking(replacingNewlinesWith: nil), expected) + } + + func testNonLineBreaking_WithLineBreakingCharactersAndNewlinesInStringAndEmptyStringNewlineReplacement_ShouldReturnANonLineBreakingVersionAndReplaceNewlines() { + + let original = + """ + \nThe 
quick-brown\u{85}\(String.emDash)fox\n\(String.enDash)jumps?\u{2028}\u{2029}over{the}lazy dog\n\ + \u{A}.\u{B},\u{C};\u{D} + """ + + let expected = + """ + The\(String.nonBreakingSpace)quick\(String.nonBreakingHyphen)brown\ + \(String([.wordJoiner, .emDash, .wordJoiner]))fox\ + \(String([.wordJoiner, .enDash, .wordJoiner]))jumps\ + ?\(String.wordJoiner)over\ + {the}\(String.wordJoiner)lazy\(String.nonBreakingSpace)dog\ + .,; + """ + + XCTAssertEqual(original.nonLineBreaking(replacingNewlinesWith: ""), expected) + } + + func testNonLineBreaking_WithLineBreakingCharactersAndNewlinesInStringAndNonNilStringNewlineReplacement_ShouldReturnANonLineBreakingVersionAndReplaceNewlines() { + + let original = + """ + \nThe quick-brown\u{85}\(String.emDash)fox\n\(String.enDash)jumps?\u{2028}\u{2029}over{the}lazy dog\n\ + \u{A}.\u{B},\u{C};\u{D} + """ + + let expected = + """ + 🦊The\(String.nonBreakingSpace)quick\(String.nonBreakingHyphen)brown🦊\ + \(String([.wordJoiner, .emDash, .wordJoiner]))fox🦊\ + \(String([.wordJoiner, .enDash, .wordJoiner]))jumps\ + ?\(String.wordJoiner)🦊🦊over\ + {the}\(String.wordJoiner)lazy\(String.nonBreakingSpace)dog🦊\ + 🦊.🦊,🦊;🦊 + """ + + XCTAssertEqual(original.nonLineBreaking(replacingNewlinesWith: "🦊"), expected) + } }