From 8929d2e599e703fd22a19dbc513bec97f166cc8b Mon Sep 17 00:00:00 2001 From: Henrik Widlund <4659350+henrikwidlund@users.noreply.github.com> Date: Tue, 3 Aug 2021 12:25:40 +0200 Subject: [PATCH 1/3] Refactor naming --- .../BenchmarkCollectionUtilities.cs | 10 ++--- .../BenchmarkHostUtilities.cs | 28 ++++++------ HostsParser.Benchmarks/BenchmarkStreamBase.cs | 4 +- HostsParser.Benchmarks/BenchmarkTestData.cs | 4 +- .../HostsParser.Benchmarks.csproj | 4 +- .../{adguardhosts.txt => adbockbased.txt} | 0 HostsParser.Benchmarks/appsettings.json | 44 ++++++++++--------- .../{sourcehosts.txt => hostsbased.txt} | 0 HostsParser/HostUtilities.cs | 22 +++++----- HostsParser/Program.cs | 26 +++++------ HostsParser/Settings.cs | 15 ++++--- HostsParser/appsettings.json | 44 ++++++++++--------- 12 files changed, 106 insertions(+), 95 deletions(-) rename HostsParser.Benchmarks/{adguardhosts.txt => adbockbased.txt} (100%) rename HostsParser.Benchmarks/{sourcehosts.txt => hostsbased.txt} (100%) diff --git a/HostsParser.Benchmarks/BenchmarkCollectionUtilities.cs b/HostsParser.Benchmarks/BenchmarkCollectionUtilities.cs index 1f65144a..18812ae3 100644 --- a/HostsParser.Benchmarks/BenchmarkCollectionUtilities.cs +++ b/HostsParser.Benchmarks/BenchmarkCollectionUtilities.cs @@ -47,19 +47,19 @@ public abstract class BenchmarkCollectionUtilitiesBase : BenchmarkStreamBase public IEnumerable> Source() { var stream = PrepareStream(); - var source = HostUtilities - .ProcessSource(stream, BenchmarkTestData.Settings.SkipLinesBytes, + var hostsBasedLines = HostUtilities + .ProcessHostsBased(stream, BenchmarkTestData.Settings.HostsBased.SkipLinesBytes, BenchmarkTestData.Decoder).GetAwaiter().GetResult(); stream = PrepareStream(); - var adGuard = HostUtilities.ProcessAdGuard(stream, BenchmarkTestData.Decoder) + var adBlockBasedLines = HostUtilities.ProcessAdBlockBased(stream, BenchmarkTestData.Decoder) .GetAwaiter().GetResult(); stream.Dispose(); - source.UnionWith(adGuard); + 
hostsBasedLines.UnionWith(adBlockBasedLines); - yield return source; + yield return hostsBasedLines; } } } \ No newline at end of file diff --git a/HostsParser.Benchmarks/BenchmarkHostUtilities.cs b/HostsParser.Benchmarks/BenchmarkHostUtilities.cs index 40e57ecd..65095437 100644 --- a/HostsParser.Benchmarks/BenchmarkHostUtilities.cs +++ b/HostsParser.Benchmarks/BenchmarkHostUtilities.cs @@ -10,7 +10,7 @@ namespace HostsParser.Benchmarks { [MemoryDiagnoser] [BenchmarkCategory(nameof(HostUtilities))] - public class BenchmarkProcessSource : BenchmarkStreamBase + public class BenchmarkProcessHostsBased : BenchmarkStreamBase { private Stream _stream; @@ -21,18 +21,18 @@ public class BenchmarkProcessSource : BenchmarkStreamBase public void IterationCleanup() => _stream?.Dispose(); [Benchmark] - [BenchmarkCategory(nameof(ProcessSource), nameof(HostUtilities))] - public async Task> ProcessSource() + [BenchmarkCategory(nameof(ProcessHostsBased), nameof(HostUtilities))] + public async Task> ProcessHostsBased() { - return await HostUtilities.ProcessSource(_stream, - BenchmarkTestData.Settings.SkipLinesBytes, + return await HostUtilities.ProcessHostsBased(_stream, + BenchmarkTestData.Settings.HostsBased.SkipLinesBytes, BenchmarkTestData.Decoder); } } [MemoryDiagnoser] [BenchmarkCategory(nameof(HostUtilities))] - public class BenchmarkProcessAdGuard : BenchmarkStreamBase + public class BenchmarkProcessAdBlockBased : BenchmarkStreamBase { private Stream _stream; @@ -43,9 +43,9 @@ public class BenchmarkProcessAdGuard : BenchmarkStreamBase public void IterationCleanup() => _stream?.Dispose(); [Benchmark] - [BenchmarkCategory(nameof(ProcessAdGuard), nameof(HostUtilities))] - public async Task> ProcessAdGuard() - => await HostUtilities.ProcessAdGuard(_stream, BenchmarkTestData.Decoder); + [BenchmarkCategory(nameof(ProcessAdBlockBased), nameof(HostUtilities))] + public async Task> ProcessAdBlockBased() + => await HostUtilities.ProcessAdBlockBased(_stream, BenchmarkTestData.Decoder); 
} [MemoryDiagnoser] @@ -61,18 +61,18 @@ public void RemoveKnownBadHosts(HashSet data) public IEnumerable> Source() { var stream = PrepareStream(); - var source = HostUtilities - .ProcessSource(stream, BenchmarkTestData.Settings.SkipLinesBytes, + var hostsBasedLines = HostUtilities + .ProcessHostsBased(stream, BenchmarkTestData.Settings.HostsBased.SkipLinesBytes, BenchmarkTestData.Decoder).GetAwaiter().GetResult(); stream = PrepareStream(); - var adGuard = HostUtilities.ProcessAdGuard(stream, BenchmarkTestData.Decoder) + var adBlockBasedLines = HostUtilities.ProcessAdBlockBased(stream, BenchmarkTestData.Decoder) .GetAwaiter().GetResult(); stream.Dispose(); - source.UnionWith(adGuard); - yield return source; + hostsBasedLines.UnionWith(adBlockBasedLines); + yield return hostsBasedLines; } } } \ No newline at end of file diff --git a/HostsParser.Benchmarks/BenchmarkStreamBase.cs b/HostsParser.Benchmarks/BenchmarkStreamBase.cs index 98d619b6..d2f2ec2b 100644 --- a/HostsParser.Benchmarks/BenchmarkStreamBase.cs +++ b/HostsParser.Benchmarks/BenchmarkStreamBase.cs @@ -13,8 +13,8 @@ protected static Stream PrepareStream() var stream = new MemoryStream(); using var sw = new BinaryWriter(stream, Encoding.UTF8, true); - sw.Write(BenchmarkTestData.SourceTestBytes); - sw.Write(BenchmarkTestData.AdGuardTestBytes); + sw.Write(BenchmarkTestData.HostsBasedTestBytes); + sw.Write(BenchmarkTestData.AdBlockBasedTestBytes); sw.Flush(); stream.Seek(0, SeekOrigin.Begin); diff --git a/HostsParser.Benchmarks/BenchmarkTestData.cs b/HostsParser.Benchmarks/BenchmarkTestData.cs index 8f217a91..bb743e95 100644 --- a/HostsParser.Benchmarks/BenchmarkTestData.cs +++ b/HostsParser.Benchmarks/BenchmarkTestData.cs @@ -9,8 +9,8 @@ namespace HostsParser.Benchmarks { internal static class BenchmarkTestData { - public static readonly byte[] SourceTestBytes = File.ReadAllBytes("sourcehosts.txt"); - public static readonly byte[] AdGuardTestBytes = File.ReadAllBytes("adguardhosts.txt"); + public static 
readonly byte[] HostsBasedTestBytes = File.ReadAllBytes("hostsbased.txt"); + public static readonly byte[] AdBlockBasedTestBytes = File.ReadAllBytes("adbockbased.txt"); public static readonly Settings Settings = JsonSerializer.Deserialize(File.ReadAllBytes("appsettings.json")); diff --git a/HostsParser.Benchmarks/HostsParser.Benchmarks.csproj b/HostsParser.Benchmarks/HostsParser.Benchmarks.csproj index a7b24440..b96f5135 100644 --- a/HostsParser.Benchmarks/HostsParser.Benchmarks.csproj +++ b/HostsParser.Benchmarks/HostsParser.Benchmarks.csproj @@ -14,10 +14,10 @@ - + Always - + Always diff --git a/HostsParser.Benchmarks/adguardhosts.txt b/HostsParser.Benchmarks/adbockbased.txt similarity index 100% rename from HostsParser.Benchmarks/adguardhosts.txt rename to HostsParser.Benchmarks/adbockbased.txt diff --git a/HostsParser.Benchmarks/appsettings.json b/HostsParser.Benchmarks/appsettings.json index 696aad24..a4e2b196 100644 --- a/HostsParser.Benchmarks/appsettings.json +++ b/HostsParser.Benchmarks/appsettings.json @@ -1,25 +1,29 @@ { - "SourceUri": "https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-gambling-porn/hosts", - "AdGuardUri": "https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt", + "HostsBased": { + "SourceUri": "https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-gambling-porn/hosts", + "SkipLines": [ + "127.0.0.1 localhost", + "127.0.0.1 localhost.localdomain", + "127.0.0.1 local", + "255.255.255.255 broadcasthost", + "::1 localhost", + "::1 ip6-localhost", + "::1 ip6-loopback", + "fe80::1%lo0 localhost", + "ff00::0 ip6-localnet", + "ff00::0 ip6-mcastprefix", + "ff02::1 ip6-allnodes", + "ff02::2 ip6-allrouters", + "ff02::3 ip6-allhosts", + "0.0.0.0 0.0.0.0", + "0.0.0.0 fe #00::0 ip6-localnet", + "0.0.0.0 ff #00::0 ip6-mcastprefix" + ] + }, + "AdBlockBased": { + "SourceUri": "https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt" + }, "ExtraFiltering": false, - "SkipLines": 
[ - "127.0.0.1 localhost", - "127.0.0.1 localhost.localdomain", - "127.0.0.1 local", - "255.255.255.255 broadcasthost", - "::1 localhost", - "::1 ip6-localhost", - "::1 ip6-loopback", - "fe80::1%lo0 localhost", - "ff00::0 ip6-localnet", - "ff00::0 ip6-mcastprefix", - "ff02::1 ip6-allnodes", - "ff02::2 ip6-allrouters", - "ff02::3 ip6-allhosts", - "0.0.0.0 0.0.0.0", - "0.0.0.0 fe #00::0 ip6-localnet", - "0.0.0.0 ff #00::0 ip6-mcastprefix" - ], "HeaderLines": [ "! Copyright Henrik Widlund https://github.com/henrikwidlund/HostsParser/blob/main/LICENSE", "! All content below commented lines are based on StevenBlack/hosts and AdGuard DNS filter.", diff --git a/HostsParser.Benchmarks/sourcehosts.txt b/HostsParser.Benchmarks/hostsbased.txt similarity index 100% rename from HostsParser.Benchmarks/sourcehosts.txt rename to HostsParser.Benchmarks/hostsbased.txt diff --git a/HostsParser/HostUtilities.cs b/HostsParser/HostUtilities.cs index 6d582e08..76c269fe 100644 --- a/HostsParser/HostUtilities.cs +++ b/HostsParser/HostUtilities.cs @@ -16,8 +16,8 @@ internal static class HostUtilities { private static readonly Memory Cache = new char[256]; - internal static async Task> ProcessSource(Stream stream, - byte[][] skipLines, + internal static async Task> ProcessHostsBased(Stream stream, + byte[][]? 
skipLines, Decoder decoder) { var pipeReader = PipeReader.Create(stream); @@ -26,7 +26,7 @@ internal static async Task> ProcessSource(Stream stream, return dnsList; } - internal static async Task> ProcessAdGuard(Stream stream, + internal static async Task> ProcessAdBlockBased(Stream stream, Decoder decoder) { var pipeReader = PipeReader.Create(stream); @@ -109,12 +109,12 @@ private static void ProcessLine(in ReadOnlySequence slice, Decoder decoder) { if (skipLines == null) - ProcessAdGuardLine(slice, resultCollection, decoder); + ProcessAdBlockBasedLine(slice, resultCollection, decoder); else - ProcessSourceLine(slice, resultCollection, skipLines, decoder); + ProcessHostsBasedLine(slice, resultCollection, skipLines, decoder); } - private static void ProcessSourceLine(in ReadOnlySequence slice, + private static void ProcessHostsBasedLine(in ReadOnlySequence slice, ICollection resultCollection, byte[][] skipLines, Decoder decoder) @@ -128,7 +128,7 @@ private static void ProcessSourceLine(in ReadOnlySequence slice, if (realSlice[0] == Constants.HashSign) return; - if (SourceShouldSkipLine(realSlice, skipLines)) + if (HostsBasedShouldSkipLine(realSlice, skipLines)) return; realSlice = HandleWwwPrefix(realSlice); @@ -140,7 +140,7 @@ private static void ProcessSourceLine(in ReadOnlySequence slice, resultCollection.Add(Cache.Span[..realSlice.Length].Trim().ToString()); } - private static void ProcessAdGuardLine(in ReadOnlySequence slice, + private static void ProcessAdBlockBasedLine(in ReadOnlySequence slice, ICollection resultCollection, Decoder decoder) { @@ -150,7 +150,7 @@ private static void ProcessAdGuardLine(in ReadOnlySequence slice, if (realSlice.IsEmpty) return; - if (AdGuardShouldSkipLine(realSlice)) + if (AdBlockBasedShouldSkipLine(realSlice)) return; realSlice = HandlePipe(realSlice); @@ -162,7 +162,7 @@ private static void ProcessAdGuardLine(in ReadOnlySequence slice, resultCollection.Add(Cache.Span[..realSlice.Length].ToString()); } - private static bool 
SourceShouldSkipLine(in ReadOnlySpan bytes, + private static bool HostsBasedShouldSkipLine(in ReadOnlySpan bytes, byte[][] skipLines) { if (TrimStart(bytes)[0] == Constants.HashSign) @@ -178,7 +178,7 @@ private static bool SourceShouldSkipLine(in ReadOnlySpan bytes, } [MethodImpl(MethodImplOptions.AggressiveInlining)] - private static bool AdGuardShouldSkipLine(in ReadOnlySpan current) + private static bool AdBlockBasedShouldSkipLine(in ReadOnlySpan current) => current[0] != Constants.PipeSign; private static ReadOnlySpan TrimStart(in this ReadOnlySpan span) diff --git a/HostsParser/Program.cs b/HostsParser/Program.cs index 7d07db66..a93fa54b 100644 --- a/HostsParser/Program.cs +++ b/HostsParser/Program.cs @@ -44,29 +44,29 @@ internal static async Task Main() var decoder = Encoding.UTF8.GetDecoder(); using var httpClient = new HttpClient(); - var stream = await httpClient.GetStreamAsync(settings.SourceUri); - var sourceLines = await HostUtilities.ProcessSource(stream, settings.SkipLinesBytes, decoder); + var stream = await httpClient.GetStreamAsync(settings.HostsBased.SourceUri); + var hostsBasedLines = await HostUtilities.ProcessHostsBased(stream, settings.HostsBased.SkipLinesBytes, decoder); await stream.DisposeAsync(); - stream = await httpClient.GetStreamAsync(settings.AdGuardUri); - var adGuardLines = await HostUtilities.ProcessAdGuard(stream, decoder); + stream = await httpClient.GetStreamAsync(settings.AdBlockBased.SourceUri); + var adBlockBasedLines = await HostUtilities.ProcessAdBlockBased(stream, decoder); await stream.DisposeAsync(); - var combined = sourceLines; - combined.ExceptWith(adGuardLines); + var combined = hostsBasedLines; + combined.ExceptWith(adBlockBasedLines); combined = HostUtilities.RemoveKnownBadHosts(settings.KnownBadHosts, combined); combined.UnionWith(settings.KnownBadHosts); - combined.UnionWith(adGuardLines); + combined.UnionWith(adBlockBasedLines); CollectionUtilities.FilterGrouped(combined); var sortedDnsList = 
CollectionUtilities.SortDnsList(combined); HashSet filtered = new(combined.Count); - sortedDnsList = ProcessCombined(sortedDnsList, adGuardLines, filtered); + sortedDnsList = ProcessCombined(sortedDnsList, adBlockBasedLines, filtered); if (settings.ExtraFiltering) { logger.LogInformation(WithTimeStamp("Start extra filtering of duplicates")); - sortedDnsList = ProcessWithExtraFiltering(adGuardLines, sortedDnsList, filtered); + sortedDnsList = ProcessWithExtraFiltering(adBlockBasedLines, sortedDnsList, filtered); logger.LogInformation(WithTimeStamp("Done extra filtering of duplicates")); } @@ -91,11 +91,11 @@ internal static async Task Main() static string WithTimeStamp(string message) => $"{DateTime.Now:yyyy-MM-dd HH:mm:ss} - {message}"; } - private static List ProcessWithExtraFiltering(HashSet adGuardLines, + private static List ProcessWithExtraFiltering(HashSet adBlockBasedLines, List combined, HashSet filtered) { - Parallel.ForEach(CollectionUtilities.SortDnsList(adGuardLines), item => + Parallel.ForEach(CollectionUtilities.SortDnsList(adBlockBasedLines), item => { for (var i = 0; i < combined.Count; i++) { @@ -111,7 +111,7 @@ private static List ProcessWithExtraFiltering(HashSet adGuardLin private static List ProcessCombined( List combined, - HashSet adGuardLines, + HashSet adBlockBasedLines, HashSet filtered) { var round = 0; @@ -130,7 +130,7 @@ private static List ProcessCombined( }); if (round == 1) - combined.RemoveAll(adGuardLines.Contains); + combined.RemoveAll(adBlockBasedLines.Contains); combined.RemoveAll(filtered.Contains); combined = CollectionUtilities.SortDnsList(combined); diff --git a/HostsParser/Settings.cs b/HostsParser/Settings.cs index 3f048230..dee4b437 100644 --- a/HostsParser/Settings.cs +++ b/HostsParser/Settings.cs @@ -7,14 +7,17 @@ namespace HostsParser { - internal record Settings( - Uri SourceUri, - Uri AdGuardUri, - string[] SkipLines, + internal sealed record Settings( + SourceEntry HostsBased, + SourceEntry AdBlockBased, string[] 
HeaderLines, string[] KnownBadHosts, - bool ExtraFiltering) + bool ExtraFiltering); + + internal sealed record SourceEntry( + Uri SourceUri, + string[]? SkipLines) { - internal byte[][] SkipLinesBytes = SkipLines.Select(s => Encoding.UTF8.GetBytes(s)).ToArray(); + internal byte[][]? SkipLinesBytes = SkipLines?.Select(s => Encoding.UTF8.GetBytes(s)).ToArray(); } } diff --git a/HostsParser/appsettings.json b/HostsParser/appsettings.json index 696aad24..a4e2b196 100644 --- a/HostsParser/appsettings.json +++ b/HostsParser/appsettings.json @@ -1,25 +1,29 @@ { - "SourceUri": "https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-gambling-porn/hosts", - "AdGuardUri": "https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt", + "HostsBased": { + "SourceUri": "https://raw.githubusercontent.com/StevenBlack/hosts/master/alternates/fakenews-gambling-porn/hosts", + "SkipLines": [ + "127.0.0.1 localhost", + "127.0.0.1 localhost.localdomain", + "127.0.0.1 local", + "255.255.255.255 broadcasthost", + "::1 localhost", + "::1 ip6-localhost", + "::1 ip6-loopback", + "fe80::1%lo0 localhost", + "ff00::0 ip6-localnet", + "ff00::0 ip6-mcastprefix", + "ff02::1 ip6-allnodes", + "ff02::2 ip6-allrouters", + "ff02::3 ip6-allhosts", + "0.0.0.0 0.0.0.0", + "0.0.0.0 fe #00::0 ip6-localnet", + "0.0.0.0 ff #00::0 ip6-mcastprefix" + ] + }, + "AdBlockBased": { + "SourceUri": "https://adguardteam.github.io/AdGuardSDNSFilter/Filters/filter.txt" + }, "ExtraFiltering": false, - "SkipLines": [ - "127.0.0.1 localhost", - "127.0.0.1 localhost.localdomain", - "127.0.0.1 local", - "255.255.255.255 broadcasthost", - "::1 localhost", - "::1 ip6-localhost", - "::1 ip6-loopback", - "fe80::1%lo0 localhost", - "ff00::0 ip6-localnet", - "ff00::0 ip6-mcastprefix", - "ff02::1 ip6-allnodes", - "ff02::2 ip6-allrouters", - "ff02::3 ip6-allhosts", - "0.0.0.0 0.0.0.0", - "0.0.0.0 fe #00::0 ip6-localnet", - "0.0.0.0 ff #00::0 ip6-mcastprefix" - ], "HeaderLines": [ "! 
Copyright Henrik Widlund https://github.com/henrikwidlund/HostsParser/blob/main/LICENSE", "! All content below commented lines are based on StevenBlack/hosts and AdGuard DNS filter.", From 81aaa136c82b3987bbfcf04c9dd83ceeb29947ac Mon Sep 17 00:00:00 2001 From: Henrik Widlund <4659350+henrikwidlund@users.noreply.github.com> Date: Tue, 3 Aug 2021 15:24:30 +0200 Subject: [PATCH 2/3] Update README.md --- README.md | 57 +++++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 554a558a..37097de7 100644 --- a/README.md +++ b/README.md @@ -1,20 +1,65 @@ # HostsParser [![Build/Publish](https://github.com/henrikwidlund/HostsParser/actions/workflows/publish-hosts.yml/badge.svg)](https://github.com/henrikwidlund/HostsParser/actions/workflows/publish-hosts.yml) -Converts [StevenBlack/hosts](https://github.com/StevenBlack/hosts) with fakenews, gambling and porn extensions into the adblock format, optimized for [AdGuard Home](https://github.com/AdguardTeam/AdGuardHome). It also removes duplicates, hosts that are already blocked by [AdGuard DNS Filter](https://github.com/AdguardTeam/AdGuardSDNSFilter) and most comments that are used to indicate different sections in the source. +Converts a `hosts` ([`HostsBased`](#hostsbased)) based file into a `AdBlock` formatted file, optimized for [AdGuard Home](https://github.com/AdguardTeam/AdGuardHome). +It also removes duplicates, comments as well as hosts that are already blocked by a different `AdBlock` ([`AdBlockBased`](#adblockbased)) based file. -## Pre-built host +By default [StevenBlack/hosts](https://github.com/StevenBlack/hosts) with fakenews, gambling and porn extensions is processed to exclude entries already covered by [AdGuard DNS Filter](https://github.com/AdguardTeam/AdGuardSDNSFilter). + +## Using in AdGuard Home +1. 
Make sure that `AdAway Default Blocklist` (or any custom `AdBlock` formatted file referenced when running the program) is active in DNS blocklists for your AdGuard Home instance. +2. Copy the link to the [Pre-built host](#pre-built-host) and add it to your DNS blocklists as a custom list in your AdGuard Home instance. + +Please refer to the [AdGuard Home](https://github.com/AdguardTeam/AdGuardHome) repository for further instructions on how to use DNS blocklists. + +**Note** If you've generated your own file, the Pre-built host link should be replaced by the address to where you're hosting it. + +### Pre-built host The hosts file is generated every six hours and is available for download [here](https://henrikwidlund.github.io/HostsParser/hosts). ## Building -*You'll need the [dotnet 6 SDK](https://dotnet.microsoft.com/download).* +### Prerequisites +[dotnet 6 SDK](https://dotnet.microsoft.com/download). -Run `dotnet build --configuration Release` from the directory you cloned the repository to. +Run the following from the directory you cloned the repository to: +```sh +cd HostsParser +dotnet build --configuration Release +``` ## Running -*You'll need the [dotnet 6 runtime](https://dotnet.microsoft.com/download).* +### Prerequisites +1. [dotnet 6 runtime](https://dotnet.microsoft.com/download). +2. Downloaded binaries or binaries built from source. + +Run the following (if you built from source, this will be in `HostsParser/bin/Release/net6.0`): +```sh +dotnet HostsParser.dll +``` + +The program creates a `hosts` file in the same directory. + +## Configuration +You may adjust the configuration of the application by modifying the `appsettings.json` file. 
+ +| Property | Type | Required | Description | +|---|---|---|---| +|[`HostsBased`](#hostsbased)|`object`|`true`|Settings used for processing a hosts formatted source.| +|[`AdBlockBased`](#adblockbased)|`object`|`true`|Settings used for processing an AdBlock formatted source.| +|`ExtraFiltering`|`bool`|`true`|Setting to indicate if extra filtering should be performed.
If `true`, the program will check each element in the result against every other element and remove any entry that would be blocked by a more general entry.| +|`HeaderLines`|`string[]`|`true`|Defines a set of lines that will be inserted at the top of the generated file, for example copyright.| +|`KnownBadHosts`|`string[]`|`true`|Array of unwanted hosts. These entries will be added to the result if they're not covered by the `AdBlockBased` entries.
You can also add generalized hosts to reduce the number of entries in the final results.
For example: `HostsBased` results might contain `a.baddomain.com` and `b.baddomain.com`, adding `baddomain.com` will remove the sub domain entries and block `baddomain.com` and all of its subdomains.| + +### `HostsBased` +| Property | Type | Required | Description | +|---|---|---|---| +|`SourceUri`|`Uri`|`true`|URI to the hosts based file| +|`SkipLines`|`string[]`|`true`|Array of strings that, if present in the result from `SourceUri` will be filtered out.| -Run `dotnet HostsParser.dll`. Program creates a `hosts` file in the same directory. +### `AdBlockBased` +| Property | Type | Required | Description | +|---|---|---|---| +|`SourceUri`|`Uri`|`true`|URI to the AdBlock based file| ## Licenses - [License](LICENSE) From 81382b2673d069b2f8ce3c8b3708ebf840d936b2 Mon Sep 17 00:00:00 2001 From: Henrik Widlund <4659350+henrikwidlund@users.noreply.github.com> Date: Tue, 3 Aug 2021 16:24:15 +0200 Subject: [PATCH 3/3] Refactor and documentation --- HostsParser/CollectionUtilities.cs | 35 +++++++--- HostsParser/HostUtilities.cs | 23 +++++++ HostsParser/Program.cs | 82 +++++++++++++---------- HostsParser/ReadOnlyMemoryCharComparer.cs | 6 ++ HostsParser/Settings.cs | 22 ++++++ 5 files changed, 121 insertions(+), 47 deletions(-) diff --git a/HostsParser/CollectionUtilities.cs b/HostsParser/CollectionUtilities.cs index 30e91912..37f5628a 100644 --- a/HostsParser/CollectionUtilities.cs +++ b/HostsParser/CollectionUtilities.cs @@ -10,10 +10,14 @@ namespace HostsParser { internal static class CollectionUtilities { - internal static List SortDnsList(ICollection dnsList) + /// + /// Sorts by domain and length. + /// + /// The collection to sort. 
+ internal static List SortDnsList(ICollection dnsCollection) { - List list = new(dnsList.Count); - list.AddRange(dnsList + List list = new(dnsCollection.Count); + list.AddRange(dnsCollection .Select(d => new StringSortItem(d)) .OrderBy(l => GetTopMostDns(l.RawMemory), ReadOnlyMemoryCharComparer.Default) .ThenBy(l => l.RawMemory.Length) @@ -22,12 +26,15 @@ internal static List SortDnsList(ICollection dnsList) return list; } - internal static void FilterGrouped(HashSet dnsList) + /// + /// Filters out all sub domains from for which a domain is contained. + /// + internal static void FilterGrouped(HashSet dnsCollection) { - var cacheHashSet = CreateCacheHashSet(dnsList); + var cacheHashSet = CreateCacheHashSet(dnsCollection); - var dnsGroups = GroupDnsList(dnsList); - HashSet filtered = new(dnsList.Count); + var dnsGroups = GroupDnsList(dnsCollection); + HashSet filtered = new(dnsCollection.Count); foreach (var (key, value) in dnsGroups) { if (!cacheHashSet.Contains(key) @@ -43,13 +50,19 @@ internal static void FilterGrouped(HashSet dnsList) } } - dnsList.ExceptWith(filtered); + dnsCollection.ExceptWith(filtered); } - internal static Dictionary> GroupDnsList(HashSet dnsList) + /// + /// Groups into a dictionary where the key is the main domain + /// and value is a list of found sub domains. + /// + /// The collection used for grouping. 
+ /// + internal static Dictionary> GroupDnsList(HashSet dnsCollection) { - var dict = new Dictionary>(dnsList.Count); - foreach (var s in dnsList) + var dict = new Dictionary>(dnsCollection.Count); + foreach (var s in dnsCollection) { var key = string.GetHashCode(GetTopMostDns(s)); List values; diff --git a/HostsParser/HostUtilities.cs b/HostsParser/HostUtilities.cs index 76c269fe..1c01cbae 100644 --- a/HostsParser/HostUtilities.cs +++ b/HostsParser/HostUtilities.cs @@ -16,25 +16,43 @@ internal static class HostUtilities { private static readonly Memory Cache = new char[256]; + /// + /// Reads the and returns a collection based on the items in it. + /// + /// The to process. + /// The lines that should be excluded from the returned result. + /// The used when converting the bytes in . internal static async Task> ProcessHostsBased(Stream stream, byte[][]? skipLines, Decoder decoder) { var pipeReader = PipeReader.Create(stream); + // Assumed length to reduce allocations var dnsList = new HashSet(140_000); await ReadPipeAsync(pipeReader, dnsList, skipLines, decoder); return dnsList; } + /// + /// Reads the and returns a collection based on the items in it. + /// + /// The to process. + /// The used when converting the bytes in . internal static async Task> ProcessAdBlockBased(Stream stream, Decoder decoder) { var pipeReader = PipeReader.Create(stream); + // Assumed length to reduce allocations var dnsList = new HashSet(50_000); await ReadPipeAsync(pipeReader, dnsList, null, decoder); return dnsList; } + /// + /// Removes all sub domains to the entries in from the . + /// + /// Array of hosts used for removing sub domains. + /// The collection of hosts that sub domains should be removed from. internal static HashSet RemoveKnownBadHosts(string[] knownBadHosts, HashSet hosts) { @@ -58,6 +76,11 @@ internal static HashSet RemoveKnownBadHosts(string[] knownBadHosts, return hosts; } + /// + /// Checks if is a sub domain of . + /// + /// The potential sub domain. 
+ /// The potential domain. [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static bool IsSubDomainOf(in ReadOnlySpan potentialSubDomain, in ReadOnlySpan potentialDomain) diff --git a/HostsParser/Program.cs b/HostsParser/Program.cs index a93fa54b..5931dae7 100644 --- a/HostsParser/Program.cs +++ b/HostsParser/Program.cs @@ -60,13 +60,13 @@ internal static async Task Main() CollectionUtilities.FilterGrouped(combined); var sortedDnsList = CollectionUtilities.SortDnsList(combined); - HashSet filtered = new(combined.Count); - sortedDnsList = ProcessCombined(sortedDnsList, adBlockBasedLines, filtered); + HashSet filteredCache = new(combined.Count); + sortedDnsList = ProcessCombined(sortedDnsList, adBlockBasedLines, filteredCache); if (settings.ExtraFiltering) { logger.LogInformation(WithTimeStamp("Start extra filtering of duplicates")); - sortedDnsList = ProcessWithExtraFiltering(adBlockBasedLines, sortedDnsList, filtered); + sortedDnsList = ProcessWithExtraFiltering(adBlockBasedLines, sortedDnsList, filteredCache); logger.LogInformation(WithTimeStamp("Done extra filtering of duplicates")); } @@ -90,62 +90,72 @@ internal static async Task Main() static string WithTimeStamp(string message) => $"{DateTime.Now:yyyy-MM-dd HH:mm:ss} - {message}"; } - - private static List ProcessWithExtraFiltering(HashSet adBlockBasedLines, - List combined, - HashSet filtered) - { - Parallel.ForEach(CollectionUtilities.SortDnsList(adBlockBasedLines), item => - { - for (var i = 0; i < combined.Count; i++) - { - var localItem = combined[i]; - if (HostUtilities.IsSubDomainOf(localItem, item)) - filtered.Add(localItem); - } - }); - combined.RemoveAll(filtered.Contains); - combined = CollectionUtilities.SortDnsList(combined); - return combined; - } - + private static List ProcessCombined( - List combined, + List sortedDnsList, HashSet adBlockBasedLines, - HashSet filtered) + HashSet filteredCache) { var round = 0; do { - filtered.Clear(); + filteredCache.Clear(); + // Increase the 
number of items processed in each run since we'll have fewer items to loop and they'll be further apart. var lookBack = ++round * 250; - Parallel.For(0, combined.Count, i => + Parallel.For(0, sortedDnsList.Count, i => { for (var j = (i < lookBack ? 0 : i - lookBack); j < i; j++) { - var item = combined[i]; - var otherItem = combined[j]; - AddIfSubDomain(filtered, item, otherItem); + var item = sortedDnsList[i]; + var otherItem = sortedDnsList[j]; + AddIfSubDomain(filteredCache, item, otherItem); } }); + // We only need to check for domains/sub domains covered by AdBlock based file + // on first run, after that sub domains covered by AdBlock based file will be gone + // and we don't want to process unnecessary entries or produce a file containing + // lines contained in the AdBlock based file if (round == 1) - combined.RemoveAll(adBlockBasedLines.Contains); + sortedDnsList.RemoveAll(adBlockBasedLines.Contains); - combined.RemoveAll(filtered.Contains); - combined = CollectionUtilities.SortDnsList(combined); - } while (filtered.Count > 0); + sortedDnsList.RemoveAll(filteredCache.Contains); + sortedDnsList = CollectionUtilities.SortDnsList(sortedDnsList); + } while (filteredCache.Count > 0); - return combined; + return sortedDnsList; + } + + /// + /// Removes sub domains covered by a main domain in by looping over + /// all items in and check if any other item in + /// is a sub domain of it. 
+ /// + private static List ProcessWithExtraFiltering(HashSet adBlockBasedLines, + List sortedDnsList, + HashSet filteredCache) + { + Parallel.ForEach(CollectionUtilities.SortDnsList(adBlockBasedLines), item => + { + for (var i = 0; i < sortedDnsList.Count; i++) + { + var localItem = sortedDnsList[i]; + if (HostUtilities.IsSubDomainOf(localItem, item)) + filteredCache.Add(localItem); + } + }); + sortedDnsList.RemoveAll(filteredCache.Contains); + sortedDnsList = CollectionUtilities.SortDnsList(sortedDnsList); + return sortedDnsList; } - private static void AddIfSubDomain(HashSet filtered, + private static void AddIfSubDomain(HashSet filteredCache, string item, string otherItem) { if (ShouldSkip(otherItem, item)) return; if (HostUtilities.IsSubDomainOf(item, otherItem)) - filtered.Add(item); + filteredCache.Add(item); } private static bool ShouldSkip(string otherItem, diff --git a/HostsParser/ReadOnlyMemoryCharComparer.cs b/HostsParser/ReadOnlyMemoryCharComparer.cs index 22c8dc9b..968a0bbf 100644 --- a/HostsParser/ReadOnlyMemoryCharComparer.cs +++ b/HostsParser/ReadOnlyMemoryCharComparer.cs @@ -6,8 +6,14 @@ namespace HostsParser { + /// + /// Comparer for -based . + /// public sealed class ReadOnlyMemoryCharComparer : IComparer> { + /// + /// Default instance of . + /// public static readonly ReadOnlyMemoryCharComparer Default = new(); public int Compare(ReadOnlyMemory x, ReadOnlyMemory y) diff --git a/HostsParser/Settings.cs b/HostsParser/Settings.cs index dee4b437..388852bd 100644 --- a/HostsParser/Settings.cs +++ b/HostsParser/Settings.cs @@ -7,6 +7,23 @@ namespace HostsParser { + /// + /// Object used at runtime to represent settings specified in appsettings.json. + /// + /// Settings used for processing a hosts formatted source. + /// Settings used for processing a AdBlock formatted source. + /// Setting to indicate if extra filtering should be performed. 
+ /// If , the program will check each element in the result against each other + /// and remove any entry that would be blocked by a more general entry. + /// + /// Defines a set of lines that will be inserted at + /// the top of the generated file, for example copyright. + /// Array of unwanted hosts. These entries will be added to the result + /// if they're not covered by the AdBlockBased entries. + /// You can also add generalized hosts to reduce the number of entries in final results. + /// HostsBased results might contain a.baddomain.com and b.baddomain.com, adding baddomain.com + /// will remove the sub domain entries and block baddomain.com and all of its subdomains. + /// internal sealed record Settings( SourceEntry HostsBased, SourceEntry AdBlockBased, @@ -14,6 +31,11 @@ internal sealed record Settings( string[] KnownBadHosts, bool ExtraFiltering); + /// + /// Settings used for processing a hosts or AdBlock formatted source. + /// + /// The containing the hosts. + /// Array of strings that, if present in the result from will be filtered out. internal sealed record SourceEntry( Uri SourceUri, string[]? SkipLines)