diff --git a/HostsParser/CollectionUtilities.cs b/HostsParser/CollectionUtilities.cs index 30e91912..37f5628a 100644 --- a/HostsParser/CollectionUtilities.cs +++ b/HostsParser/CollectionUtilities.cs @@ -10,10 +10,14 @@ namespace HostsParser { internal static class CollectionUtilities { - internal static List SortDnsList(ICollection dnsList) + /// + /// Sorts by domain and length. + /// + /// The collection to sort. + internal static List SortDnsList(ICollection dnsCollection) { - List list = new(dnsList.Count); - list.AddRange(dnsList + List list = new(dnsCollection.Count); + list.AddRange(dnsCollection .Select(d => new StringSortItem(d)) .OrderBy(l => GetTopMostDns(l.RawMemory), ReadOnlyMemoryCharComparer.Default) .ThenBy(l => l.RawMemory.Length) @@ -22,12 +26,15 @@ internal static List SortDnsList(ICollection dnsList) return list; } - internal static void FilterGrouped(HashSet dnsList) + /// + /// Filters out all sub domains from for which a domain is contained. + /// + internal static void FilterGrouped(HashSet dnsCollection) { - var cacheHashSet = CreateCacheHashSet(dnsList); + var cacheHashSet = CreateCacheHashSet(dnsCollection); - var dnsGroups = GroupDnsList(dnsList); - HashSet filtered = new(dnsList.Count); + var dnsGroups = GroupDnsList(dnsCollection); + HashSet filtered = new(dnsCollection.Count); foreach (var (key, value) in dnsGroups) { if (!cacheHashSet.Contains(key) @@ -43,13 +50,19 @@ internal static void FilterGrouped(HashSet dnsList) } } - dnsList.ExceptWith(filtered); + dnsCollection.ExceptWith(filtered); } - internal static Dictionary> GroupDnsList(HashSet dnsList) + /// + /// Groups into a dictionary where the key is the main domain + /// and value is a list of found sub domains. + /// + /// The collection used for grouping. + /// + internal static Dictionary> GroupDnsList(HashSet dnsCollection) { - var dict = new Dictionary>(dnsList.Count); - foreach (var s in dnsList) + var dict = new Dictionary>(dnsCollection.Count); + foreach (var s in dnsCollection) { var key = string.GetHashCode(GetTopMostDns(s)); List values; diff --git a/HostsParser/HostUtilities.cs b/HostsParser/HostUtilities.cs index 76c269fe..1c01cbae 100644 --- a/HostsParser/HostUtilities.cs +++ b/HostsParser/HostUtilities.cs @@ -16,25 +16,43 @@ internal static class HostUtilities { private static readonly Memory Cache = new char[256]; + /// + /// Reads the and returns a collection based on the items in it. + /// + /// The to process. + /// The lines that should be excluded from the returned result. + /// The used when converting the bytes in . internal static async Task> ProcessHostsBased(Stream stream, byte[][]? skipLines, Decoder decoder) { var pipeReader = PipeReader.Create(stream); + // Assumed length to reduce allocations var dnsList = new HashSet(140_000); await ReadPipeAsync(pipeReader, dnsList, skipLines, decoder); return dnsList; } + /// + /// Reads the and returns a collection based on the items in it. + /// + /// The to process. + /// The used when converting the bytes in . internal static async Task> ProcessAdBlockBased(Stream stream, Decoder decoder) { var pipeReader = PipeReader.Create(stream); + // Assumed length to reduce allocations var dnsList = new HashSet(50_000); await ReadPipeAsync(pipeReader, dnsList, null, decoder); return dnsList; } + /// + /// Removes all sub domains to the entries in from the . + /// + /// Array of hosts used for removing sub domains. + /// The collection of hosts that sub domains should be removed from. internal static HashSet RemoveKnownBadHosts(string[] knownBadHosts, HashSet hosts) { @@ -58,6 +76,11 @@ internal static HashSet RemoveKnownBadHosts(string[] knownBadHosts, return hosts; } + /// + /// Checks if is a sub domain of . + /// + /// The potential sub domain. + /// The potential domain. [MethodImpl(MethodImplOptions.AggressiveInlining)] internal static bool IsSubDomainOf(in ReadOnlySpan potentialSubDomain, in ReadOnlySpan potentialDomain) diff --git a/HostsParser/Program.cs b/HostsParser/Program.cs index a93fa54b..5931dae7 100644 --- a/HostsParser/Program.cs +++ b/HostsParser/Program.cs @@ -60,13 +60,13 @@ internal static async Task Main() CollectionUtilities.FilterGrouped(combined); var sortedDnsList = CollectionUtilities.SortDnsList(combined); - HashSet filtered = new(combined.Count); - sortedDnsList = ProcessCombined(sortedDnsList, adBlockBasedLines, filtered); + HashSet filteredCache = new(combined.Count); + sortedDnsList = ProcessCombined(sortedDnsList, adBlockBasedLines, filteredCache); if (settings.ExtraFiltering) { logger.LogInformation(WithTimeStamp("Start extra filtering of duplicates")); - sortedDnsList = ProcessWithExtraFiltering(adBlockBasedLines, sortedDnsList, filtered); + sortedDnsList = ProcessWithExtraFiltering(adBlockBasedLines, sortedDnsList, filteredCache); logger.LogInformation(WithTimeStamp("Done extra filtering of duplicates")); } @@ -90,62 +90,72 @@ internal static async Task Main() static string WithTimeStamp(string message) => $"{DateTime.Now:yyyy-MM-dd HH:mm:ss} - {message}"; } - - private static List ProcessWithExtraFiltering(HashSet adBlockBasedLines, - List combined, - HashSet filtered) - { - Parallel.ForEach(CollectionUtilities.SortDnsList(adBlockBasedLines), item => - { - for (var i = 0; i < combined.Count; i++) - { - var localItem = combined[i]; - if (HostUtilities.IsSubDomainOf(localItem, item)) - filtered.Add(localItem); - } - }); - combined.RemoveAll(filtered.Contains); - combined = CollectionUtilities.SortDnsList(combined); - return combined; - } - + private static List ProcessCombined( - List combined, + List sortedDnsList, HashSet adBlockBasedLines, - HashSet filtered) + HashSet filteredCache) { var round = 0; do { - filtered.Clear(); + filteredCache.Clear(); + // Increase the number of items processed in each run since we'll have fewer items to loop and they'll be further apart. var lookBack = ++round * 250; - Parallel.For(0, combined.Count, i => + Parallel.For(0, sortedDnsList.Count, i => { for (var j = (i < lookBack ? 0 : i - lookBack); j < i; j++) { - var item = combined[i]; - var otherItem = combined[j]; - AddIfSubDomain(filtered, item, otherItem); + var item = sortedDnsList[i]; + var otherItem = sortedDnsList[j]; + AddIfSubDomain(filteredCache, item, otherItem); } }); + // We only need to check for domains/sub domains covered by AdBlock based file + // on first run, after that sub domains covered by AdBlock based file will be gone + // and we don't want to process unnecessary entries or produce a file containing + // lines contained in the AdBlock based file if (round == 1) - combined.RemoveAll(adBlockBasedLines.Contains); + sortedDnsList.RemoveAll(adBlockBasedLines.Contains); - combined.RemoveAll(filtered.Contains); - combined = CollectionUtilities.SortDnsList(combined); - } while (filtered.Count > 0); + sortedDnsList.RemoveAll(filteredCache.Contains); + sortedDnsList = CollectionUtilities.SortDnsList(sortedDnsList); + } while (filteredCache.Count > 0); - return combined; + return sortedDnsList; + } + + /// + /// Removes sub domains covered by a main domain in by looping over + /// all items in and check if any other item in + /// is a sub domain of it. + /// + private static List ProcessWithExtraFiltering(HashSet adBlockBasedLines, + List sortedDnsList, + HashSet filteredCache) + { + Parallel.ForEach(CollectionUtilities.SortDnsList(adBlockBasedLines), item => + { + for (var i = 0; i < sortedDnsList.Count; i++) + { + var localItem = sortedDnsList[i]; + if (HostUtilities.IsSubDomainOf(localItem, item)) + filteredCache.Add(localItem); + } + }); + sortedDnsList.RemoveAll(filteredCache.Contains); + sortedDnsList = CollectionUtilities.SortDnsList(sortedDnsList); + return sortedDnsList; } - private static void AddIfSubDomain(HashSet filtered, + private static void AddIfSubDomain(HashSet filteredCache, string item, string otherItem) { if (ShouldSkip(otherItem, item)) return; if (HostUtilities.IsSubDomainOf(item, otherItem)) - filtered.Add(item); + filteredCache.Add(item); } private static bool ShouldSkip(string otherItem, diff --git a/HostsParser/ReadOnlyMemoryCharComparer.cs b/HostsParser/ReadOnlyMemoryCharComparer.cs index 22c8dc9b..968a0bbf 100644 --- a/HostsParser/ReadOnlyMemoryCharComparer.cs +++ b/HostsParser/ReadOnlyMemoryCharComparer.cs @@ -6,8 +6,14 @@ namespace HostsParser { + /// + /// Comparer for -based . + /// public sealed class ReadOnlyMemoryCharComparer : IComparer> { + /// + /// Default instance of . + /// public static readonly ReadOnlyMemoryCharComparer Default = new(); public int Compare(ReadOnlyMemory x, ReadOnlyMemory y) diff --git a/HostsParser/Settings.cs b/HostsParser/Settings.cs index dee4b437..388852bd 100644 --- a/HostsParser/Settings.cs +++ b/HostsParser/Settings.cs @@ -7,6 +7,23 @@ namespace HostsParser { + /// + /// Object used at runtime to represent settings specified in appsettings.json. + /// + /// Settings used for processing a hosts formatted source. + /// Settings used for processing a AdBlock formatted source. + /// Setting to indicate if extra filtering should be performed. + /// If , the program will check each element in the result against each other + /// and remove any entry that would be blocked by a more general entry. + /// + /// Defines a set of lines that will be inserted at + /// the top of the generated file, for example copyright. + /// Array of unwanted hosts. These entries will be added to the result + /// if they're not covered by the AdBlockBased entries. + /// You can also add generalized hosts to reduce the number of entries in final results. + /// HostsBased results might contain a.baddomain.com and b.baddomain.com, adding baddomain.com + /// will remove the sub domain entries and block baddomain.com and all of its subdomains. + /// internal sealed record Settings( SourceEntry HostsBased, SourceEntry AdBlockBased, @@ -14,6 +31,11 @@ internal sealed record Settings( string[] KnownBadHosts, bool ExtraFiltering); + /// + /// Settings used for processing a hosts or AdBlock formatted source. + /// + /// The containing the hosts. + /// Array of strings that, if present in the result from will be filtered out. internal sealed record SourceEntry( Uri SourceUri, string[]? SkipLines)