Refactor and documentation
henrikwidlund committed Aug 3, 2021
1 parent 81aaa13 commit 81382b2
Showing 5 changed files with 121 additions and 47 deletions.
35 changes: 24 additions & 11 deletions HostsParser/CollectionUtilities.cs
@@ -10,10 +10,14 @@ namespace HostsParser
{
internal static class CollectionUtilities
{
internal static List<string> SortDnsList(ICollection<string> dnsList)
/// <summary>
/// Sorts <paramref name="dnsCollection"/> by top-most domain and then by length.
/// </summary>
/// <param name="dnsCollection">The collection to sort.</param>
internal static List<string> SortDnsList(ICollection<string> dnsCollection)
{
List<string> list = new(dnsList.Count);
list.AddRange(dnsList
List<string> list = new(dnsCollection.Count);
list.AddRange(dnsCollection
.Select(d => new StringSortItem(d))
.OrderBy(l => GetTopMostDns(l.RawMemory), ReadOnlyMemoryCharComparer.Default)
.ThenBy(l => l.RawMemory.Length)
@@ -22,12 +26,15 @@ internal static List<string> SortDnsList(ICollection<string> dnsList)
return list;
}

internal static void FilterGrouped(HashSet<string> dnsList)
/// <summary>
/// Filters out from <paramref name="dnsCollection"/> all sub domains whose parent domain is also contained in the collection.
/// </summary>
internal static void FilterGrouped(HashSet<string> dnsCollection)
{
var cacheHashSet = CreateCacheHashSet(dnsList);
var cacheHashSet = CreateCacheHashSet(dnsCollection);

var dnsGroups = GroupDnsList(dnsList);
HashSet<string> filtered = new(dnsList.Count);
var dnsGroups = GroupDnsList(dnsCollection);
HashSet<string> filtered = new(dnsCollection.Count);
foreach (var (key, value) in dnsGroups)
{
if (!cacheHashSet.Contains(key)
@@ -43,13 +50,19 @@ internal static void FilterGrouped(HashSet<string> dnsList)
}
}

dnsList.ExceptWith(filtered);
dnsCollection.ExceptWith(filtered);
}

internal static Dictionary<int, List<string>> GroupDnsList(HashSet<string> dnsList)
/// <summary>
/// Groups <paramref name="dnsCollection"/> into a dictionary where the key is the hash code of the
/// top-most domain and the value is a list of the sub domains found for it.
/// </summary>
/// <param name="dnsCollection">The collection used for grouping.</param>
/// <returns>A dictionary keyed by the hash code of the top-most domain, with the grouped entries as values.</returns>
internal static Dictionary<int, List<string>> GroupDnsList(HashSet<string> dnsCollection)
{
var dict = new Dictionary<int, List<string>>(dnsList.Count);
foreach (var s in dnsList)
var dict = new Dictionary<int, List<string>>(dnsCollection.Count);
foreach (var s in dnsCollection)
{
var key = string.GetHashCode(GetTopMostDns(s));
List<string> values;
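For context, the grouping relies on a GetTopMostDns helper that is not part of this diff. A minimal sketch of the idea, assuming the top-most domain is simply the last two labels of the host name (the real helper may differ):

using System;
using System.Collections.Generic;

internal static class GroupingSketch
{
    // Assumption: treat the last two labels as the top-most domain.
    private static string GetTopMostDns(string host)
    {
        var labels = host.Split('.');
        return labels.Length <= 2 ? host : $"{labels[^2]}.{labels[^1]}";
    }

    // Mirrors the shape of GroupDnsList: hash of the top-most domain -> entries sharing it.
    internal static Dictionary<int, List<string>> Group(IEnumerable<string> dnsCollection)
    {
        var dict = new Dictionary<int, List<string>>();
        foreach (var host in dnsCollection)
        {
            var key = string.GetHashCode(GetTopMostDns(host).AsSpan());
            if (!dict.TryGetValue(key, out var values))
                dict[key] = values = new List<string>();
            values.Add(host);
        }

        return dict;
    }
}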
23 changes: 23 additions & 0 deletions HostsParser/HostUtilities.cs
@@ -16,25 +16,43 @@ internal static class HostUtilities
{
private static readonly Memory<char> Cache = new char[256];

/// <summary>
/// Reads the <paramref name="stream"/> and returns a collection based on the items in it.
/// </summary>
/// <param name="stream">The <see cref="Stream"/> to process.</param>
/// <param name="skipLines">The lines that should be excluded from the returned result.</param>
/// <param name="decoder">The <see cref="Decoder"/> used when converting the bytes in <paramref name="stream"/>.</param>
internal static async Task<HashSet<string>> ProcessHostsBased(Stream stream,
byte[][]? skipLines,
Decoder decoder)
{
var pipeReader = PipeReader.Create(stream);
// Assumed length to reduce allocations
var dnsList = new HashSet<string>(140_000);
await ReadPipeAsync(pipeReader, dnsList, skipLines, decoder);
return dnsList;
}

/// <summary>
/// Reads the <paramref name="stream"/> and returns a collection based on the items in it.
/// </summary>
/// <param name="stream">The <see cref="Stream"/> to process.</param>
/// <param name="decoder">The <see cref="Decoder"/> used when converting the bytes in <paramref name="stream"/>.</param>
internal static async Task<HashSet<string>> ProcessAdBlockBased(Stream stream,
Decoder decoder)
{
var pipeReader = PipeReader.Create(stream);
// Assumed length to reduce allocations
var dnsList = new HashSet<string>(50_000);
await ReadPipeAsync(pipeReader, dnsList, null, decoder);
return dnsList;
}

/// <summary>
/// Removes all sub domains of the entries in <paramref name="knownBadHosts"/> from <paramref name="hosts"/>.
/// </summary>
/// <param name="knownBadHosts">Array of hosts used for removing sub domains.</param>
/// <param name="hosts">The collection of hosts that sub domains should be removed from.</param>
internal static HashSet<string> RemoveKnownBadHosts(string[] knownBadHosts,
HashSet<string> hosts)
{
@@ -58,6 +76,11 @@ internal static HashSet<string> RemoveKnownBadHosts(string[] knownBadHosts,
return hosts;
}

/// <summary>
/// Checks if <paramref name="potentialSubDomain"/> is a sub domain of <paramref name="potentialDomain"/>.
/// </summary>
/// <param name="potentialSubDomain">The potential sub domain.</param>
/// <param name="potentialDomain">The potential domain.</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
internal static bool IsSubDomainOf(in ReadOnlySpan<char> potentialSubDomain,
in ReadOnlySpan<char> potentialDomain)
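The body of IsSubDomainOf is outside the expanded part of this diff; the following is only an assumed sketch of such a check, matching the signature above (a sub domain must end with a dot followed by its parent domain):

using System;

internal static class SubDomainSketch
{
    // Example: IsSubDomainOf("ads.example.com", "example.com") returns true,
    // while IsSubDomainOf("notexample.com", "example.com") returns false.
    internal static bool IsSubDomainOf(in ReadOnlySpan<char> potentialSubDomain,
        in ReadOnlySpan<char> potentialDomain)
    {
        // Needs room for at least one label and the separating dot.
        if (potentialSubDomain.Length <= potentialDomain.Length + 1)
            return false;

        return potentialSubDomain.EndsWith(potentialDomain, StringComparison.Ordinal)
               && potentialSubDomain[potentialSubDomain.Length - potentialDomain.Length - 1] == '.';
    }
}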
82 changes: 46 additions & 36 deletions HostsParser/Program.cs
@@ -60,13 +60,13 @@ internal static async Task Main()
CollectionUtilities.FilterGrouped(combined);

var sortedDnsList = CollectionUtilities.SortDnsList(combined);
HashSet<string> filtered = new(combined.Count);
sortedDnsList = ProcessCombined(sortedDnsList, adBlockBasedLines, filtered);
HashSet<string> filteredCache = new(combined.Count);
sortedDnsList = ProcessCombined(sortedDnsList, adBlockBasedLines, filteredCache);

if (settings.ExtraFiltering)
{
logger.LogInformation(WithTimeStamp("Start extra filtering of duplicates"));
sortedDnsList = ProcessWithExtraFiltering(adBlockBasedLines, sortedDnsList, filtered);
sortedDnsList = ProcessWithExtraFiltering(adBlockBasedLines, sortedDnsList, filteredCache);
logger.LogInformation(WithTimeStamp("Done extra filtering of duplicates"));
}

@@ -90,62 +90,72 @@ internal static async Task Main()

static string WithTimeStamp(string message) => $"{DateTime.Now:yyyy-MM-dd HH:mm:ss} - {message}";
}

private static List<string> ProcessWithExtraFiltering(HashSet<string> adBlockBasedLines,
List<string> combined,
HashSet<string> filtered)
{
Parallel.ForEach(CollectionUtilities.SortDnsList(adBlockBasedLines), item =>
{
for (var i = 0; i < combined.Count; i++)
{
var localItem = combined[i];
if (HostUtilities.IsSubDomainOf(localItem, item))
filtered.Add(localItem);
}
});
combined.RemoveAll(filtered.Contains);
combined = CollectionUtilities.SortDnsList(combined);
return combined;
}


private static List<string> ProcessCombined(
List<string> combined,
List<string> sortedDnsList,
HashSet<string> adBlockBasedLines,
HashSet<string> filtered)
HashSet<string> filteredCache)
{
var round = 0;
do
{
filtered.Clear();
filteredCache.Clear();
// Increase the number of items processed in each run since we'll have fewer items to loop and they'll be further apart.
var lookBack = ++round * 250;
Parallel.For(0, combined.Count, i =>
Parallel.For(0, sortedDnsList.Count, i =>
{
for (var j = (i < lookBack ? 0 : i - lookBack); j < i; j++)
{
var item = combined[i];
var otherItem = combined[j];
AddIfSubDomain(filtered, item, otherItem);
var item = sortedDnsList[i];
var otherItem = sortedDnsList[j];
AddIfSubDomain(filteredCache, item, otherItem);
}
});

// We only need to check for domains/sub domains covered by AdBlock based file
// on first run, after that sub domains covered by AdBlock based file will be gone
// and we don't want to process unnecessary entries or produce a file containing
// lines contained in the AdBlock based file
if (round == 1)
combined.RemoveAll(adBlockBasedLines.Contains);
sortedDnsList.RemoveAll(adBlockBasedLines.Contains);

combined.RemoveAll(filtered.Contains);
combined = CollectionUtilities.SortDnsList(combined);
} while (filtered.Count > 0);
sortedDnsList.RemoveAll(filteredCache.Contains);
sortedDnsList = CollectionUtilities.SortDnsList(sortedDnsList);
} while (filteredCache.Count > 0);

return combined;
return sortedDnsList;
}

/// <summary>
/// Removes entries from <paramref name="sortedDnsList"/> that are sub domains of an item in
/// <paramref name="adBlockBasedLines"/> by looping over all items in <paramref name="adBlockBasedLines"/>
/// and checking whether any item in <paramref name="sortedDnsList"/> is a sub domain of it.
/// </summary>
private static List<string> ProcessWithExtraFiltering(HashSet<string> adBlockBasedLines,
List<string> sortedDnsList,
HashSet<string> filteredCache)
{
Parallel.ForEach(CollectionUtilities.SortDnsList(adBlockBasedLines), item =>
{
for (var i = 0; i < sortedDnsList.Count; i++)
{
var localItem = sortedDnsList[i];
if (HostUtilities.IsSubDomainOf(localItem, item))
filteredCache.Add(localItem);
}
});
sortedDnsList.RemoveAll(filteredCache.Contains);
sortedDnsList = CollectionUtilities.SortDnsList(sortedDnsList);
return sortedDnsList;
}

private static void AddIfSubDomain(HashSet<string> filtered,
private static void AddIfSubDomain(HashSet<string> filteredCache,
string item,
string otherItem)
{
if (ShouldSkip(otherItem, item)) return;
if (HostUtilities.IsSubDomainOf(item, otherItem))
filtered.Add(item);
filteredCache.Add(item);
}

private static bool ShouldSkip(string otherItem,
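A sequential sketch of the look-back pass that ProcessCombined parallelizes: every entry in the sorted list is compared only against a window of preceding entries, and the caller widens the window (round * 250) on each pass as the list shrinks. The isSubDomainOf delegate stands in for HostUtilities.IsSubDomainOf.

using System;
using System.Collections.Generic;

internal static class LookBackSketch
{
    internal static HashSet<string> FindCoveredSubDomains(IReadOnlyList<string> sortedDnsList,
        int round,
        Func<string, string, bool> isSubDomainOf)
    {
        var filteredCache = new HashSet<string>();
        var lookBack = round * 250;
        for (var i = 0; i < sortedDnsList.Count; i++)
        {
            for (var j = i < lookBack ? 0 : i - lookBack; j < i; j++)
            {
                // Sorting places a domain before its sub domains, so earlier
                // entries are the only possible parents within the window.
                if (isSubDomainOf(sortedDnsList[i], sortedDnsList[j]))
                    filteredCache.Add(sortedDnsList[i]);
            }
        }

        return filteredCache;
    }
}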
6 changes: 6 additions & 0 deletions HostsParser/ReadOnlyMemoryCharComparer.cs
@@ -6,8 +6,14 @@

namespace HostsParser
{
/// <summary>
/// Comparer for <see cref="char"/>-based <see cref="ReadOnlyMemory{T}"/>.
/// </summary>
public sealed class ReadOnlyMemoryCharComparer : IComparer<ReadOnlyMemory<char>>
{
/// <summary>
/// Default instance of <see cref="ReadOnlyMemoryCharComparer"/>.
/// </summary>
public static readonly ReadOnlyMemoryCharComparer Default = new();

public int Compare(ReadOnlyMemory<char> x, ReadOnlyMemory<char> y)
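The Compare implementation itself is not expanded in this diff; one plausible shape, assuming a plain ordinal comparison over the underlying spans, would be:

using System;
using System.Collections.Generic;

public sealed class OrdinalMemoryCharComparer : IComparer<ReadOnlyMemory<char>>
{
    public static readonly OrdinalMemoryCharComparer Default = new OrdinalMemoryCharComparer();

    // Compares the wrapped character data ordinally, ignoring how the memory was created.
    public int Compare(ReadOnlyMemory<char> x, ReadOnlyMemory<char> y)
        => x.Span.CompareTo(y.Span, StringComparison.Ordinal);
}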
22 changes: 22 additions & 0 deletions HostsParser/Settings.cs
@@ -7,13 +7,35 @@

namespace HostsParser
{
/// <summary>
/// Object used at runtime to represent settings specified in appsettings.json.
/// </summary>
/// <param name="HostsBased">Settings used for processing a hosts formatted source.</param>
/// <param name="AdBlockBased">Settings used for processing a AdBlock formatted source.</param>
/// <param name="HeaderLines"><para>Setting to indicate if extra filtering should be performed.</para>
/// <para>If <see langword="true"/>, the program will check each element in the result against each other
/// and remove any entry that would be blocked by a more general entry.</para>
/// </param>
/// <param name="KnownBadHosts">Defines a set of lines that will be inserted at
/// the top of the generated file, for example copyright.</param>
/// <param name="ExtraFiltering">Array of unwanted hosts. These entries will be added to the result
/// if they're not covered by the AdBlockBased entries.
/// You can also add generalized hosts to reduce the number of entries in final results.
/// <example>HostsBased results might contain a.baddomain.com and b.baddomain.com, adding baddomain.com
/// will remove the sub domain entries and block baddomain.com and all of its subdomains.</example>
/// </param>
internal sealed record Settings(
SourceEntry HostsBased,
SourceEntry AdBlockBased,
string[] HeaderLines,
string[] KnownBadHosts,
bool ExtraFiltering);

/// <summary>
/// Settings used for processing a hosts or AdBlock formatted source.
/// </summary>
/// <param name="SourceUri">The <see cref="Uri"/> containing the hosts.</param>
/// <param name="SkipLines">Array of strings that, if present in the result from <see cref="SourceUri"/> will be filtered out.</param>
internal sealed record SourceEntry(
Uri SourceUri,
string[]? SkipLines)
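To illustrate what these records carry at runtime, a hypothetical construction in code; the URLs, skip lines, and header text below are placeholders, not values taken from the repository's appsettings.json, and the snippet assumes it lives in the same project as the records above:

using System;
using HostsParser;

var settings = new Settings(
    HostsBased: new SourceEntry(
        new Uri("https://example.com/hosts.txt"),
        new[] { "# lines like this would be skipped" }),
    AdBlockBased: new SourceEntry(
        new Uri("https://example.com/adblock.txt"),
        SkipLines: null),
    HeaderLines: new[] { "# Generated hosts file" },
    KnownBadHosts: new[] { "baddomain.com" },
    ExtraFiltering: true);

Console.WriteLine($"Extra filtering enabled: {settings.ExtraFiltering}");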
