LINQ joins objects from list HashSet, Join vs Dictionary vs HashSet

I have a HashSet where each one stores a T, I wrote a test application that compares various relationship algorithms that I can think of, however I am not very happy with the results I get.

Are there better ways to achieve object relationships and then one time when I test?

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.Diagnostics;

namespace LINQTests
{
    class Program
    {
        static void Main(string[] args)
        {
            HashSet<User> UserTable = new HashSet<User>();
            HashSet<UserProperty> UserPropertyTable = new HashSet<UserProperty>();

            #region Lets create some dummy data.

            Console.WriteLine("Please wait while creating dummy data, this can take a while...");
            Console.WriteLine("");

            int rows = 1000000;
            for(int x = 0; x < rows; x++)
            {
                Random rnd = new Random();

                // Add new user.
                User user = new User(string.Format("Joachim-{0}", x));
                if(!UserTable.Add(user))
                {
                    // throw new Exception.
                }
                else
                {
                    UserProperty age = new UserProperty(user, "Age", rnd.Next(25, 30).ToString());
                    if(!UserPropertyTable.Add(age))
                    {
                        // throw new Exception.
                    }

                    UserProperty sex = new UserProperty(user, "Sex", "Male");
                    if (!UserPropertyTable.Add(sex))
                    {
                        // throw new Exception.
                    }

                    UserProperty location = new UserProperty(user, "Location", "Norway");
                    if (!UserPropertyTable.Add(location))
                    {
                        // throw new Exception.
                    }
                }
            }

            #endregion

            #region Lets do some query tests.

            IEnumerable<User> Users;
            Stopwatch stopwatch = new Stopwatch();
            int matches = 0;

            // Lets find all users who are of age 29.
            Console.WriteLine("Finding all users who are of age 29");
            Console.WriteLine("");
            Console.WriteLine("---------------------------------------------------");
            Console.WriteLine("{0,-20} | {1,6} | {2,9}", "Search Strategy", "Found", "Time");
            Console.WriteLine("---------------------------------------------------");

            // Join test.
            stopwatch.Start();

            Users = (from user in UserTable
                     join property in UserPropertyTable on user.Id equals property.UserId
                     where property.Key == "Age" && property.Value == "29"
                     select user);

            matches = Users.Count();
            stopwatch.Stop();
            Console.WriteLine("{0,-20} | {1,6} | {2,6} ms.", "Joining Tables", matches, stopwatch.ElapsedMilliseconds);

            // Dictionary test.
            stopwatch.Restart();

            var DictionarySearch = (from t in UserPropertyTable where t.Key == "Age" && t.Value == "29" select t).ToDictionary(x => x.UserId);
            Users = (from t in UserTable where DictionarySearch.ContainsKey(t.Id) select t);

            matches = Users.Count();
            stopwatch.Stop();
            Console.WriteLine("{0,-20} | {1,6} | {2,6} ms.", "Dictionary Contain", matches, stopwatch.ElapsedMilliseconds);

            // HashSet test.
            stopwatch.Restart();

            var HashsetSearch = new HashSet<Guid>(from t in UserPropertyTable where t.Key == "Age" && t.Value == "29" select t.UserId);

            Users = (from t in UserTable where HashsetSearch.Contains(t.Id) select t);

            matches = Users.Count();
            stopwatch.Stop();
            Console.WriteLine("{0,-20} | {1,6} | {2,6} ms.", "HashSet Contain", matches, stopwatch.ElapsedMilliseconds);

            // Following takes so long that we wont run them!

            //// Array test.
            //stopwatch.Restart();

            //var ArrayMatch = (from t in UserPropertyTable where t.Key == "Age" && t.Value == "29" select t.UserId).ToArray();
            //Users = (from t in UserTable where ArrayMatch.Contains(t.Id) select t);

            //matches = Users.Count();
            //stopwatch.Stop();
            //Console.WriteLine("{0,-20} | {1,6} | {2,6} ms.", "Array Contain", matches, stopwatch.ElapsedMilliseconds);


            //// List test.
            //stopwatch.Restart();

            //var ListMatch = (from t in UserPropertyTable where t.Key == "Age" && t.Value == "29" select t.UserId).ToList();
            //Users = (from t in UserTable where ListMatch.Contains(t.Id) select t);

            //matches = Users.Count();
            //stopwatch.Stop();
            //Console.WriteLine("{0,-20} | {1,6} | {2,6} ms.", "List Contain", matches, stopwatch.ElapsedMilliseconds);


            Console.WriteLine("---------------------------------------------------");

            #endregion


            Console.WriteLine("");
            Console.WriteLine("Hit return to exit...");
            Console.Read();
        }
    }

    public class User
    {
        public User(string UserName)
        {
            this.Id = Guid.NewGuid();
            this.UserName = UserName;
        }

        public Guid Id { get; set; }
        public string UserName { get; set; }

        public override bool Equals(object obj)
        {
            User other = obj as User;

            if (other == null)
                return false;

            return this.Id == other.Id;
        }

        public override int GetHashCode()
        {
            return Id.GetHashCode();
        }
    }

    public class UserProperty
    {
        public UserProperty(User user, string key, string value)
        {
            this.Id = Guid.NewGuid();
            this.UserId = user.Id;
            this.Key = key;
            this.Value = value;
        }

        public Guid Id { get; private set; }
        public Guid UserId {get; private set;}
        public string Key { get; set; }
        public string Value { get; set; }

        public override bool Equals(object obj)
        {
            UserProperty other = obj as UserProperty;

            if (other == null)
                return false;

            return this.UserId == other.UserId && this.Key == other.Key;
        }

        public override int GetHashCode()
        {
            return string.Format("{0}-{1}", this.UserId, this.Key).GetHashCode();
        }
    }
}

      

+3


source to share


2 answers


This will make linq / join comparable to other methods:

var properties = UserPropertyTable
    .Where(p=>p.Key == "Age" && p.Value == "29")
    .ToArray();

Users = (from user in UserTable
         join property in properties
         on user.Id equals property.UserId
         select user);

      

Here is the fastest (~ 2X) I could achieve:



var filteredUserIds = new HashSet<Guid>(
        UserPropertyTable
            .Where(p=>p.Key == "Age" && p.Value == "29")
            .Select(p=>p.UserId));

Users = (from user in UserTable
         where filteredUserIds.Contains(user.Id)
         select user);

      

With an exit

-------------------------------------------------- -
Search Strategy | Found | Time
-------------------------------------------------- -
My method | 210366 | 157 ms.
Dictionary Contain | 210366 | 325 ms.
HashSet Contain | 210366 | 325 ms.
-------------------------------------------------- -
+1


source


Here are some things you can do:

  • GetHashCode is called for each added item and for each item being examined (how else does the hash table compare the two hash codes?). Equality is called at least once for each probe. Optimize them. Don't use string.Format (!). It has to parse the format string and allocate memory. UserId.GHC() * 37 ^ Name.GHC()

    should do.
  • Don't use LINQ. Write hand loops. LINQ has multiple calls to vcalls or delegates to the item being processed. This adds instructions, stops the processor, and prevents inlining.
  • Can you pre-copy the data structures? Precompute hash tables or sorted lists. Merging two sorted lists can be faster than using a hash table. It is not built into the structure. Lots of hard, customizable write code for joins.
  • Why are you including a predicate property.Key == "Age" && property.Value == "29"

    in dimensions? Will this need to be run for every execution? You can try to optimize this by using integer values ​​instead of strings. Perhaps you can pre-copy the index for UserPropertyTable

    so that you can get matching items at a constant time.


This should give you 1-10x speedup depending on how much you implement. If you need to go even faster, I must ask some questions first. You can often find special tricks that only apply to specific situations.

+1


source







All Articles