Multiple regex string patterns (different fields)

I am trying to extract text from a text document with the following format and insert the data into a SQL database.

Word document

Name of House: Aasleagh Lodge
Townland: Srahatloe
Near: Killary Harbour, Leenane
Status/Public Access: maintained, private fishing lodge
Date Built: 1838-1850, burnt 1923, rebuilt 1928

      

Source

        var wordApp = new Microsoft.Office.Interop.Word.Application();
        var wordDoc = wordApp.Documents.Open(@"C:\Users\mhoban\Documents\Book.docx");
        var txt = wordDoc.Content.Text;

        var regex = new Regex(@"(Name of House\: )(.+?)[\r\n]");

        var allMatches = regex.Matches(txt);
        foreach (Match match in allMatches)
        {
            var nameValue = match.Groups[2].Value;
            var townValue = match.Groups[2].Value;

            SqlConnection con = new SqlConnection(ConfigurationManager.ConnectionStrings["ConnectionString"].ToString());
            SqlCommand com = new SqlCommand();

            com.CommandText = "INSERT INTO Houses (Name, Townland) VALUES (@name, @town)";

            com.Parameters.Add("@name", SqlDbType.NVarChar).SqlValue = nameValue;
            com.Parameters.Add("@town", SqlDbType.NVarChar).SqlValue = townValue;

            com.Connection = con;

            con.Open();

            com.ExecuteNonQuery();

            con.Close();
        }

      

This works great as soon as I write the code to insert other text fields like this line

var regex = new Regex(@"(Name of House\: )(.+?)[\r\n]");

      

Inserts the name of the house in this case "Aasleagh Lodge", but how would I write this line to insert the city?

I tried replacing "Townland" in the regex with the field name I want, but I end up with unique entries, each with only one column value.

Is there a way that I could insert data at the same time, perhaps using a list or something similar to prevent this from happening.

New source code

var wordApp = new Microsoft.Office.Interop.Word.Application();
            var wordDoc = wordApp.Documents.Open(@"C:\Users\mhoban\Documents\Book.docx");
            var txt = wordDoc.Content.Text;

            using (var sr = new StringReader(txt))
            {
                var s = string.Empty;
                var nameValue = new StringBuilder();
                var townValue = new StringBuilder();
                while ((s = sr.ReadLine()) != null)
                {
                    if (s.StartsWith("Name of House"))
                    {
                        nameValue.Append(s.Split(new[] { ':' })[1].Trim());
                    }
                    else if (s.StartsWith("Townland"))
                    {
                        townValue.Append(s.Split(new[] { ':' })[1].Trim());
                    }

                    if (nameValue.Length > 0 && townValue.Length > 0)
                    {
                        SqlConnection con = new SqlConnection(ConfigurationManager.ConnectionStrings["ConnectionString"].ToString());
                        SqlCommand com = new SqlCommand();

                        com.CommandText = "INSERT INTO Houses (Name, Townland) VALUES (@name, @town)";
                        com.CommandText = "INSERT INTO Houses (Name) VALUES (@name)";

                        com.Parameters.Add("@name", SqlDbType.NVarChar).SqlValue = nameValue;
                        com.Parameters.Add("@town", SqlDbType.NVarChar).SqlValue = townValue;

                        com.Connection = con;

                        con.Open();

                        com.ExecuteNonQuery();

                        con.Close();

                        nameValue.Clear(); townValue.Clear();
                    }
                }
            }

      

Database fields

[Id]          NCHAR (10)     NULL,
[Name]        NVARCHAR (MAX) NULL,
[Townland]    NVARCHAR (MAX) NULL,
[Near]        NVARCHAR (MAX) NULL,
[Status]      NVARCHAR (MAX) NULL,
[Built]       NVARCHAR (MAX) NULL,
[Description] NVARCHAR (MAX) NULL,
[Families]    NVARCHAR (MAX) NULL,
[Images]      IMAGE          NULL

      

+3


source to share


4 answers


Here is a solution without regex. You really don't need it.



var txt = "Name of House: Aasleagh Lodge\r\nTownland: Srahatloe\r\nNear: Killary Harbour, Leenane\r\nStatus/Public Access: maintained, private fishing lodge\r\nDate Built: 1838-1850, burnt 1923, rebuilt 1928\r\nName of House: House of Lan\r\nTownland: Another town land\r\nNear: Killary Harbour, Leenane\r\nStatus/Public Access: maintained, private fishing lodge\r\nDate Built: 1838-1850, burnt 1923, rebuilt 1928\r\nName of House: New Lodge\r\nTownland: NewTownLand\r\nNear: Killary Harbour, Leenane\r\nStatus/Public Access: maintained, private fishing lodge\r\nDate Built: 1838-1850, burnt 1923, rebuilt 1928";
using (var sr = new StringReader(txt))
{
   var s = string.Empty;
   var nameOfHouse = new StringBuilder();
   var townland = new StringBuilder();
   while ((s = sr.ReadLine()) != null)
   {
      if (s.StartsWith("Name of House"))
      {
          nameOfHouse.Append(s.Split(new[] {':'})[1].Trim());
      }
      else if (s.StartsWith("Townland"))
      {
           townland.Append(s.Split(new[] { ':' })[1].Trim());
      }

      if (nameOfHouse.Length > 0 && townland.Length > 0)
      { 
          // INSERT THE VALUES AND RESET THEM
          nameOfHouse.Clear(); townland.Clear();
      }
   }
}

      

+1


source


You can use the following regex:

(.*?\: )(.+?)[\r\n]

      

See DEMO



Also .. you could replace [\r\n]

in your regex $

to match the latter, for example where \r or \n

are optional.

i.e:

var regex = new Regex(@"^(.*?\: )(.+?)$");

      

0


source


Yes it is possible. However, simply creating a regex only solves half of the problem, because you also need to know which database column each value is displayed in.

Here is the general approach I would take:

  • You have something that defines the name of each parameter that can appear in the file and the corresponding column in the database. It might just be a dictionary in your code, but more mature design involves including it in some kind of external configuration.

  • Use a simple String.Split

    based on :

    each line to put all key / value pairs into the dictionary (regex overkill here).

  • Create an insert statement based on the data in the first two steps above.

0


source


try it

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.IO;
using System.Data;
using System.Data.SqlClient;

namespace ConsoleApplication21
{
    class Program
    {
        const string FILENAME = @"c:\temp\test.txt";
        static void Main(string[] args)
        {
            StreamReader reader = new StreamReader(FILENAME);
            string inputLine = "";
            List<TakenBMI> takenBMIs = new List<TakenBMI>();
            TakenBMI newTakenBMI = null;
            while ((inputLine = reader.ReadLine()) != null)
            {
                inputLine = inputLine.Trim();
                if (inputLine.Length > 0)
                {
                    string[] inputArray = inputLine.Split(new char[] { ':' });
                    switch (inputArray[0].Trim())
                    {
                        case "Name of House":
                            newTakenBMI = new TakenBMI();
                            takenBMIs.Add(newTakenBMI);
                            newTakenBMI.Name_of_House = inputArray[1].Trim();
                            break;
                        case "Townland":
                            newTakenBMI.Townland = inputArray[1].Trim();
                            break;
                        case "Near":
                            newTakenBMI.Near = inputArray[1].Trim();
                            break;
                        case "Status/Public Access":
                            newTakenBMI.Status_Public_Access = inputArray[1].Trim();
                            break;
                        case "Date Built":
                            newTakenBMI.Date_Built = inputArray[1].Trim();
                            break;
                    }
                }

            }

            reader.Close();

            SqlConnection con = new SqlConnection(ConfigurationManager.ConnectionStrings["ConnectionString"].ToString());
            con.Open();
            string SQL = "INSERT INTO Houses (Name, Townland, Near, Status, Built)" +
                "VALUES ('@name', '@town', '@near', '@status', '@built')";

            SqlCommand com = new SqlCommand(SQL,con);
           

            com.Parameters.Add("@name", SqlDbType.NVarChar);
            com.Parameters.Add("@town", SqlDbType.NVarChar);
            com.Parameters.Add("@near", SqlDbType.NVarChar);
            com.Parameters.Add("@status", SqlDbType.NVarChar);
            com.Parameters.Add("@built", SqlDbType.NVarChar);
 
            foreach (TakenBMI takenBMI in takenBMIs)
            {
                com.Parameters["@name"].Value = takenBMI.Name_of_House ;
                com.Parameters["@town"].Value = takenBMI.Townland;
                com.Parameters["@near"].Value = takenBMI.Near;
                com.Parameters["@status"].Value = takenBMI.Status_Public_Access;
                com.Parameters["@built"].Value = takenBMI.Date_Built;
               
                com.ExecuteNonQuery();
            }

        }
    }
    public class TakenBMI
    {
        public string Name_of_House { get; set; }
        public string Townland { get; set; }
        public string Near { get; set; }
        public string Status_Public_Access { get; set; }
        public string Date_Built { get; set; }
    }
}
      

Run code


0


source







All Articles