Splitting / looping through a string containing parentheses in C#

I am working with the following line:

(SENT (VBP (HPP (HP Vem))(VB kan)(VBP (VB få)(PMP (PM ATP)))(MADP (MAD ?))))

      

I would like to make the following conclusion:

SENT -> VBP -> HPP -> HP
SENT -> VBP -> VB
SENT -> VBP -> VBP -> VB
SENT -> VBP -> VBP -> PMP -> PM
SENT -> VBP -> MADP -> MAD

      

To achieve this, I first thought about looping through all the parentheses, starting at the outermost level and then going deeper and deeper, if there are any nested ones (maybe with a recursive function?).

But since there is no built-in function to split by parentheses, I tried splitting on `(` and then, inside a loop, finding the matching `)`, for example:

    var row = "(SENT (VBP (HPP (HP Vem))(VB kan)(VBP (VB få)(PMP (PM ATP)))(MADP (MAD ?))))";

    // Cut the line at every opening parenthesis and echo each fragment on its own line.
    // (A further split of each fragment on ')' was considered here but left disabled.)
    foreach (string data in row.Split('('))
    {
        Console.WriteLine(data);
    }

    // Block until Enter is pressed so the console output can be read.
    Console.ReadLine();

      

But as you can see, I am stuck, and the code above does not even represent what I am trying to do, because I found out that the logic I had in mind is wrong and it cannot be done like this.

How can I achieve this?

Update.

Larger test line:

(SENT (VBP (PPP (PP På)(NNP (NN grundval))(PPP (PP av))(NNP (DTP (DT en))(NN intervju)(PPP (PP efter)(NNP (NN experimentet)))(PPP (PP med)(PCP (DTP (DT de))(PC oinvigda)(VBP (HPP (HP som))(VB gjort)(NNP (JJP (JJ felaktiga))(NN bedömningar)))))))(VB kunde)(PNP (PN man))(VBP (VB dela)(PLP (PL in))(PNP (PN dem))(PPP (PP i)(NNP (RGP (RG tre))(NN grupper)(MIDP (MID :))(KNP (NNP (NN (a)))(PNP (PN de)(VBP (HPP (HP som))(ABP (AB faktiskt))(VB trodde)(SNP (SN att)(VBP (PNP (PN de))(VB bedömt)(ABP (AB riktigt))))))(MIDP (MID ,))(PNP (NNP (NN (b)))(PN de)(VBP (HPP (HP som))(VB trodde)(SNP (SN att)(VBP (DTP (DT de)(JJP (JJ själva)))(VB måste)(VBP (VB ha)(VBP (VB misstagit)(PNP (PN sig))(SNP (SN eftersom)(VBP (ABP (AB inte))(PNP (ABP (AB så))(PN många))(VB kan)(VBP (VB ha)(ABP (AB fel))(PPP (PP mot)(NNP (DTP (DT en))(JJP (JJ enda))(NN person))))))))))))(KN och)(PNP (NNP (NN (c)))(PN de)(KNP (VBP (HPP (HP som))(ABP (AB faktiskt))(VB var)(JJP (JJ medvetna))(PPP (PP om)(SNP (SN att)(VBP (PNP (PN de))(VB angav)(NNP (JJP (JJ felaktiga))(NN bedömningar))))))(KN men)(VBP (HPP (HP som))(ABP (AB inte))(VB ville)(VBP (VB avvika)(PPP (PP från)(NNP (NN gruppen)))))))))))(MADP (MAD .))))

      

+3


source to share


2 answers


Here is another answer I am trying to understand:

/// <summary>
/// Prints every root-to-leaf label path of a bracketed parse tree such as
/// <c>(SENT (VBP (HP Vem)(VB kan)))</c>, one path per line, with the labels
/// joined by <c>" -> "</c>.
/// </summary>
public class Class1
{
    public static void Main()
    {
        new Class1().myRec("(SENT (VBP (HPP (HP Vem))(VB kan)(VBP (VB få)(PMP (PM ATP)))(MADP (MAD ?))))", null);
    }


    /// <summary>
    /// Recursively walks one parenthesized group and writes a line to the console
    /// for every leaf reached below it.
    /// </summary>
    /// <param name="input">
    /// Either a group of the form <c>(LABEL child child ...)</c> or a leaf word.
    /// <c>null</c> is ignored. Surrounding whitespace is ignored. The parentheses
    /// are expected to be balanced; unbalanced input is not diagnosed.
    /// </param>
    /// <param name="start">
    /// The path accumulated so far (labels joined by <c>" -> "</c>), or <c>null</c>
    /// when called for the root.
    /// </param>
    public void myRec(string input, string start)
    {
        if (input == null)
            return;

        // Tolerate whitespace around a group, e.g. between siblings "(B x) (C y)".
        input = input.Trim();

        // Anything that is not a parenthesized group (including an empty segment)
        // is a leaf: the path collected so far is complete, so emit it.
        if (input.Length == 0 || input[0] != '(' || input[input.Length - 1] != ')')
        {
            Console.WriteLine(start);
            return;
        }

        // Strip the enclosing parentheses of this group.
        string inner = input.Substring(1, input.Length - 2).Trim();

        // The label runs up to the first whitespace or the first nested group,
        // so "(A(B x))" and "(A (B x))" are read the same way.
        int labelEnd = 0;
        while (labelEnd < inner.Length && inner[labelEnd] != '(' && !char.IsWhiteSpace(inner[labelEnd]))
            labelEnd++;

        string label = inner.Substring(0, labelEnd);
        string rest = inner.Substring(labelEnd).Trim();

        // Skip empty labels so a group like "()" does not leave a dangling " -> ".
        if (label.Length > 0)
            start = start == null ? label : start + " -> " + label;

        // Split the remainder into its top-level groups by tracking nesting depth;
        // a group ends at the ')' that brings the depth back to zero.
        List<string> subStrs = new List<string>();
        int depth = 0;
        int segmentStart = 0;
        for (int j = 0; j < rest.Length; j++)
        {
            if (rest[j] == '(')
                depth++;
            else if (rest[j] == ')')
            {
                depth--;
                if (depth == 0)
                {
                    subStrs.Add(rest.Substring(segmentStart, j - segmentStart + 1));
                    segmentStart = j + 1;
                }
            }
        }

        // No nested group at all: the remainder is the leaf word (possibly empty).
        if (subStrs.Count == 0)
            subStrs.Add(rest);

        foreach (string child in subStrs)
            myRec(child, start);
    }
}

      

It uses recursion, and it only works when your input is well formed, i.e. it has an equal number of `(` and `)`. Also, I am not a C# programmer, so I know this code could be greatly improved.

Edit 1: replaced the array with a list to make the code cleaner.

Edit 2: to make it work for input in which a node may contain no whitespace, such as the OP's new larger test case, I made the following change:

replace this in my code:



        // Earlier version: i is the index of the first space in input, so when the
        // node contains no space (i == -1) Substring(0, i) throws.
        if (start != null)
            start = start + " -> " + input.Substring(0, i);
        else
            start = input.Substring(0, i);

      

with this:

    // Use the text before the first space as the label; when no space was found
    // at a positive index, fall back to the whole input.
    string nextInput = i>0?input.Substring(0, i):input;

    // Append the label to the path built so far (start is null at the root).
    if (start != null)
        start = start + " -> " + nextInput;
    else
        start = nextInput;

      

(I have already applied this change to the code above.)

and here is the result:

SENT -> VBP -> PPP -> PP
SENT -> VBP -> PPP -> NNP -> NN
SENT -> VBP -> PPP -> PPP -> PP
SENT -> VBP -> PPP -> NNP -> DTP -> DT
SENT -> VBP -> PPP -> NNP -> NN
SENT -> VBP -> PPP -> NNP -> PPP -> PP
SENT -> VBP -> PPP -> NNP -> PPP -> NNP -> NN
SENT -> VBP -> PPP -> NNP -> PPP -> PP
SENT -> VBP -> PPP -> NNP -> PPP -> PCP -> DTP -> DT
SENT -> VBP -> PPP -> NNP -> PPP -> PCP -> PC
SENT -> VBP -> PPP -> NNP -> PPP -> PCP -> VBP -> HPP -> HP
SENT -> VBP -> PPP -> NNP -> PPP -> PCP -> VBP -> VB
SENT -> VBP -> PPP -> NNP -> PPP -> PCP -> VBP -> NNP -> JJP -> JJ
SENT -> VBP -> PPP -> NNP -> PPP -> PCP -> VBP -> NNP -> NN
SENT -> VBP -> VB
SENT -> VBP -> PNP -> PN
SENT -> VBP -> VBP -> VB
SENT -> VBP -> VBP -> PLP -> PL
SENT -> VBP -> VBP -> PNP -> PN
SENT -> VBP -> VBP -> PPP -> PP
SENT -> VBP -> VBP -> PPP -> NNP -> RGP -> RG
SENT -> VBP -> VBP -> PPP -> NNP -> NN
SENT -> VBP -> VBP -> PPP -> NNP -> MIDP -> MID
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> NNP -> NN -> a
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> PN
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> HPP -> HP
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> ABP -> AB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> VB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> SN
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> PNP -> PN
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> ABP -> AB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> MIDP -> MID
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> NNP -> NN -> b
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> PN
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> HPP -> HP
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> VB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> SN
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> DTP -> DT
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> DTP -> JJP -> JJ
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VBP -> VB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VBP -> VBP -> VB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VBP -> VBP -> PNP -> PN
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VBP -> VBP -> SNP -> SN
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VBP -> VBP -> SNP -> VBP -> ABP -> AB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VBP -> VBP -> SNP -> VBP -> PNP -> ABP -> AB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VBP -> VBP -> SNP -> VBP -> PNP -> PN
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VBP -> VBP -> SNP -> VBP -> VB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VBP -> VBP -> SNP -> VBP -> VBP -> VB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VBP -> VBP -> SNP -> VBP -> VBP -> ABP -> AB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VBP -> VBP -> SNP -> VBP -> VBP -> PPP -> PP
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VBP -> VBP -> SNP -> VBP -> VBP -> PPP -> NNP -> DTP -> DT
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VBP -> VBP -> SNP -> VBP -> VBP -> PPP -> NNP -> JJP -> JJ
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> VBP -> SNP -> VBP -> VBP -> VBP -> SNP -> VBP -> VBP -> PPP -> NNP -> NN
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> KN
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> NNP -> NN -> c
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> PN
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> HPP -> HP
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> ABP -> AB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> VB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> JJP -> JJ
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> PPP -> PP
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> PPP -> SNP -> SN
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> PPP -> SNP -> VBP -> PNP -> PN
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> PPP -> SNP -> VBP -> VB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> PPP -> SNP -> VBP -> NNP -> JJP -> JJ
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> PPP -> SNP -> VBP -> NNP -> NN
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> KN
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> HPP -> HP
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> ABP -> AB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> VB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> VBP -> VB
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> VBP -> PPP -> PP
SENT -> VBP -> VBP -> PPP -> NNP -> KNP -> PNP -> KNP -> VBP -> VBP -> PPP -> NNP -> NN
SENT -> VBP -> MADP -> MAD

      

+1


source


I would tokenize the data, iterate over the tokens to build an AST that represents the model according to whatever rules you have, and then walk the AST.

Edit: Something isn't quite right here, but I don't have time to debug it.



using System;
using System.Collections.Generic;
/// <summary>
/// Parses a bracketed tree such as <c>(SENT (VBP (HP Vem)(VB kan)))</c> into
/// nodes and prints the label path from the root to every leaf node.
/// </summary>
static class Program
{
    static void Main()
    {
        string input = "(SENT (VBP (HPP (HP Vem))(VB kan)(VBP (VB få)(PMP (PM ATP)))(MADP (MAD ?))))";
        foreach (string path in GetPaths(input))
        {
            Console.WriteLine(path);
        }
    }

    /// <summary>
    /// Builds the tree for <paramref name="input"/> and returns one
    /// <c>" -> "</c>-separated label path per leaf node, in document order.
    /// </summary>
    /// <param name="input">The bracketed text; null or blank yields no paths.</param>
    /// <returns>The root-to-leaf label paths.</returns>
    /// <exception cref="InvalidOperationException">The parentheses are unbalanced.</exception>
    internal static List<string> GetPaths(string input)
    {
        Node root = Parse(input);
        var paths = new List<string>();

        // Nothing was parsed at all: there is no path to report.
        if (root.Children.Count == 0 && root.Value == null)
            return paths;

        Write(new List<Node>(), root, paths);
        return paths;
    }

    /// <summary>
    /// Turns the token stream into a tree hanging off an unnamed synthetic root.
    /// The first word inside a group becomes the node's Value; later words go to Args.
    /// </summary>
    private static Node Parse(string input)
    {
        Node root = new Node(), current = root;
        // Nodes whose groups are still open, innermost on top.
        var ast = new Stack<Node>();
        foreach (var token in Tokenize(input))
        {
            switch (token)
            {
                case "(":
                    // new sub-node
                    Node next = new Node();
                    current.Children.Add(next);
                    ast.Push(current);
                    current = next;
                    break;
                case ")":
                    // go back a level; a ')' with nothing open is malformed input
                    if (ast.Count == 0) throw new InvalidOperationException("unbalanced");
                    current = ast.Pop();
                    break;
                case " ":
                    // nothing
                    break;
                default:
                    if (current.Value == null)
                        current.Value = token;
                    else
                        current.Args.Add(token);
                    break;
            }
        }
        if (ast.Count != 0) throw new InvalidOperationException("unbalanced");
        return root;
    }

    /// <summary>
    /// Depth-first walk that appends one path to <paramref name="paths"/> for each
    /// node without children.
    /// </summary>
    /// <param name="ancestors">
    /// The chain from the root down to the parent of <paramref name="node"/>. It is
    /// used as a stack: a node is appended before its children are visited and
    /// removed from the END afterwards. (A FIFO queue would drop the root instead of
    /// the node just finished and corrupt every path after the first.)
    /// </param>
    /// <param name="node">The node to visit.</param>
    /// <param name="paths">Receives the formatted paths.</param>
    private static void Write(List<Node> ancestors, Node node, List<string> paths)
    {
        if (node.Children.Count == 0)
        {
            var labels = new List<string>();
            foreach (var parent in ancestors)
            {
                // The synthetic root has no label; leave it out of the path.
                if (!string.IsNullOrWhiteSpace(parent.Value))
                {
                    labels.Add(parent.Value);
                }
            }
            labels.Add(node.Value);
            paths.Add(string.Join(" -> ", labels));
        }
        else
        {
            ancestors.Add(node);
            foreach (var child in node.Children)
            {
                Write(ancestors, child, paths);
            }
            ancestors.RemoveAt(ancestors.Count - 1);
        }
    }

    /// <summary>One parenthesized group of the input.</summary>
    class Node
    {
        /// <summary>The first word inside the group (its label), or null if none was seen.</summary>
        public string Value { get; set; }
        private readonly List<Node> children = new List<Node>();
        private readonly List<string> args = new List<string>();
        /// <summary>Nested groups, in input order.</summary>
        public List<Node> Children { get { return children; } }
        /// <summary>Words that follow the label inside the group.</summary>
        public List<string> Args { get { return args; } }
    }

    /// <summary>
    /// Splits <paramref name="value"/> into tokens: each '(' , ')' and ' ' is its own
    /// token, and every run of other characters between them is a word token.
    /// Text after the last delimiter is returned as a final word.
    /// </summary>
    static IEnumerable<string> Tokenize(string value)
    {
        if (string.IsNullOrWhiteSpace(value)) yield break;

        int wordStart = 0;
        for (int i = 0; i < value.Length; i++)
        {
            char c = value[i];
            if (c != '(' && c != ')' && c != ' ') continue;

            // Flush the word that ended just before this delimiter, if any.
            if (i > wordStart) yield return value.Substring(wordStart, i - wordStart);
            yield return c.ToString();
            wordStart = i + 1;
        }

        // Trailing word with no delimiter after it.
        if (wordStart < value.Length) yield return value.Substring(wordStart);
    }
}

      

0


source







All Articles