PostgreSQL: regex replaces first level square brackets with curly braces

I have data in a PostgreSQL column of type TEXT on which I need to do character substitution. Specifically, I want to replace square brackets with curly braces. The trick is that I only want to replace the brackets that are no more than two levels deep, counting the outermost enclosing brackets. These strings can be quite long, so I think a regex is probably the way to go (the regexp_replace function), but I'm not very good at regex expressions. Here's an example of one such value:

[0,0,0,[12,2],0,0,[12,[1,2,3]],12,0,[12,2,[2]],12,0,12,0,0]

      

So I would like this line to change to:

{0,0,0,{12,2},0,0,{12,[1,2,3]},12,0,{12,2,[2]},12,0,12,0,0}

      

Thanks in advance!

+3


source to share


3 answers


This will be painful with PostgreSQL-style regex, as it probably does not support recursion.

For a maximum of 2 levels of nesting depth, check whether the following double replacement works (I can't test it):

-- Two nested regexp_replace calls apply the same pattern twice ('g' = replace
-- every match). The pattern matches a [...] group whose contents hold at most
-- two further levels of nested brackets, and rewrites the group's own bracket
-- pair as {...} while keeping the contents (\1) unchanged.
regexp_replace(
  -- Pass 1 (inner call). 'str' is presumably a placeholder for the input value.
  regexp_replace('str', E'\\[(([^][]|\\[([^][]|\\[[^][]*\\])*\\])*)\\]', E'{\\1}', 'g')
  -- Pass 2 (outer call): the same pattern applied to the result of pass 1.
, E'\\[(([^][]|\\[([^][]|\\[[^][]*\\])*\\])*)\\]', E'{\\1}', 'g')

      

The idea is to match and replace the outermost [] in two passes. See an example in regex101:

pass 1 : {0,0,0,[12,2],0,0,[12,[1,2,3]],12,0,[12,2,[2]],12,0,12,0,0}


pass 2 :{0,0,0,{12,2},0,0,{12,[1,2,3]},12,0,{12,2,[2]},12,0,12,0,0}

\[[^][]*\]

(unescaped) matches an instance [...]

  • \[

    open square bracket
  • [^][]*

    followed by any number of characters that are not square brackets
  • \]

    followed by a closing square bracket

Note that if the string always starts with [, ends with ], and represents a single level-0 instance (i.e. there is no ][ boundary at the top level), the first / inner regexp_replace can also be done by replacing the [ at the beginning (^) and the ] at the end ($): replace E'^\\[(.*)\\]$' with E'{\\1}'




To add more nesting, here is an example with a maximum of 4 levels of depth:

\[([^][]|    # outer
\[([^][]|    # lvl 1
\[([^][]|    # lvl 2
\[([^][]|    # lvl 3
\[[^][]*\]   # lvl 4
)*\]
)*\]
)*\]
)*\]

      

By wrapping what's inside the outer [] in a capture group, the pattern for the 4 levels becomes:

\[(([^][]|\[([^][]|\[([^][]|\[([^][]|\[[^][]*\])*\])*\])*\])*)\]

      

For use with regexp_replace, additional escaping of the [] is probably required:

\\[(([^][]|\\[([^][]|\\[([^][]|\\[([^][]|\\[[^][]*\\])*\\])*\\])*\\])*)\\]

      

This can be used like the first pattern, in two passes, with E'{\\1}' as the replacement.

+3


source


It's ugly, but it works (and avoids the complexities of regexp ;). I hope I have covered all the corner cases ...



-- replbracket(_source text) -> text
--
-- Rewrites square brackets as curly braces for the two outermost nesting
-- levels of _source; brackets nested more deeply are copied unchanged.
--
--   _source : text to convert
--   returns : the converted text, or NULL when _source is NULL
--
-- The input is consumed left to right, one bracket per loop iteration:
-- the text before the next bracket is appended to the result, followed by
-- the (possibly rewritten) bracket, and the consumed prefix is dropped.
CREATE OR REPLACE FUNCTION replbracket( _source text ) returns text
AS $func$
DECLARE
        pos_end INTEGER;
        pos_begin INTEGER;
        level INTEGER := 0;
        result text := '';
BEGIN
        -- position() yields NULL for a NULL input, so none of the tests in
        -- the loop below would ever be true and the loop would never exit.
        IF _source IS NULL THEN
                RETURN NULL;
        END IF;

LOOP
        pos_begin := position('[' IN _source);
        pos_end := position(']' IN _source);

        -- No bracket left: the remaining tail is appended after the loop.
        EXIT WHEN pos_begin < 1 AND pos_end < 1;

        -- Only one kind of bracket left: move the missing position past the
        -- existing one so the comparison below picks the bracket that exists.
        IF pos_begin < 1 THEN
                pos_begin := pos_end + 1;
        ELSIF pos_end < 1 THEN
                pos_end := pos_begin + 1;
        END IF;

        IF pos_begin < pos_end THEN
                -- Next bracket is an opener: enter a level, then decide.
                level := level + 1;
                result := result || left(_source, pos_begin - 1)
                        || CASE WHEN level <= 2 THEN '{' ELSE '[' END;
                _source := substr(_source, pos_begin + 1);
        ELSE
                -- Next bracket is a closer: leave the level first, hence the
                -- comparison against 2 is strict here.
                level := level - 1;
                result := result || left(_source, pos_end - 1)
                        || CASE WHEN level < 2 THEN '}' ELSE ']' END;
                _source := substr(_source, pos_end + 1);
        END IF;
END LOOP;
        -- Whatever follows the last bracket is copied verbatim.
        RETURN result || _source;
END

$func$ LANGUAGE plpgsql;

      

+3


source


Just for kicks, here's a solution entirely in SQL. It uses CTEs for notational clarity, but you could use subqueries in FROM instead; there is no recursive use of CTEs.

Edit: Added a simplified, faster SQL version, a PL/Python version, and a C version. The C one is "slightly" faster - about 250 times faster.

-- repl(text) -> text
-- Pure-SQL variant: turns [ ] into { } for the two outermost bracket levels
-- of $1 and leaves deeper brackets untouched. Works character by character
-- using non-recursive CTEs.
CREATE OR REPLACE FUNCTION repl(text)
RETURNS text
LANGUAGE sql
AS $$
WITH
letters(idx, c) AS (
    -- Explode the string into one row per character, numbered in the order
    -- the rows are produced. (From PostgreSQL 9.4 on, UNNEST ... WITH
    -- ORDINALITY can do the same job.)
    SELECT row_number() OVER (), c
    FROM regexp_split_to_table($1, '') c
),
depths(c, idx, depth) AS (
    -- Running bracket depth at each character: +1 for every '[' and -1 for
    -- every ']' seen up to and including this position.
    SELECT
        c,
        idx,
        sum(CASE WHEN c = '[' THEN 1 WHEN c = ']' THEN -1 ELSE 0 END) OVER (ORDER BY idx)
    FROM letters
),
rewritten(c, idx) AS (
    -- Swap the bracket when it belongs to one of the two outer levels. A
    -- closing bracket has already lowered the running depth by one, so its
    -- threshold is one less than the opening bracket's.
    SELECT
        CASE
            WHEN c = '[' AND depth <= 2 THEN '{'
            WHEN c = ']' AND depth <= 1 THEN '}'
            ELSE c
        END,
        idx
    FROM depths
)
-- Glue the characters back together in their original order.
SELECT string_agg(c, '' ORDER BY idx)
FROM rewritten;
$$;

      

However, it is slower than other solutions.

  • Johnny 5's regexp solution takes 450ms for 10,000 iterations.
  • wildplasser's replbracket takes 950ms for 10,000 iterations.
  • This CTE solution takes 2050ms for 10,000 iterations.

Getting rid of the CTEs and using unnest ... with ordinality speeds it up to about 1400ms:

-- repl(text) -> text
-- CTE-free variant: turns [ ] into { } for the two outermost bracket levels
-- of $1 and leaves deeper brackets untouched.
CREATE OR REPLACE FUNCTION repl(text)
RETURNS text
LANGUAGE sql
VOLATILE
AS $$
    SELECT
      string_agg(
        -- A closing bracket has already lowered the running depth by one,
        -- so its threshold is one less than the opening bracket's.
        CASE
          WHEN c = '[' AND depth <= 2 THEN '{'
          WHEN c = ']' AND depth <= 1 THEN '}'
          ELSE c
        END,
        '' ORDER BY idx
      )
    FROM (
        -- One row per character with its position and the running bracket
        -- depth up to and including that character.
        SELECT
          c,
          idx,
          sum(CASE c WHEN '[' THEN 1 WHEN ']' THEN -1 ELSE 0 END) OVER (ORDER BY idx) AS depth
        FROM unnest(regexp_split_to_array($1, '')) WITH ORDINALITY AS letters(c, idx)
    ) AS leveled
$$;

      

If you want it fast, use a proper procedural language - or C. In PL/Python2:

-- replpy(instr text) -> text
-- PL/Python variant: turns [ ] into { } for the two outermost bracket levels
-- of instr and leaves deeper brackets untouched. Returns NULL for NULL input.
create or replace function replpy(instr text) returns text language plpythonu as $$
# A SQL NULL argument arrives as None; iterating over it would raise a
# TypeError, so hand NULL straight back instead.
if instr is None:
    return None

def pyrepl(chars):
    """Yield the characters of `chars`, swapping brackets of the outer two levels."""
    level = 0
    for ch in chars:
        if ch == '[':
            # Enter the level first, then decide based on the new depth.
            level += 1
            yield '{' if level <= 2 else '['
        elif ch == ']':
            # Decide based on the depth being closed, then leave the level.
            yield '}' if level <= 2 else ']'
            level -= 1
        else:
            yield ch

return ''.join(pyrepl(instr))
$$;

      

takes 160 ms.

OK, flogging a dead horse, let's do it in C. The full source code as an extension is here, but here's the .c file:

#include "postgres.h"
#include "fmgr.h"
#include "utils/builtins.h"

PG_MODULE_MAGIC;

PG_FUNCTION_INFO_V1(replc);
Datum replc(PG_FUNCTION_ARGS);

/*
 * replc(text) -> text
 *
 * Rewrites '[' / ']' as '{' / '}' for the two outermost nesting levels of
 * the argument; brackets nested more deeply are left unchanged. The
 * replacement is done in place on a private copy of the input.
 *
 * Returns SQL NULL when the argument is NULL. Emits a WARNING when the
 * number of opening and closing brackets differs.
 */
PGDLLEXPORT Datum
replc(PG_FUNCTION_ARGS)
{
    char   *buf;
    char   *ch;
    int     depth = 0;

    /*
     * Guard against a NULL argument in case the SQL-level declaration is not
     * STRICT; fetching the argument below would otherwise read a null datum.
     */
    if (PG_ARGISNULL(0))
        PG_RETURN_NULL();

    /* Set `buf` to a palloc'd copy of the input string, deTOASTed if needed */
    buf = text_to_cstring(PG_GETARG_TEXT_PP(0));

    for (ch = buf; *ch != '\0'; ch++)
    {
        switch (*ch)
        {
            case '[':
                /* Enter the level first, then decide on the new depth. */
                depth++;
                if (depth <= 2)
                    *ch = '{';
                break;
            case ']':
                /* Decide on the depth being closed, then leave the level. */
                if (depth <= 2)
                    *ch = '}';
                depth--;
                break;
        }
    }

    /* Report the surplus with the bracket kind that is actually in excess. */
    if (depth > 0)
        ereport(WARNING,
                (errmsg("Opening and closing []s did not match, got %d extra [s", depth)));
    else if (depth < 0)
        ereport(WARNING,
                (errmsg("Opening and closing []s did not match, got %d extra ]s", -depth)));

    PG_RETURN_DATUM(CStringGetTextDatum(buf));
}

      

Execution time: 8ms for 10,000 iterations. Good enough, it's 250x faster than the original, and that comes with the overhead of a forced subquery.

+3


source







All Articles