Longest common substring in large text

Question

Longest common substring in large text

I have this assignment for a school that asks us to write code to find the longest common substring. I did this, but it only works on text that is not that big and is asked to find a common substring for Moby Dick and War And Peace. If you could point me in the right direction on what I am doing wrong, I would appreciate it. The compiler complains that the error lies in the substring method of the MyString class when I call it to create the SuffixArray, but idk why its saying is too big giving me an outofmemory

package datastructuresone;

import java.io.File;
import java.io.FileNotFoundException;
import java.util.Arrays;
import java.util.Scanner;


 class SuffixArray
 {

private final MyString[] suffixes;
private final int N;

public SuffixArray(String s)
{
    N = s.length();
    MyString snew = new MyString(s);
    suffixes = new MyString[N];
    for (int i = 0; i < N; i++)
    {
        suffixes[i] = snew.substring(i);
    }
    Arrays.sort(suffixes);
}

public int length()
{
    return N;
}

public int index(int i)
{
    return N - suffixes[i].length();
}

public MyString select(int i)
{
    return suffixes[i];
}

// length of longest common prefix of s and t
private static int lcp(MyString s, MyString t)
{
    int N = Math.min(s.length(), t.length());
    for (int i = 0; i < N; i++)
    {
        if (s.charAt(i) != t.charAt(i))
        {
            return i;
        }
    }
    return N;
}

// longest common prefix of suffixes(i) and suffixes(i-1)
public int lcp(int i)
{
    return lcp(suffixes[i], suffixes[i - 1]);
}

// longest common prefix of suffixes(i) and suffixes(j)
public int lcp(int i, int j)
{
    return lcp(suffixes[i], suffixes[j]);

}
}

public class DataStructuresOne
{

public static void main(String[] args) throws FileNotFoundException
{
    Scanner in1 = new Scanner(new File("./build/classes/WarAndPeace.txt"));
    Scanner in2 = new Scanner(new File("./build/classes/MobyDick.txt"));

    StringBuilder sb = new StringBuilder();
    StringBuilder sb1 = new StringBuilder();

    while (in1.hasNextLine())
    {
        sb.append(in1.nextLine());

    }
    while (in2.hasNextLine())
    {
        sb1.append(in2.nextLine());
    }

    String text1 = sb.toString().replaceAll("\\s+", " ");
    String text2 = sb1.toString().replaceAll("\\s+", " ");

    int N1 = text1.length();
    int N2 = text2.length();

    SuffixArray sa = new SuffixArray(text1 + "#" + text2);
    int N = sa.length();

    String substring = "";
    for (int i = 1; i < N; i++)
    {

        // adjacent suffixes both from second text string
        if (sa.select(i).length() <= N2 && sa.select(i - 1).length() <= N2)
        {
            continue;
        }

        // adjacent suffixes both from first text string
        if (sa.select(i).length() > N2 + 1 && sa.select(i - 1).length() > N2 + 1)
        {
            continue;
        }

        // check if adjacent suffixes longer common substring
        int length = sa.lcp(i);
        if (length > substring.length())
        {
            substring = sa.select(i).toString().substring(0, length);
            System.out.println(substring + " ");
        }
    }

    System.out.println("The length of the substring " + substring.length() + "length on    first N " + N1 + " length of Second N " + N2
            + "The length of the array sa: " + N);
   System.out.println("'" + substring + "'");

   final class MyString implements Comparable<MyString>
{

public MyString(String str)
{
    offset = 0;
    len = str.length();
    arr = str.toCharArray();
}

public int length()
{
    return len;
}

public char charAt(int idx)
{
    return arr[ idx + offset];
}

public int compareTo(MyString other)
{
    int myEnd = offset + len;
    int yourEnd = other.offset + other.len;
    int i = offset, j = other.offset;

    for (; i < myEnd && j < yourEnd; i++, j++)
    {
        if (arr[ i] != arr[ j])
        {
            return arr[ i] - arr[ j];
        }
    }

    // reached end.  Who got there first?
    if (i == myEnd && j == yourEnd)
    {
        return 0;   // identical strings
    }
    if (i == myEnd)
    {
        return -1;
    } else
    {
        return +1;
    }
}


public MyString substring(int beginIndex, int endIndex)
{
    return new MyString(arr, beginIndex + offset, endIndex - beginIndex);
}

public MyString substring(int beginIndex)
{
    return substring(beginIndex, offset + len);
}

public boolean equals(Object other)
{
    return (other instanceof MyString) && compareTo((MyString) other) == 0;
}

public String toString()
{
    return new String(arr, offset, len);
}

private MyString(char[] a, int of, int ln)
{
    arr = a;
    offset = of;
    len = ln;
}
private char[] arr;
private int offset;
private int len;
}

+3

java

user1736164 23 jan. 13 at 23:34

source to share

4 answers

femtoRgon · Answer 1 · 2013-01-23T23:48:21+0000

Here:

for (int i = 0; i < N; i++)
{
    suffixes[i] = snew.substring(i);
}

You are trying to keep not only the whole long string, but the whole string - 1 letter, and the whole string - 2 letters, etc. They are all kept separately.

If your string contains only 10 letters, you should store 55 characters in total in 10 different lines.

In 1000 characters, you store 500500 characters.

More generally, you will have to handle length * (length + 1) / 2 characters.

Just for fun, I don't know how many characters are in War and Peace, but with around 1250 pages, the typical word / page estimate is 250, and the average word is about 5 characters long -:

(1250 * 250 * 5) * (1250 * 250 * 5 + 1) / 2 = 1.2207039 * 10 ^ 12 characters.

The size of a char in memory is 2 bytes, so you are looking at about 2.22TB (versus 1.49MB for the novel text only).

Lee meador · Answer 2 · 2013-01-23T23:45:57+0000

I count at least 3 copies of both texts in the first few lines of code. Here are some ideas

convert spaces as you read each line, not after they are huge lines. Don't forget about the cases of spaces at the front and end of the line.
create your class MyString using StringBuilder as base instead of String. Do whatever you are looking for inside StringBuilder with its own methods if you can.
don't fetch more lines than you need.

Have a look at the -Xmx java runtime option and set the heap larger than the default. You will have to do this because I have not memorized it. Just note that -Xmx = 1024M needs an M at the end. (Look at the file size to see how large the two books are.)

Patashu · Answer 3 · 2013-01-23T23:57:19+0000

When you create MyString, you call arr = str.toCharArray();

, which creates a new copy of the string character data. But in Java, a string is immutable - so why not keep a reference to the string instead of a copy of its data?

You build each suffix at once, but you only refer to one (well, two) at a time. If you recode your solution to only refer to the suffixes it currently cares about and only construct them when they need them (and lose reference to them afterwards), they can be garbage collected in Java. This will reduce the chance of running out of memory. Compare the memory overhead of storing 2 rows to store hundreds of thousands of rows :)

SpiderPig · Answer 4 · 2013-01-24T05:18:44+0000

I wrote this program in Scala. Maybe you can translate it to Java.

class MyString private (private val string: String, startIndex: Int, endIndex: Int) extends Comparable[MyString] {
  def this(string: String) = this(string, 0, string.length)
  def length() = endIndex-startIndex
  def charAt(i: Int) = {
    if(i >= length) throw new IndexOutOfBoundsException
    string.charAt(startIndex + i)
  }
  def substring(start: Int, end: Int): MyString = {
    if(start < 0 || end > length || end < start) throw new IndexOutOfBoundsException
    new MyString(string, startIndex + start, startIndex + end)
  }
  def substring(start: Int): MyString = substring(start, length)
  def longestCommonSubstring(other: MyString): MyString = {
    var index = 0
    val len = math.min(length, other.length)
    while(index < len && charAt(index) == other.charAt(index)) index += 1
    substring(0, index)
  }
  def compareTo(other: MyString): Int = {
    val len = math.min(length, other.length)
    for(i <- 0 until len) {
      if(charAt(i) > other.charAt(i)) return 1
      if(charAt(i) < other.charAt(i)) return -1
    }
    length-other.length
  }
  def >(other: MyString) = compareTo(other) > 0
  def <(other: MyString) = compareTo(other) < 0
  override def equals(other: Any) = other.isInstanceOf[MyString] && compareTo(other.asInstanceOf[MyString]) == 0
  override def toString() = "\"" + string.substring(startIndex, endIndex) + "\""
}

def readFile(name: String) = new MyString(io.Source.fromFile(name).getLines.mkString(" ").replaceAll("\\s+", " "))
def makeList(str: MyString) = (0 until str.length).map(i => str.substring(i)).toIndexedSeq

val string1 = readFile("WarAndPeace.txt")
val string2 = readFile("MobyDick.txt")

val (list1, list2) = (makeList(string1).sorted, makeList(string2).sorted)

var longestMatch = new MyString("")
var (index1, index2) = (0,0)
while(index1 < list1.size && index2 < list2.size) {
  val lcs = list1(index1).longestCommonSubstring(list2(index2)) 
  if(lcs.length > longestMatch.length) longestMatch = lcs
  if(list1(index1) < list2(index2)) index1 += 1
  else index2 += 1
}

println(longestMatch)

Longest common substring in large text

More articles: