Pythonic way to get case sensitive path?

Question

Pythonic way to get case sensitive path?

I was wondering if there is a faster way to implement a function that returns a case-sensitive path in Python. I came up with one solution that works on both Linux and Windows, but it has to call os.listdir for every path component, which can be slow.

This solution is great for an application and context that doesn't require a lot of speed:

def correctPath(start, path):
    """Return the on-disk spelling of ``path`` relative to ``start``.

    Each component of ``path`` is looked up in the listing of the directory
    resolved so far. An exact-case entry is preferred; otherwise the first
    entry that matches case-insensitively is used and counted as a
    correction.

    Args:
        start: Directory the relative ``path`` is resolved against.
        path: Relative path whose components may be wrongly cased. Both
            ``/`` and ``\\`` are accepted as separators; leading, trailing
            and repeated separators are ignored.

    Returns:
        A ``(corrected_path, corrections)`` tuple, where ``corrected_path``
        is joined with ``os.path.join`` (so it uses the platform separator)
        and ``corrections`` is the number of components whose case was
        changed. ``None`` if some component has no case-insensitive match.
        An empty ``path`` yields ``('', 0)``.

    Raises:
        OSError: Propagated from ``os.listdir`` when a directory that has to
            be listed is missing or is not a directory.
    """
    try:
        text_type = unicode  # noqa: F821 -- Python 2
    except NameError:
        text_type = str  # Python 3: str is already a unicode string

    current_dir = text_type(start)

    # Treat both separators alike; the previous code stripped a trailing '/'
    # but then split only on '\\', so forward-slash paths were never split.
    normalised = text_type(path).replace('\\', '/')
    parts = [part for part in normalised.split('/') if part]

    corrected = ''
    corrections = 0
    for part in parts:
        listing = os.listdir(current_dir)
        if part in listing:
            # Exact match first, so that on a case-sensitive file system a
            # sibling differing only by case cannot shadow the right entry.
            real_name = part
        else:
            wanted = part.lower()
            real_name = next(
                (name for name in listing if name.lower() == wanted), None)
            if real_name is None:
                return None
            corrections += 1
        current_dir = os.path.join(current_dir, real_name)
        corrected = os.path.join(corrected, real_name)

    return corrected, corrections  # (corrected path, number of corrections)

>>> correctPath('C:\\Windows', 'SYSTEM32\\CmD.EXe')
(u'System32\\cmd.exe', 2)

However, this will not be fast enough when the context is collecting filenames from a large database with more than 50,000 entries.

One way would be to cache the directory structure in a tree of dicts: look up each directory part of the path in the tree and, if a key is missing, run os.listdir to find the entry and create a dict for the new directory. Unused entries could then be removed, or each directory could carry a counter that acts as its "lifetime".

+3

python linux windows directory path

chaz 08 Feb 13 at 5:12 am

source to share

2 answers

mfitzp · Answer 1 · 2013-04-13T23:45:26+0000

Below is a small rewrite of your code with three modifications: checking whether the filename is already correct before matching, lowercasing the whole listing once before testing, and using the index of the match to find the "true case" filename.

def corrected_path(start, path):
    """Return the on-disk spelling of ``path`` relative to ``start``.

    Each component is first checked for an exact match in the listing of
    the directory resolved so far; only if that fails is a case-insensitive
    match searched for and counted as a correction.

    The "already correct" test uses the directory listing rather than
    ``os.path.exists``: on case-insensitive file systems the existence test
    succeeds for any casing, so nothing would ever be corrected.

    Args:
        start: Directory the relative ``path`` is resolved against.
        path: Relative path whose components may be wrongly cased. Both
            ``/`` and ``\\`` are accepted as separators; leading, trailing
            and repeated separators are ignored.

    Returns:
        A ``(corrected_path, corrections_count)`` tuple, with the path
        joined by ``os.path.join``. ``False`` if a component cannot be
        found. An empty ``path`` yields ``('', 0)``.

    Raises:
        OSError: Propagated from ``os.listdir`` when a directory that has to
            be listed is missing or is not a directory.
    """
    try:
        text_type = unicode  # noqa: F821 -- Python 2
    except NameError:
        text_type = str  # Python 3: str is already a unicode string

    cd = text_type(start)

    # Accept either separator; the previous code stripped a trailing '/'
    # but split only on '\\'.
    parts = [p for p in text_type(path).replace('\\', '/').split('/') if p]

    relative = ''
    corrections_count = 0

    for p in parts:
        if p in (os.curdir, os.pardir):
            # '.' and '..' never appear in a listing; pass them through
            # unchanged, as the old existence test did.
            real_name = p
        else:
            listing = os.listdir(cd)
            if p in listing:
                real_name = p  # already correctly cased
            else:
                cip = p.lower()
                real_name = next(
                    (name for name in listing if name.lower() == cip), None)
                if real_name is None:
                    return False  # Error, this path element isn't found
                corrections_count += 1

        cd = os.path.join(cd, real_name)
        relative = os.path.join(relative, real_name)

    return relative, corrections_count

I'm not sure if this will be much faster, although it performs slightly fewer comparisons, and the "already correct" check at the beginning may help.

mfitzp · Answer 2 · 2013-04-14T16:04:46+0000

Extended version with case-insensitive caching to pull out the corrected path:

import os,re

def corrected_paths(start, pathlist):
    """Resolve the on-disk casing of several paths below ``start``.

    Taking a list of paths allows work to be shared between them:

    * ``cache`` maps a lowercased, uncorrected path prefix to its corrected
      form, so a later path that shares the prefix resumes from there.
    * ``listings`` keeps each directory listing read during this call, so a
      directory is passed to ``os.listdir`` at most once.

    The "already correct" test uses the directory listing rather than
    ``os.path.exists``, because on case-insensitive file systems the
    existence test succeeds for any casing.

    Args:
        start: Directory every entry of ``pathlist`` is resolved against.
        pathlist: Iterable of relative paths. Both ``/`` and ``\\`` are
            accepted as separators; leading, trailing and repeated
            separators are ignored.

    Returns:
        A ``(corrected_path_list, corrections_count)`` tuple. Each corrected
        path includes ``start``. ``corrections_count`` counts the components
        that were re-cased while walking; components served from ``cache``
        are not counted again. ``False`` (after printing an error) as soon
        as one component cannot be found.

    Raises:
        OSError: Propagated from ``os.listdir`` when a directory that has to
            be listed is missing or is not a directory.
    """
    try:
        text_type = unicode  # noqa: F821 -- Python 2
    except NameError:
        text_type = str  # Python 3: str is already a unicode string

    start = text_type(start)

    # Runs of either separator. No capturing group: re.split would otherwise
    # return the separators as parts, and a '/' part resets os.path.join to
    # the file-system root.
    path_split = re.compile(r'[\\/]+')

    # Lowercased uncorrected prefix -> corrected prefix (both include start).
    cache = {}
    # Directory -> (names, set of names, {lowercased name: first real name}).
    listings = {}
    corrected_path_list = []
    corrections_count = 0

    for path in pathlist:
        parts = [p for p in path_split.split(text_type(path)) if p]
        cd = start

        # Pre-process against the cache, longest prefix first.
        for end in range(len(parts), 0, -1):
            uncorrected_path = os.path.join(start, *parts[:end]).lower()
            if uncorrected_path in cache:
                # Cached values already contain start; joining them onto
                # start again would double the prefix when start is relative.
                cd = cache[uncorrected_path]
                parts = parts[end:]  # Retrieve the unmatched segment
                break

        # Fall back to walking from the deepest point found so far.
        for p in parts:
            if p in (os.curdir, os.pardir):
                # '.' and '..' never appear in a listing; pass them through
                # unchanged, as the old existence test did.
                cd = os.path.join(cd, p)
                continue

            if cd not in listings:
                names = os.listdir(cd)
                by_lower = {}
                for name in names:
                    # First entry wins, like list.index() in the old code.
                    by_lower.setdefault(name.lower(), name)
                listings[cd] = (names, frozenset(names), by_lower)
            names, exact_names, by_lower = listings[cd]

            if p in exact_names:
                cd = os.path.join(cd, p)  # already correctly cased
                continue

            cip = p.lower()
            if cip not in by_lower:
                cilisting = [name.lower() for name in names]
                print("Error %s not in folder %s" % (cip, cilisting))
                return False  # Error, this path element isn't found

            real_name = by_lower[cip]
            # Store the path correction in the cache for the next paths.
            cache[os.path.join(cd, p).lower()] = os.path.join(cd, real_name)
            cd = os.path.join(cd, real_name)
            corrections_count += 1

        corrected_path_list.append(cd)

    return corrected_path_list, corrections_count

In the example run for a set of paths, this greatly reduces the number of listdirs (this obviously depends on how similar your paths are):

corrected_paths('/Users/', ['mxF793/ScRiPtS/meTApaTH','mxF793/ScRiPtS/meTApaTH/metapAth/html','mxF793/ScRiPtS/meTApaTH/metapAth/html/css','mxF793/ScRiPts/PuBfig'])
([u'/Users/mxf793/Scripts/metapath', u'/Users/mxf793/Scripts/metapath/metapath/html', u'/Users/mxf793/Scripts/metapath/metapath/html/css', u'/Users/mxf793/Scripts/pubfig'], 14)
([u'/Users/mxf793/Scripts/metapath', u'/Users/mxf793/Scripts/metapath/metapath/html', u'/Users/mxf793/Scripts/metapath/metapath/html/css', u'/Users/mxf793/Scripts/pubfig'], 5)

Along the way I realized that on Mac OS X Python's os.path.exists treats paths case-insensitively, so the existence test always succeeds. In that case, the listdir-based check (the commented-out alternative in the code) can be used in its place.

Pythonic way to get case sensitive path?

More articles: