muMLE/pattern_matching/patternMatching.py

603 lines
29 KiB
Python

# coding: utf-8
"""
Author: Sten Vercamman
University of Antwerp
Example code for paper: Efficient model transformations for novices
url: http://msdl.cs.mcgill.ca/people/hv/teaching/MSBDesign/projects/Sten.Vercammen
The main goal of this code is to give an overview, and an understandable
implementation, of known techniques for pattern matching and solving the
sub-graph homomorphism problem. The presented techniques do not include
performance adaptations/optimizations. It is not optimized to be efficient
but rather for the ease of understanding the workings of the algorithms.
The paper does list some possible extensions/optimizations.
It is intended as a guideline, even for novices, and provides an in-depth look
at the workings behind various techniques for efficient pattern matching.
"""
import collections
import itertools
class PatternMatching(object):
    """
    Find occurrences of a pattern graph inside a host graph.

    Graphs are duck-typed. The code only relies on:

    * ``graph.vertices`` / ``graph.edges``: dicts mapping a type to a
      collection of vertices / edges of that type;
    * vertices exposing ``type``, ``incoming_edges`` and ``outgoing_edges``;
    * edges exposing ``type``, ``src`` and ``tgt``.

    Vertices and edges must be hashable, as they are used as dict keys and
    set members.
    """

    def __init__(self, optimize=True):
        """
        optimize: when True, createAdjacencyMatrixMap only considers host
        vertices whose type also occurs in the pattern.
        """
        self.optimize = optimize

    def matchNaive(self, pattern, vertices, edges, pattern_vertices=None):
        """
        Try to find an occurrence of the pattern in the Graph naively.

        pattern:          the pattern graph.
        vertices:         host vertices to match against ({type: vertices}).
        edges:            host edges to match against ({type: edges}).
        pattern_vertices: optional subset of the pattern vertices
                          ({type: vertices}) to match; defaults to
                          pattern.vertices. Pattern vertices outside of this
                          subset are treated as trivially matched.

        Returns a tuple (vertex_binding, edge_binding), mapping pattern
        vertices to host vertices and pattern edges to host edges, or None
        when no occurrence was found.
        """
        # allow call with specific arguments
        if pattern_vertices is None:
            pattern_vertices = pattern.vertices

        # Bindings built up during the search; shared by the helpers below.
        visited_p_vertices = {}     # pattern vertex -> host vertex
        visited_p_edges = {}        # pattern edge -> host edge
        visited_g_vertices = set()  # host vertices already bound
        visited_g_edges = set()     # host edges already bound

        def visitEdge(p_edge, inc, g_edges):
            """
            Visit a pattern edge, and try to bind it to one of the given
            host edges. (If the first fails, try the second, and so on...)
            """
            for g_edge in g_edges:
                # only reckon the edge if it is in edges and not visited
                # (as the graph might be a subgraph of a more complex graph)
                if g_edge not in edges.get(g_edge.type, []) or g_edge in visited_g_edges:
                    continue
                if g_edge.type != p_edge.type:
                    continue
                visited_p_edges[p_edge] = g_edge
                visited_g_edges.add(g_edge)
                # continue with the pattern vertex on the other side
                p_vertex = p_edge.src if inc else p_edge.tgt
                if visitVertices(p_vertex):
                    return True
                # remove added edge as it leads to no match, retry with others
                del visited_p_edges[p_edge]
                visited_g_edges.remove(g_edge)
            # no edge leads to a positive match
            return False

        def unbindEdges(bound_p_edges):
            """
            Undo the bindings of the given pattern edges (the current
            bindings do not lead to an occurrence of the pattern).
            """
            for p_edge in bound_p_edges:
                visited_g_edges.remove(visited_p_edges.pop(p_edge))

        def visitEdges(p_edges, inc, g_edges):
            """
            Visit all given edges of a pattern vertex.
            Every permutation is tried, as binding the edges in one order
            might not result in an occurrence of the pattern while another
            order might.
            """
            found_all = True
            for ordering in itertools.permutations(p_edges):
                newly_bound = []
                found_all = True
                for p_edge in ordering:
                    if visited_p_edges.get(p_edge) is None:
                        edge_ok = visitEdge(p_edge, inc, g_edges)
                        if edge_ok:
                            # remember it, so it can be undone if a later
                            # edge of this ordering fails
                            newly_bound.append(p_edge)
                    else:
                        # The pattern edge is already bound. Make sure the
                        # host edge it is bound to is really attached to the
                        # host vertex bound to this side of the pattern edge,
                        # otherwise the graph is not properly connected.
                        g_edge = visited_p_edges[p_edge]
                        if inc:
                            edge_ok = g_edge in visited_p_vertices[p_edge.tgt].incoming_edges
                        else:
                            edge_ok = g_edge in visited_p_vertices[p_edge.src].outgoing_edges
                    if not edge_ok:
                        # undo the edges added for this ordering
                        # (the failing edge itself was not added)
                        unbindEdges(newly_bound)
                        found_all = False
                        break  # try other order
                # all edges are good, look no further
                if found_all:
                    break
            return found_all

        def visitVertex(p_vertex, g_vertex):
            """
            Try to bind the pattern vertex to the host vertex. The binding
            is successful if all incoming and outgoing edges of the pattern
            vertex can be bound.
            """
            if g_vertex in visited_g_vertices:
                return False
            visited_g_vertices.add(g_vertex)
            visited_p_vertices[p_vertex] = g_vertex
            if (visitEdges(p_vertex.incoming_edges, True, g_vertex.incoming_edges)
                    and visitEdges(p_vertex.outgoing_edges, False, g_vertex.outgoing_edges)):
                return True
            # cleanup, this binding does not lead to an occurrence
            visited_g_vertices.remove(g_vertex)
            del visited_p_vertices[p_vertex]
            return False

        def visitVertices(p_vertex):
            """
            Visit a pattern vertex and try to bind a host vertex to it.
            """
            # already matched, or not part of the pattern_vertices
            # (the latter is for matching the pattern partially)
            if (visited_p_vertices.get(p_vertex) is not None
                    or p_vertex not in pattern_vertices.get(p_vertex.type, set())):
                return True
            # try host vertices of the same type as the pattern vertex
            for g_vertex in vertices.get(p_vertex.type, []):
                if g_vertex not in visited_g_vertices:
                    if visitVertex(p_vertex, g_vertex):
                        return True
            return False

        all_p_vertices = []
        for p_vertices in pattern_vertices.values():
            all_p_vertices.extend(p_vertices)

        # Every vertex has to be visited explicitly, as the pattern may
        # consist of multiple structures that are not connected.
        for ordering in itertools.permutations(all_p_vertices):
            if all(visitVertices(p_vertex) for p_vertex in ordering):
                return (visited_p_vertices, visited_p_edges)
            # reset visited before trying the next ordering
            visited_p_vertices.clear()
            visited_p_edges.clear()
            visited_g_vertices.clear()
            visited_g_edges.clear()
        return None

    def createAdjacencyMatrixMap(self, graph, pattern):
        """
        Return adjacency matrix and the order of the vertices.

        The result is a tuple (AM, vertices_order) where AM[i][j] is True
        when there is an edge from vertices_order[i] to vertices_order[j].

        When self.optimize is set, only vertices of the graph whose type is
        present in the pattern are used, and False is returned instead of a
        tuple when the pattern contains a vertex type that the graph does
        not have (the pattern can then never be found).
        """
        # contains all vertices we'll use for the adjacency matrix
        all_vertices = []
        if self.optimize:
            for vertex_type in pattern.vertices:
                graph_vertices = graph.vertices.get(vertex_type)
                if graph_vertices is None:
                    # a type of the pattern is not present in the host graph
                    return False
                all_vertices.extend(graph_vertices)
        else:
            # insert all vertices from the graph
            for graph_vertices in graph.vertices.values():
                all_vertices.extend(graph_vertices)

        # squared zero matrix: { vertex: (index, [has edge from vertex to pos?]) }
        matrix = collections.OrderedDict()
        size = len(all_vertices)
        for index, vertex in enumerate(all_vertices):
            matrix[vertex] = (index, [False] * size)

        for typed_edges in graph.edges.values():
            for edge in typed_edges:
                if self.optimize and (edge.tgt not in matrix or edge.src not in matrix):
                    # skip the edge if its target or source type is not
                    # present in the pattern (and therefore not in the matrix)
                    continue
                column = matrix[edge.tgt][0]
                matrix[edge.src][1][column] = True

        AM = [row for _, row in matrix.values()]
        vertices_order = list(matrix)
        return AM, vertices_order

    def matchVF2(self, pattern, graph):
        """
        Generator yielding the occurrences of the pattern in the graph,
        searching in the candidate order of the VF2 algorithm.

        Every yielded item is a (vertex_binding, edge_binding) tuple as
        returned by matchNaive. Nothing is yielded when
        createAdjacencyMatrixMap reports (by returning False) that the
        pattern cannot occur in the graph.
        """
        class VF2_Obj(object):
            """
            Structure for keeping the VF2 data.
            """
            def __init__(self, len_graph_vertices, len_pattern_vertices):
                # represents if the n-th element (h[n] or p[n]) is matched
                self.host_vtx_is_matched = [False] * len_graph_vertices
                self.pattern_vtx_is_matched = [False] * len_pattern_vertices
                # mapping from host vertex to the pattern vertex bound to it
                self.mapping = {}
                self.edge_mapping = {}
                # The N_* lists hold, per vertex, the search depth at which
                # it became adjacent to the matched vertices (-1: it is not).
                # preference lvl 1: adjacent via an outgoing edge
                self.N_out_graph = [-1] * len_graph_vertices
                self.N_out_pattern = [-1] * len_pattern_vertices
                # preference lvl 2: adjacent via an incoming edge
                self.N_inc_graph = [-1] * len_graph_vertices
                self.N_inc_pattern = [-1] * len_pattern_vertices
                # preference lvl 3: not in the above

        # create adjacency matrix of the graph
        graph_matrix = self.createAdjacencyMatrixMap(graph, pattern)
        if graph_matrix is False:
            # the pattern needs a vertex type the graph lacks: no matches
            return
        # create adjacency matrix of the pattern
        pattern_matrix = self.createAdjacencyMatrixMap(pattern, pattern)
        if pattern_matrix is False:
            return
        H, h = graph_matrix
        P, p = pattern_matrix

        VF2_obj = VF2_Obj(len(h), len(p))

        # Only for debugging:
        self.indent = 0
        self.reverseMapH = {h[i]: i for i in range(len(h))}
        self.reverseMapP = {p[i]: i for i in range(len(p))}

        # Set of partial matches (frozensets of (host vertex, pattern vertex)
        # pairs) already explored - prevents us from producing the same
        # match multiple times.
        self.alreadyVisited = set()

        def addOutNeighbours(neighbours, N, index_M):
            """
            Given outgoing neighbours (a row from an adjacency matrix),
            label the ones not labelled yet with index_M.
            """
            for neighbour_index, is_neighbour in enumerate(neighbours):
                if is_neighbour and N[neighbour_index] == -1:
                    N[neighbour_index] = index_M

        def addIncNeighbours(G, j, N, index_M):
            """
            Given the adjacency matrix G and the column j, label the
            vertices with an edge to vertex j that are not labelled yet
            with index_M.
            """
            for i, row in enumerate(G):
                if row[j] and N[i] == -1:
                    N[i] = index_M

        def delNeighbours(N, index_M):
            """
            Remove the labels that were added at index_M (backtracking).
            """
            for i, added_at in enumerate(N):
                if added_at == index_M:
                    N[i] = -1

        def countExtEdges(adjacency, is_matched, N_out, N_inc):
            """
            Count the edges used by the pruning step of feasibilityTest.
            Returns the two counters (ca, an).
            """
            ext_edges_ca = 0
            ext_edges_an = 0
            for x in range(len(is_matched)):
                for y in range(len(adjacency)):
                    if not adjacency[x][y]:
                        continue
                    if (N_out[y] != -1 or N_inc[y] != -1) and is_matched[y]:
                        # NOTE(review): is_matched holds booleans, so this
                        # comparison with -1 is always true and the second
                        # counter never increases. Kept as-is on purpose:
                        # changing it alters which candidates get pruned.
                        if is_matched[x] != -1:
                            ext_edges_ca += 1
                        else:
                            ext_edges_an += 1
            return ext_edges_ca, ext_edges_an

        def feasibilityTest(n, m):
            """
            Check whether binding host vertex h[n] to pattern vertex p[m]
            is consistent with the current partial mapping.

            Returns the result of matchNaive on the vertices involved, or
            False when the pair is rejected.
            """
            # n and m, together with their already matched neighbours
            neighbours_graph = {h[n].type: set([h[n]])}
            neighbours_pattern = {p[m].type: set([p[m]])}
            for i in range(len(P)):
                if (P[m][i] or P[i][m]) and VF2_obj.pattern_vtx_is_matched[i]:
                    neighbours_pattern.setdefault(p[i].type, set()).add(p[i])
            for i in range(len(H)):
                if (H[n][i] or H[i][n]) and VF2_obj.host_vtx_is_matched[i]:
                    neighbours_graph.setdefault(h[i].type, set()).add(h[i])

            # take a coding shortcut: use matchNaive to see if it is feasible
            matched = self.matchNaive(
                pattern,
                pattern_vertices=neighbours_pattern,
                vertices=neighbours_graph,
                edges=graph.edges,
            )
            if matched is None:
                return False

            graph_ca, graph_an = countExtEdges(
                H, VF2_obj.host_vtx_is_matched,
                VF2_obj.N_out_graph, VF2_obj.N_inc_graph)
            pattern_ca, pattern_an = countExtEdges(
                P, VF2_obj.pattern_vtx_is_matched,
                VF2_obj.N_out_pattern, VF2_obj.N_inc_pattern)
            # the pattern may not need more of these edges than the graph
            # offers, otherwise we won't find an occurrence
            if pattern_ca > graph_ca:
                return False
            if pattern_an > graph_an:
                return False
            return matched

        def matchPhase(index_M, n, m):
            """
            The matching phase of the VF2 algorithm. If the chosen (n, m)
            pair passes the feasibilityTest, the pair gets added and the
            search continues one level deeper. The pair is always removed
            again afterwards, so that all occurrences are enumerated; for
            that reason this always returns False.
            """
            candidate = frozenset(itertools.chain(
                VF2_obj.mapping.items(),
                [(h[n], p[m])],
            ))
            if candidate in self.alreadyVisited:
                return False  # already visited this (partial) match -> skip

            if feasibilityTest(n, m) is False:
                return False

            # adapt VF2_obj
            VF2_obj.host_vtx_is_matched[n] = True
            VF2_obj.pattern_vtx_is_matched[m] = True
            VF2_obj.mapping[h[n]] = p[m]
            addOutNeighbours(H[n], VF2_obj.N_out_graph, index_M)
            addIncNeighbours(H, n, VF2_obj.N_inc_graph, index_M)
            addOutNeighbours(P[m], VF2_obj.N_out_pattern, index_M)
            addIncNeighbours(P, m, VF2_obj.N_inc_pattern, index_M)

            if index_M > 0:
                # remember our partial match so we don't visit it again
                self.alreadyVisited.add(frozenset(VF2_obj.mapping.items()))

            self.indent += 1
            yield from findM(index_M + 1)
            self.indent -= 1

            # backtrack, adapt VF2_obj
            VF2_obj.host_vtx_is_matched[n] = False
            VF2_obj.pattern_vtx_is_matched[m] = False
            del VF2_obj.mapping[h[n]]
            delNeighbours(VF2_obj.N_out_graph, index_M)
            delNeighbours(VF2_obj.N_inc_graph, index_M)
            delNeighbours(VF2_obj.N_out_pattern, index_M)
            delNeighbours(VF2_obj.N_inc_pattern, index_M)
            return False

        def preferred(index_M, N_graph, N_pattern):
            """
            Try to match the vertices adjacent to the matched vertices via
            outgoing or incoming edges (depending on what is given for
            N_graph and N_pattern).
            """
            for n in range(len(N_graph)):
                # skip host vertices that are not adjacent (or already matched)
                if N_graph[n] == -1 or VF2_obj.host_vtx_is_matched[n]:
                    continue
                for m in range(len(N_pattern)):
                    # skip pattern vertices that are not adjacent
                    # (or already matched)
                    if N_pattern[m] == -1 or VF2_obj.pattern_vtx_is_matched[m]:
                        continue
                    matched = yield from matchPhase(index_M, n, m)
                    if matched:
                        return True
            return False

        def leastPreferred(index_M):
            """
            Try to match the vertices that are not connected to the
            currently matched vertices.
            """
            for n in range(len(VF2_obj.N_out_graph)):
                # skip connected (or already matched) host vertices
                if (VF2_obj.N_out_graph[n] != -1 or VF2_obj.N_inc_graph[n] != -1
                        or VF2_obj.host_vtx_is_matched[n]):
                    continue
                for m in range(len(VF2_obj.N_out_pattern)):
                    # skip connected (or already matched) pattern vertices
                    if (VF2_obj.N_out_pattern[m] != -1 or VF2_obj.N_inc_pattern[m] != -1
                            or VF2_obj.pattern_vtx_is_matched[m]):
                        continue
                    matched = yield from matchPhase(index_M, n, m)
                    if matched:
                        return True
            return False

        def findM(index_M=0):
            """
            Extend the current mapping, which holds index_M pairs.
            VF2's order is to first try the vertices adjacent via outgoing
            edges, then those adjacent via incoming edges and then those
            that are not connected to the currently matched vertices.
            """
            # We are at the end, we found a candidate.
            if index_M == len(p):
                bound_graph_vertices = {}
                for vertex_bound in VF2_obj.mapping:
                    bound_graph_vertices.setdefault(vertex_bound.type, set()).add(vertex_bound)
                result = self.matchNaive(pattern, vertices=bound_graph_vertices, edges=graph.edges)
                if result is not None:
                    yield result
                return result is not None

            if index_M > 0:
                # first the adjacent vertices connected via outgoing edges
                matched = yield from preferred(index_M, VF2_obj.N_out_graph, VF2_obj.N_out_pattern)
                if matched:
                    return True
                # then the adjacent vertices connected via incoming edges
                matched = yield from preferred(index_M, VF2_obj.N_inc_graph, VF2_obj.N_inc_pattern)
                if matched:
                    return True

            # and lastly, the vertices not connected to the matched vertices
            matched = yield from leastPreferred(index_M)
            if matched:
                return True
            return False

        yield from findM()