muMLE/pattern_matching/planGraph.py

528 lines
No EOL
20 KiB
Python

# coding: utf-8
"""
Author: Sten Vercamman
University of Antwerp
Example code for paper: Efficient model transformations for novices
url: http://msdl.cs.mcgill.ca/people/hv/teaching/MSBDesign/projects/Sten.Vercammen
The main goal of this code is to give an overview, and an understandable
implementation, of known techniques for pattern matching and solving the
sub-graph homomorphism problem. The presented techniques do not include
performance adaptations/optimizations. It is not optimized to be efficient
but rather for the ease of understanding the workings of the algorithms.
The paper does list some possible extensions/optimizations.
It is intended as a guideline, even for novices, and provides an in-depth look
at the workings behind various techniques for efficient pattern matching.
"""
from searchGraph import *
from enum import *
# Enum of all primitive operation types; every PlanGraph edge is labelled
# with exactly one of them.
# Note: `inc` stands for the primitive operation "in"
# (`in` itself is a reserved keyword in Python and cannot be used as a name).
PRIM_OP = Enum(['lkp', 'inc', 'out', 'src', 'tgt'])
class PlanGraph(object):
    """
    Holds the PlanGraph for a pattern.
    Can create the search plan of the pattern for a given SearchGraph.
    """

    def __init__(self, pattern):
        """
        Build the PlanGraph of the given pattern.

        Every vertex and every edge of the pattern becomes a plan vertex.
        Plan edges, labelled with a PRIM_OP, are added between them:
          - tgt / inc between an edge element and its target vertex,
          - src / out between an edge element and its source vertex,
          - lkp from the root to every plan vertex.

        pattern: the pattern to build the plan for, must be a Graph.
        Raises TypeError if pattern is not a Graph.
        """
        if not isinstance(pattern, Graph):
            raise TypeError('PlanGraph expects the pattern to be a Graph')

        # member variables:
        self.vertices = []  # will not be searched in
        self.edges = []     # will not be searched in

        # representation map, maps vertex from pattern to element from
        # PlanGraph (no need for edges)
        repr_map = {}

        # 1.1: for every vertex in the pattern graph,
        # create a vertex representing the pattern element
        for str_type, vertices in pattern.vertices.items():
            for vertex in vertices:
                # we only need to know the type of the vertex
                plan_vertex = Vertex(str_type)
                # and we need to know that it was a vertex
                plan_vertex.is_vertex = True
                # for re-linking the edges, we'll need to map the
                # vertex of the pattern to the plan_vertex
                repr_map[vertex] = plan_vertex
                self.vertices.append(plan_vertex)

        # 1.2: for every edge in the pattern graph,
        # create a vertex representing the pattern element
        for edges in pattern.edges.values():
            for edge in edges:
                # we only need to know the type of the edge
                plan_vertex = Vertex(edge.type)
                # and we need to know that it was an edge
                plan_vertex.is_vertex = False
                self.vertices.append(plan_vertex)

                # 4: for every element x from the PlanGraph
                # that represents an edge e in the pattern:
                # 4.1: create an edge labelled tgt from x to the plan vertex
                # representing the target vertex of e, and a reverted edge
                # labelled in (inc).
                # tgt and src cost are always 1, we use logarithmic cost
                # (=> cost = ln(1) = 0.0) so that we do not need to minimize
                # a product, but can minimize a sum
                # (as ln(c1...ck) = ln(c1) + ... + ln(ck))
                tgt_vertex = repr_map[edge.tgt]
                self._addPlanEdge(plan_vertex, tgt_vertex, PRIM_OP.tgt, 0.0)
                self._addPlanEdge(tgt_vertex, plan_vertex, PRIM_OP.inc)

                # 4.2: create an edge labelled src from x to the plan vertex
                # representing the source vertex of e, and a reverted edge
                # labelled out
                src_vertex = repr_map[edge.src]
                self._addPlanEdge(plan_vertex, src_vertex, PRIM_OP.src, 0.0)
                self._addPlanEdge(src_vertex, plan_vertex, PRIM_OP.out)

        # 2: create a root vertex (it is not added to self.vertices)
        self.root = Vertex('root')

        # 3: for each element in the PlanGraph (that is not the root vertex),
        # create an edge from the root to it, and label it lkp
        for vertex in self.vertices:
            self._addPlanEdge(self.root, vertex, PRIM_OP.lkp)

    def _addPlanEdge(self, src, tgt, label, cost=None):
        """
        Create a plan edge from src to tgt, wire it up and store it.

        label: the PRIM_OP this edge stands for.
        cost:  constant cost of the edge; when None, no cost is assigned
               here (updatePlanCost assigns lkp, inc and out costs later).
        Returns the created edge.
        """
        plan_edge = Edge(src, tgt)
        # backup src and tgt (Edmonds might override it)
        plan_edge.orig_src = plan_edge.src
        plan_edge.orig_tgt = plan_edge.tgt
        plan_edge.label = label
        # link vertices connected to this plan_edge
        plan_edge.src.addOutgoingEdge(plan_edge)
        plan_edge.tgt.addIncomingEdge(plan_edge)
        if cost is not None:
            plan_edge.cost = cost
            # backup orig cost, as Edmonds changes cost
            plan_edge.orig_cost = plan_edge.cost
        # save created edge
        self.edges.append(plan_edge)
        return plan_edge
def updatePlanCost(self, graph):
    """
    Update the cost of every plan edge for the given SearchGraph.

    lkp, in and out costs are queried from the graph; src and tgt costs
    are constant (ln(1) = 0.0). Every edge is first put back on its
    original endpoints, and its orig_cost backup is refreshed, so that
    state left behind by a previous Edmonds run (which rewires endpoints
    and modifies costs) does not leak into this update.

    Returns True if successfully updated cost,
    returns False if a type in the pattern is not in the graph.
    Raises TypeError if graph is not a SearchGraph.
    """
    if not isinstance(graph, SearchGraph):
        raise TypeError('updatePlanCost expects a SearchGraph')

    for edge in self.edges:
        # undo the endpoint rewiring of an earlier Edmonds contraction
        # (no-op when Edmonds has not run yet)
        edge.src = edge.orig_src
        edge.tgt = edge.orig_tgt

        if edge.label == PRIM_OP.lkp:
            edge.cost = graph.getCostLkp(edge.tgt.type, edge.tgt.is_vertex)
            if edge.cost is None:
                print('failed lkp')
                return False
        elif edge.label == PRIM_OP.inc:
            # in(v, e), binds an incoming edge e from an already bound
            # vertex v, depends on the number of incoming edges of type e
            # for the vertex type
            edge.cost = graph.getCostInc(edge.src.type, edge.tgt.type)
            if edge.cost is None:
                print('failed in')
                return False
        elif edge.label == PRIM_OP.out:
            # (analogue for out(v, e))
            edge.cost = graph.getCostOut(edge.src.type, edge.tgt.type)
            if edge.cost is None:
                print('failed out')
                return False
        elif edge.label == PRIM_OP.src or edge.label == PRIM_OP.tgt:
            # constant cost, but Edmonds may have modified it: reset it
            edge.cost = 0.0

        # backup orig cost, as Edmonds changes cost
        edge.orig_cost = edge.cost
    return True
def Edmonds(self, searchGraph):
    """
    Returns the minimum directed spanning tree (MDST)
    for the pattern and the provided graph.
    Returns None if it is impossible to find the pattern in the Graph
    (vertex type or edge type from pattern not in Graph).
    Raises RuntimeError if a plan vertex has no usable incoming edge.
    """
    # update the cost for the PlanGraph
    # (returns False if a type in the pattern can not be found in the graph)
    if not self.updatePlanCost(searchGraph):
        print('type in pattern not found in Graph (in Edmonds)')
        return None

    # Complete Edmonds algorithm has optimization steps:
    # a: remove edges entering the root
    # b: merge parallel edges from same src to same tgt with min weight
    # we can ignore this as:
    # a: the root does not have incoming edges
    # b: the PlanGraph does not have such parallel edges

    # 1: for each node v (other than root), find the incoming edge with
    # the lowest weight and insert it
    pi_v = {}  # {vertex: its minimum incoming edge}
    for plan_vertex in self.vertices:
        min_weight = float('infinity')
        min_edge = None
        for plan_edge in plan_vertex.incoming_edges:
            # strict comparison: the first edge with the lowest cost wins
            if plan_edge.cost < min_weight:
                min_weight = plan_edge.cost
                min_edge = plan_edge
        if min_edge is None:
            raise RuntimeError('baka: no min_edge found')
        # save plan_vertex and its minimum incoming edge
        pi_v[plan_vertex] = min_edge
def getCycle(vertex, reverse_graph, visited):
    """
    Walk from vertex towards the root and report the cycle we run into.

    We walk in reverse order: each vertex has exactly one incoming edge
    in reverse_graph, so we keep stepping to the source of that edge.

    vertex:        the vertex to start walking from.
    reverse_graph: {vertex: incoming edge}, the edge must expose .src.
    visited:       set of already handled vertices; it must contain the
                   root. Every vertex walked on is added to it.

    Returns the set of vertices forming the cycle when the walk runs into
    a vertex of its own path. Returns None when the walk reaches an
    already visited vertex (no new cycle on this path).
    """
    walked = []           # keeps the order in which we walked
    current_path = set()  # same vertices, for fast membership tests

    # we assume root is in visited (it must be in it)
    while vertex not in visited:
        if vertex in current_path:
            # We found a cycle, but the walk may have started on a tail
            # leading into it:
            #
            #     a -> b -> c -> d -> e -> f
            #                    ^         |
            #                    +--- k <--+
            #
            # We visited a, b, c first and k points back to d, so a, b, c
            # are not part of the cycle. The tail was walked first, hence
            # it is the front of walked up to the repeated vertex.
            for tail_part in walked:
                if tail_part != vertex:
                    current_path.remove(tail_part)
                else:
                    break
            visited.update(walked)
            return current_path

        current_path.add(vertex)
        walked.append(vertex)
        # by definition, an MDST only has one incoming edge per vertex
        # so we follow it upwards
        # vertex <--(minimal edge)-- src
        vertex = reverse_graph[vertex].src

    # no cycle found (the current path led to a visited vertex)
    visited.update(walked)
    return None
class VertexGraph(Vertex):
    """
    Acts as a super vertex, holds a subgraph (that is/was once a cycle).
    Used for Edmonds' contraction step.
    The incoming edges are the edges leading to the vertices in the
    VertexGraph (they exclude edges from a vertex in the cycle to
    another vertex in the cycle).
    Analogue for outgoing edges.
    """

    def __init__(self, cycle, reverseGraph):
        """
        Contract cycle into this VertexGraph.

        cycle:        the vertices forming the cycle.
        reverseGraph: {vertex: chosen incoming edge}; the cycle's entries
                      are replaced by a single entry for this VertexGraph.
        Raises RuntimeError if no edge from outside can break the cycle.
        """
        # the type of the super vertex is the concatenation of the types
        # of the vertices it contracts
        str_type = ''.join(str(vertex.type) for vertex in cycle)
        # Call parent class constructor
        Vertex.__init__(self, str_type)

        # member variables:
        self.internalMDST = {}  # {vertex in cycle: its chosen incoming edge}

        minIntWeight = self.findMinIntWeight(cycle, reverseGraph)
        self.updateMinExtEdge(minIntWeight, reverseGraph)

    def findMinIntWeight(self, cycle, reverseGraph):
        """
        Find the smallest cost among the cycle's internal incoming edges.
        Also registers the external incoming and outgoing edges on this
        VertexGraph and saves the internalMDST (currently still a cycle).
        (The VertexGraph is added to the reverseGraph by updateMinExtEdge.)

        Raises TypeError if a vertex's chosen edge starts outside the cycle.
        """
        minIntWeight = float('infinity')

        for vertex in cycle:
            # add incoming edges to this VertexGraph
            for inc_edge in vertex.incoming_edges:
                if inc_edge.src in cycle:
                    # edge from within the cycle
                    minIntWeight = min(minIntWeight, inc_edge.cost)
                else:
                    # edge from outside the cycle
                    self.addIncomingEdge(inc_edge)

            # add outgoing edges to this VertexGraph
            for out_edge in vertex.outgoing_edges:
                if out_edge.tgt not in cycle:
                    # edge leaves the cycle
                    self.addOutgoingEdge(out_edge)
                    # update src to this VertexGraph
                    out_edge.src = self

            # save internal MDST
            min_edge = reverseGraph[vertex]
            if min_edge.src in cycle:
                self.internalMDST[vertex] = min_edge
            else:
                raise TypeError('how is this a cycle')

        return minIntWeight

    def updateMinExtEdge(self, minIntWeight, reverseGraph):
        """
        Modifies the cost of the external incoming edges and finds the
        minimum external incoming edge with this modified weight.
        This found edge will break the cycle: the internalMDST is updated
        from a cycle to an MDST, and the reverseGraph is updated to
        include this VertexGraph instead of the cycle's vertices.

        Raises RuntimeError if no external incoming edge qualifies.
        """
        minExt = None
        minModWeight = float('infinity')

        # Find incoming edge from outside of the cycle with minimal
        # modified cost. This edge will break the cycle.
        for inc_edge in self.incoming_edges:
            # An incoming edge can lead into a contracted part of the
            # graph. Assume bc is a contracted part (VertexGraph) and
            # a, bc is a newly formed cycle:
            #
            #    /<--\
            #   a     bc        bc's MDST:  b <-- c
            #    \-->/
            #
            # bc has at least lkp incoming edges to b and c, but we
            # should not consider the lkp of c to break the cycle.
            # Plausible edges are selected by looking at their original
            # targets: the external edge must enter at the same original
            # vertex as the edge currently chosen for its target.
            if reverseGraph[inc_edge.tgt].orig_tgt == inc_edge.orig_tgt:
                # modified cost: cost of inc_edge -
                # (cost of previously chosen minimum edge to the cycle
                #  vertex - minIntWeight)
                inc_edge.cost -= (reverseGraph[inc_edge.tgt].cost - minIntWeight)
                if minExt is None or minModWeight > inc_edge.cost:
                    # save better edge from outside of the cycle
                    minExt = inc_edge
                    minModWeight = inc_edge.cost

        if minExt is None:
            # fail before touching the reverseGraph
            raise RuntimeError('no external incoming edge can break the cycle')

        # Example: a, b is a cycle and the arrow from c to b represents
        # the minExt edge:
        #
        #    /<--\
        #   a     b <-- c
        #    \-->/
        #
        # We remove the bottom arrow (from a to b) and save the minExt
        # edge in the reverseGraph instead; this breaks the cycle. As the
        # internalMDST currently still holds the cycle, the entry of the
        # vertex minExt enters at has to be removed from it.
        # All vertex bindings of the cycle are removed from the
        # reverseGraph (the cycle is contracted into this VertexGraph).
        for int_vertex in self.internalMDST:
            del reverseGraph[int_vertex]       # remove cycle from reverseGraph
        del self.internalMDST[minExt.tgt]      # remove/break cycle

        for inc_edge in self.incoming_edges:
            # update inc_edge's target to this VertexGraph
            inc_edge.tgt = self

        # save minExt edge to this VertexGraph in the reverseGraph
        reverseGraph[self] = minExt
# Repeat "find cycles, contract them" until the chosen edges are cycle free.
while True:
    # 2: find all cycles:
    cycles = []
    # root does not have incoming edges, it can not be part of a cycle;
    # getCycle depends on root being in visited
    visited = set([self.root])
    # iterate over a snapshot of the keys: the contraction below
    # replaces entries of pi_v
    for vertex in list(pi_v.keys()):
        if vertex not in visited:
            cycle = getCycle(vertex, pi_v, visited)
            if cycle is not None:
                cycles.append(cycle)

    # 2: if the set of edges {pi(v), v} does not contain any cycles,
    # then we found our minimum directed spanning tree
    # otherwise, we'll have to resolve the cycles
    if not cycles:
        break

    # 3: For each formed cycle:
    # 3a: find internal incoming edge with the smallest cost
    # 3b: modify the cost of each arc which enters the cycle
    # 3c: replace smallest internal edge with the modified edge which
    #     has the smallest cost
    for cycle in cycles:
        # Constructing the VertexGraph breaks the cycle by:
        # - contracting cycle into VertexGraph
        # - finding the internal incoming edge with the smallest cost
        # - modifying the cost of each arc which enters the cycle
        # - replacing the smallest internal edge with the modified edge
        #   which has the smallest cost
        # - changing pi_v accordingly (removes elements from cycle,
        #   adds the vertexGraph)
        # (This will find a solution as the graph keeps shrinking with
        # every cycle, in the worst case until only the root and one
        # vertexGraph remain.)
        # Only the side effects are needed, the instance is reachable
        # through pi_v afterwards.
        VertexGraph(cycle, pi_v)
class SortedContainer(object):
    """
    A container that keeps elements sorted based on a given sortValue.
    Elements with the same value will be returned in the order they got
    inserted.
    """

    def __init__(self):
        # member variables:
        self.keys = []    # distinct sortValues (sorted when pop gets called)
        self.sorted = {}  # {sortValue: [elems with that sortValue]}

    def add(self, sortValue, element):
        """
        Adds element with sortValue to the SortedContainer.
        """
        elems = self.sorted.get(sortValue)
        if elems is None:
            self.sorted[sortValue] = [element]
            self.keys.append(sortValue)
        else:
            elems.append(element)

    def pop(self):
        """
        Sorts the SortedContainer, removes and returns the element with
        the smallest sortValue (the oldest one among equal sortValues).
        Raises IndexError if the container is empty.
        """
        self.keys.sort()
        smallest = self.keys[0]
        elems = self.sorted[smallest]
        # take from the front: equal sortValues leave in insertion order
        elem = elems.pop(0)
        if not elems:
            del self.sorted[smallest]
            del self.keys[0]
        return elem

    def empty(self):
        """
        Returns whether or not the sorted container is empty.
        """
        return len(self.keys) == 0
def createPRIM_OP(edge, inc_cost=True):
    """
    Helper function to keep argument list short,
    returns the contracted data of the primitive operation of a plan edge.

    The result is a tuple starting with the operation (edge.label):
      - in / out: (op, vertex type, actual edge type)
      - lkp:      (op, vertex/edge type, is vertex or edge)
      - src / tgt: (op, actual edge type)
    When inc_cost is true, edge.cost is appended as last entry.
    """
    label = edge.label
    if label == PRIM_OP.inc or label == PRIM_OP.out:
        operation = (label, edge.orig_src.type, edge.orig_tgt.type)
    elif label == PRIM_OP.lkp:
        operation = (label, edge.orig_tgt.type, edge.orig_tgt.is_vertex)
    else:
        # src, tgt operation
        operation = (label, edge.orig_src.type)

    if inc_cost:
        operation = operation + (edge.cost,)
    return operation
def flattenReverseGraph(vertex, inc_edge, reverseGraph):
    """
    Flattens the (possibly nested) vertexGraph nodes into reverseGraph,
    so that a forwardGraph can be created from plain vertices only.

    vertex:       a vertex or VertexGraph and
    inc_edge:     the incoming edge chosen for it.
    reverseGraph: {vertex: incoming edge}, filled in place.
    """
    if isinstance(vertex, VertexGraph):
        # the edge enters the contracted cycle at its original target
        reverseGraph[inc_edge.orig_tgt] = inc_edge
        for inner_vertex, inner_edge in inc_edge.tgt.internalMDST.items():
            flattenReverseGraph(inner_vertex, inner_edge, reverseGraph)
    else:
        reverseGraph[vertex] = inc_edge

    # the edge may leave a contracted cycle: unfold that one as well
    origin = inc_edge.src
    if isinstance(origin, VertexGraph):
        for inner_vertex, inner_edge in origin.internalMDST.items():
            flattenReverseGraph(inner_vertex, inner_edge, reverseGraph)
def createForwardGraph(vertex, inc_edge, forwardGraph):
    """
    Adds inc_edge to the forwardGraph ({vertex: [outgoing edges]}),
    keeping in mind that there can be a vertexGraph in the reverseGraph:
    for a VertexGraph its internalMDST is added recursively.
    """
    # in both cases the edge is stored under its original source
    outgoing = forwardGraph.setdefault(inc_edge.orig_src, [])
    outgoing.append(inc_edge)

    if isinstance(vertex, VertexGraph):
        for inner_vertex, inner_edge in vertex.internalMDST.items():
            createForwardGraph(inner_vertex, inner_edge, forwardGraph)
MDST = []
# pi_v contains {vertex, incoming_edge}
# we want to start from root and follow the outgoing edges
# so we have to build the forwardGraph graph for pi_v
# (Except for the root (has 0), each vertex has exactly one incoming edge,
# but might have multiple outgoing edges)
forwardGraph = {} # {vertex, [outgoing edge 1, ... ] }
reverseGraph = {}
# flatten reverseGraph (for the vertexGraph elements)
for v, e in pi_v.items():
flattenReverseGraph(v, e, reverseGraph)
# create the forwardGraph
for vertex, edge in reverseGraph.items():
createForwardGraph(vertex, edge, forwardGraph)
# create the MDST in a best first manner (lowest value first)
current = SortedContainer() # allows easy walking true tree
for edge in forwardGraph[self.root]:
current.add(edge.orig_cost, edge) # use orig cost, not modified
while current.empty() != True:
p_op = current.pop() # p_op contains an outgoing edge
MDST.append(createPRIM_OP(p_op))
for edge in forwardGraph.get(p_op.orig_tgt, []):
current.add(edge.orig_cost, edge)
return MDST